1<?php
2
3/**
4 * Experimental HTML5-based parser using Jeroen van der Meer's PH5P library.
5 * Occupies space in the HTML5 pseudo-namespace, which may cause conflicts.
6 *
7 * @note
8 *    Recent changes to PHP's DOM extension have resulted in some fatal
9 *    error conditions with the original version of PH5P. Pending changes,
10 *    this lexer will punt to DirectLex if DOM throws an exception.
11 */
12
class HTMLPurifier_Lexer_PH5P extends HTMLPurifier_Lexer_DOMLex
{
    /**
     * Tokenizes HTML by running it through the bundled HTML5 parser and
     * converting the contents of the resulting <body> element into tokens.
     * If the parser raises a DOMException, the exception is registered in
     * the context under 'PH5PError' and the original, untouched input is
     * handed to HTMLPurifier_Lexer_DirectLex instead.
     *
     * @param string $html
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return HTMLPurifier_Token[]
     */
    public function tokenizeHTML($html, $config, $context)
    {
        $prepared = $this->wrapHTML(
            $this->normalize($html, $config, $context),
            $config,
            $context,
            false /* no div */
        );

        try {
            $html5 = new HTML5($prepared);
            $document = $html5->save();
        } catch (DOMException $failure) {
            // Parsing failed inside DOM: fall back to DirectLex, keeping the
            // exception in the context so the failure can be detected.
            $fallback = new HTMLPurifier_Lexer_DirectLex();
            $context->register('PH5PError', $failure);
            return $fallback->tokenizeHTML($html, $config, $context); // original HTML
        }

        // Only <html> -> <body> is converted into tokens.
        $root = $document->getElementsByTagName('html')->item(0);
        $body = $root->getElementsByTagName('body')->item(0);

        $tokens = array();
        $this->tokenizeDOM($body, $tokens, $config);
        return $tokens;
    }
}
44
45/*
46
47Copyright 2007 Jeroen van der Meer <http://jero.net/>
48
49Permission is hereby granted, free of charge, to any person obtaining a
50copy of this software and associated documentation files (the
51"Software"), to deal in the Software without restriction, including
52without limitation the rights to use, copy, modify, merge, publish,
53distribute, sublicense, and/or sell copies of the Software, and to
54permit persons to whom the Software is furnished to do so, subject to
55the following conditions:
56
57The above copyright notice and this permission notice shall be included
58in all copies or substantial portions of the Software.
59
60THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
61OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
62MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
63IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
64CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
65TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
66SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
67
68*/
69
70class HTML5
71{
72    private $data;
73    private $char;
74    private $EOF;
75    private $state;
76    private $tree;
77    private $token;
78    private $content_model;
79    private $escape = false;
80    private $entities = array(
81        'AElig;',
82        'AElig',
83        'AMP;',
84        'AMP',
85        'Aacute;',
86        'Aacute',
87        'Acirc;',
88        'Acirc',
89        'Agrave;',
90        'Agrave',
91        'Alpha;',
92        'Aring;',
93        'Aring',
94        'Atilde;',
95        'Atilde',
96        'Auml;',
97        'Auml',
98        'Beta;',
99        'COPY;',
100        'COPY',
101        'Ccedil;',
102        'Ccedil',
103        'Chi;',
104        'Dagger;',
105        'Delta;',
106        'ETH;',
107        'ETH',
108        'Eacute;',
109        'Eacute',
110        'Ecirc;',
111        'Ecirc',
112        'Egrave;',
113        'Egrave',
114        'Epsilon;',
115        'Eta;',
116        'Euml;',
117        'Euml',
118        'GT;',
119        'GT',
120        'Gamma;',
121        'Iacute;',
122        'Iacute',
123        'Icirc;',
124        'Icirc',
125        'Igrave;',
126        'Igrave',
127        'Iota;',
128        'Iuml;',
129        'Iuml',
130        'Kappa;',
131        'LT;',
132        'LT',
133        'Lambda;',
134        'Mu;',
135        'Ntilde;',
136        'Ntilde',
137        'Nu;',
138        'OElig;',
139        'Oacute;',
140        'Oacute',
141        'Ocirc;',
142        'Ocirc',
143        'Ograve;',
144        'Ograve',
145        'Omega;',
146        'Omicron;',
147        'Oslash;',
148        'Oslash',
149        'Otilde;',
150        'Otilde',
151        'Ouml;',
152        'Ouml',
153        'Phi;',
154        'Pi;',
155        'Prime;',
156        'Psi;',
157        'QUOT;',
158        'QUOT',
159        'REG;',
160        'REG',
161        'Rho;',
162        'Scaron;',
163        'Sigma;',
164        'THORN;',
165        'THORN',
166        'TRADE;',
167        'Tau;',
168        'Theta;',
169        'Uacute;',
170        'Uacute',
171        'Ucirc;',
172        'Ucirc',
173        'Ugrave;',
174        'Ugrave',
175        'Upsilon;',
176        'Uuml;',
177        'Uuml',
178        'Xi;',
179        'Yacute;',
180        'Yacute',
181        'Yuml;',
182        'Zeta;',
183        'aacute;',
184        'aacute',
185        'acirc;',
186        'acirc',
187        'acute;',
188        'acute',
189        'aelig;',
190        'aelig',
191        'agrave;',
192        'agrave',
193        'alefsym;',
194        'alpha;',
195        'amp;',
196        'amp',
197        'and;',
198        'ang;',
199        'apos;',
200        'aring;',
201        'aring',
202        'asymp;',
203        'atilde;',
204        'atilde',
205        'auml;',
206        'auml',
207        'bdquo;',
208        'beta;',
209        'brvbar;',
210        'brvbar',
211        'bull;',
212        'cap;',
213        'ccedil;',
214        'ccedil',
215        'cedil;',
216        'cedil',
217        'cent;',
218        'cent',
219        'chi;',
220        'circ;',
221        'clubs;',
222        'cong;',
223        'copy;',
224        'copy',
225        'crarr;',
226        'cup;',
227        'curren;',
228        'curren',
229        'dArr;',
230        'dagger;',
231        'darr;',
232        'deg;',
233        'deg',
234        'delta;',
235        'diams;',
236        'divide;',
237        'divide',
238        'eacute;',
239        'eacute',
240        'ecirc;',
241        'ecirc',
242        'egrave;',
243        'egrave',
244        'empty;',
245        'emsp;',
246        'ensp;',
247        'epsilon;',
248        'equiv;',
249        'eta;',
250        'eth;',
251        'eth',
252        'euml;',
253        'euml',
254        'euro;',
255        'exist;',
256        'fnof;',
257        'forall;',
258        'frac12;',
259        'frac12',
260        'frac14;',
261        'frac14',
262        'frac34;',
263        'frac34',
264        'frasl;',
265        'gamma;',
266        'ge;',
267        'gt;',
268        'gt',
269        'hArr;',
270        'harr;',
271        'hearts;',
272        'hellip;',
273        'iacute;',
274        'iacute',
275        'icirc;',
276        'icirc',
277        'iexcl;',
278        'iexcl',
279        'igrave;',
280        'igrave',
281        'image;',
282        'infin;',
283        'int;',
284        'iota;',
285        'iquest;',
286        'iquest',
287        'isin;',
288        'iuml;',
289        'iuml',
290        'kappa;',
291        'lArr;',
292        'lambda;',
293        'lang;',
294        'laquo;',
295        'laquo',
296        'larr;',
297        'lceil;',
298        'ldquo;',
299        'le;',
300        'lfloor;',
301        'lowast;',
302        'loz;',
303        'lrm;',
304        'lsaquo;',
305        'lsquo;',
306        'lt;',
307        'lt',
308        'macr;',
309        'macr',
310        'mdash;',
311        'micro;',
312        'micro',
313        'middot;',
314        'middot',
315        'minus;',
316        'mu;',
317        'nabla;',
318        'nbsp;',
319        'nbsp',
320        'ndash;',
321        'ne;',
322        'ni;',
323        'not;',
324        'not',
325        'notin;',
326        'nsub;',
327        'ntilde;',
328        'ntilde',
329        'nu;',
330        'oacute;',
331        'oacute',
332        'ocirc;',
333        'ocirc',
334        'oelig;',
335        'ograve;',
336        'ograve',
337        'oline;',
338        'omega;',
339        'omicron;',
340        'oplus;',
341        'or;',
342        'ordf;',
343        'ordf',
344        'ordm;',
345        'ordm',
346        'oslash;',
347        'oslash',
348        'otilde;',
349        'otilde',
350        'otimes;',
351        'ouml;',
352        'ouml',
353        'para;',
354        'para',
355        'part;',
356        'permil;',
357        'perp;',
358        'phi;',
359        'pi;',
360        'piv;',
361        'plusmn;',
362        'plusmn',
363        'pound;',
364        'pound',
365        'prime;',
366        'prod;',
367        'prop;',
368        'psi;',
369        'quot;',
370        'quot',
371        'rArr;',
372        'radic;',
373        'rang;',
374        'raquo;',
375        'raquo',
376        'rarr;',
377        'rceil;',
378        'rdquo;',
379        'real;',
380        'reg;',
381        'reg',
382        'rfloor;',
383        'rho;',
384        'rlm;',
385        'rsaquo;',
386        'rsquo;',
387        'sbquo;',
388        'scaron;',
389        'sdot;',
390        'sect;',
391        'sect',
392        'shy;',
393        'shy',
394        'sigma;',
395        'sigmaf;',
396        'sim;',
397        'spades;',
398        'sub;',
399        'sube;',
400        'sum;',
401        'sup1;',
402        'sup1',
403        'sup2;',
404        'sup2',
405        'sup3;',
406        'sup3',
407        'sup;',
408        'supe;',
409        'szlig;',
410        'szlig',
411        'tau;',
412        'there4;',
413        'theta;',
414        'thetasym;',
415        'thinsp;',
416        'thorn;',
417        'thorn',
418        'tilde;',
419        'times;',
420        'times',
421        'trade;',
422        'uArr;',
423        'uacute;',
424        'uacute',
425        'uarr;',
426        'ucirc;',
427        'ucirc',
428        'ugrave;',
429        'ugrave',
430        'uml;',
431        'uml',
432        'upsih;',
433        'upsilon;',
434        'uuml;',
435        'uuml',
436        'weierp;',
437        'xi;',
438        'yacute;',
439        'yacute',
440        'yen;',
441        'yen',
442        'yuml;',
443        'yuml',
444        'zeta;',
445        'zwj;',
446        'zwnj;'
447    );
448
449    const PCDATA = 0;
450    const RCDATA = 1;
451    const CDATA = 2;
452    const PLAINTEXT = 3;
453
454    const DOCTYPE = 0;
455    const STARTTAG = 1;
456    const ENDTAG = 2;
457    const COMMENT = 3;
458    const CHARACTR = 4;
459    const EOF = 5;
460
/**
 * Stores the input, creates the tree constructor and immediately runs the
 * tokenizer state machine until a state handler sets the state to null.
 *
 * @param string $data markup to tokenize (measured with strlen())
 */
public function __construct($data)
{
    $this->tree = new HTML5TreeConstructer();
    $this->content_model = self::PCDATA;

    $this->data = $data;
    $this->EOF = strlen($data);
    // Start one position before the input; the data state increments the
    // offset before it reads a character.
    $this->char = -1;

    // Every state is handled by the method named "<state>State".
    $this->state = 'data';
    while ($this->state !== null) {
        $handler = $this->state . 'State';
        $this->$handler();
    }
}
475
/**
 * Returns the result of the tree constructor's save() for the input that
 * was tokenized in the constructor.
 *
 * @return mixed whatever $this->tree->save() returns
 */
public function save()
{
    $result = $this->tree->save();

    return $result;
}
480
/**
 * Returns the character at the current offset.
 *
 * @return string|false the character, or false once the offset has
 *                      reached the end of the input
 */
private function char()
{
    if ($this->char < $this->EOF) {
        return $this->data[$this->char];
    }

    return false;
}
487
/**
 * Reads from the input at an arbitrary offset without moving the cursor.
 *
 * @param int $s offset to read from
 * @param int $l number of characters to read; 0 reads a single character
 * @return string|null the requested text, or null when $s + $l is not
 *                     strictly below the input length
 */
private function character($s, $l = 0)
{
    if ($s + $l >= $this->EOF) {
        return null;
    }

    return ($l === 0)
        ? $this->data[$s]
        : substr($this->data, $s, $l);
}
498
/**
 * Returns the longest run of characters, starting at $start, that all
 * belong to the given character class.
 *
 * The previous preg_replace() based version returned the entire remainder
 * of the input whenever the very first character was outside the class
 * (the pattern did not match, so nothing was replaced). An empty string is
 * returned in that case now.
 *
 * @param string $char_class body of a PCRE character class, e.g. 'A-Za-z'
 * @param int $start offset in the input at which the run must begin
 * @return string the matching run, or '' when there is none
 */
private function characters($char_class, $start)
{
    $remainder = substr($this->data, $start);

    if (preg_match('#^[' . $char_class . ']+#', $remainder, $run) === 1) {
        return $run[0];
    }

    return '';
}
503
/**
 * Data state: consumes the next input character and either switches to
 * another state or emits character tokens.
 *
 * Fixes relative to the previous version:
 *  - the "<!--" / "-->" checks now look at the characters that END at the
 *    current one (previously the offsets pointed at the wrong window, so
 *    the escape flag was not toggled where the quoted rules require it);
 *  - in RCDATA/CDATA the bulk text run now stops at "-" and ">" so those
 *    checks are actually reached;
 *  - the bulk text run always consumes the current character. Before, a
 *    "&" in CDATA (or a "<" while the escape flag was set) produced a run
 *    of length 0 and the tokenizer never advanced.
 */
private function dataState()
{
    // Consume the next input character
    $this->char++;
    $char = $this->char();

    // RCDATA and CDATA share the escape-flag handling.
    $raw_text = $this->content_model === self::RCDATA
        || $this->content_model === self::CDATA;

    if ($char === '&' && ($this->content_model === self::PCDATA || $this->content_model === self::RCDATA)) {
        /* U+0026 AMPERSAND (&)
        When the content model flag is set to one of the PCDATA or RCDATA
        states: switch to the entity data state. Otherwise: treat it as per
        the "anything else" entry below. */
        $this->state = 'entityData';

    } elseif ($char === '-') {
        /* If the content model flag is set to either the RCDATA state or
        the CDATA state, and the escape flag is false, and there are at
        least three characters before this one in the input stream, and the
        last four characters in the input stream, including this one, are
        "<!--", then set the escape flag to true. */
        if ($raw_text && $this->escape === false &&
            $this->char >= 3 && substr($this->data, $this->char - 3, 4) === '<!--'
        ) {
            $this->escape = true;
        }

        /* In any case, emit the input character as a character token. Stay
        in the data state. */
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => $char
            )
        );

    } elseif ($char === '<' && ($this->content_model === self::PCDATA ||
            ($raw_text && $this->escape === false))
    ) {
        /* U+003C LESS-THAN SIGN (<)
        PCDATA, or RCDATA/CDATA with the escape flag false: switch to the
        tag open state. Otherwise: treat it as per the "anything else"
        entry below. */
        $this->state = 'tagOpen';

    } elseif ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        If the content model flag is set to either the RCDATA state or the
        CDATA state, and the escape flag is true, and the last three
        characters in the input stream including this one are "-->", set
        the escape flag to false. */
        if ($raw_text && $this->escape === true &&
            $this->char >= 2 && substr($this->data, $this->char - 2, 3) === '-->'
        ) {
            $this->escape = false;
        }

        /* In any case, emit the input character as a character token.
        Stay in the data state. */
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => $char
            )
        );

    } elseif ($this->char === $this->EOF) {
        /* EOF
        Emit an end-of-file token. */
        $this->EOF();

    } elseif ($this->content_model === self::PLAINTEXT) {
        /* When the content model flag is set to the PLAINTEXT state
        THIS DIFFERS GREATLY FROM THE SPEC: Get the remaining characters of
        the text and emit it as a character token. */
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => substr($this->data, $this->char)
            )
        );

        $this->EOF();

    } else {
        /* Anything else
        THIS DIFFERS GREATLY FROM THE SPEC: Get as many characters that
        otherwise would also be treated as a character token and emit them
        as a single character token. Stay in the data state. */

        // In RCDATA/CDATA "-" and ">" must be seen one at a time so the
        // escape flag checks above can run.
        $stop_at = $raw_text ? '<&->' : '<&';

        // The current character has already been classified as plain text,
        // so it is always part of the run; only the characters after it
        // are scanned for a stop character. This guarantees progress.
        $len = 1 + strcspn($this->data, $stop_at, $this->char + 1);
        $char = substr($this->data, $this->char, $len);
        $this->char += $len - 1;

        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => $char
            )
        );

        $this->state = 'data';
    }
}
615
/**
 * Entity data state: tries to consume an entity after "&" and emits either
 * the character it produced or a literal ampersand, then returns to the
 * data state.
 *
 * The previous truthiness test (!$entity) also treated a returned "0" as
 * "nothing returned" and emitted "&" in its place; only genuinely empty
 * results are treated that way now.
 */
private function entityDataState()
{
    // Attempt to consume an entity.
    // NOTE(review): entity() is defined outside this block; it is assumed
    // to signal "no entity" with false, null or an empty string - confirm.
    $entity = $this->entity();

    // If nothing is returned, emit a U+0026 AMPERSAND character token.
    // Otherwise, emit the character token that was returned.
    $nothing_returned = ($entity === false || $entity === null || $entity === '');

    $this->emitToken(
        array(
            'type' => self::CHARACTR,
            'data' => $nothing_returned ? '&' : $entity
        )
    );

    // Finally, switch to the data state.
    $this->state = 'data';
}
634
/**
 * Tag open state: decides what a "<" introduces, depending on the content
 * model flag.
 *
 * In RCDATA/CDATA only "</" is significant; any other "<" is emitted as
 * text. In PCDATA the character after "<" selects a markup declaration, a
 * close tag, a start tag, a bogus comment, or plain text.
 */
private function tagOpenState()
{
    switch ($this->content_model) {
        case self::RCDATA:
        case self::CDATA:
            $lookahead = $this->character($this->char + 1);

            if ($lookahead !== '/') {
                // Not followed by a solidus: emit the "<" as a character
                // token and let the data state handle the next character.
                $this->emitToken(
                    array(
                        'type' => self::CHARACTR,
                        'data' => '<'
                    )
                );

                $this->state = 'data';
                break;
            }

            // Followed by a solidus: consume it and move on to the close
            // tag open state.
            $this->char++;
            $this->state = 'closeTagOpen';
            break;

        case self::PCDATA:
            // Consume the next input character.
            $this->char++;
            $current = $this->char();

            if ($current === '!') {
                // "<!" : markup declaration open state.
                $this->state = 'markupDeclarationOpen';
                break;
            }

            if ($current === '/') {
                // "</" : close tag open state.
                $this->state = 'closeTagOpen';
                break;
            }

            if (preg_match('/^[A-Za-z]$/', $current)) {
                // A letter starts a new start tag token named after its
                // lowercase form. The token is not emitted yet; later
                // states fill in the rest.
                $this->token = array(
                    'name' => strtolower($current),
                    'type' => self::STARTTAG,
                    'attr' => array()
                );

                $this->state = 'tagName';
                break;
            }

            if ($current === '>') {
                // "<>" : parse error, both characters are emitted as text.
                $this->emitToken(
                    array(
                        'type' => self::CHARACTR,
                        'data' => '<>'
                    )
                );

                $this->state = 'data';
                break;
            }

            if ($current === '?') {
                // "<?" : parse error, handled as a bogus comment.
                $this->state = 'bogusComment';
                break;
            }

            // Anything else: parse error. Emit the "<" as text and step
            // back so the data state reconsumes the current character.
            $this->emitToken(
                array(
                    'type' => self::CHARACTR,
                    'data' => '<'
                )
            );

            $this->char--;
            $this->state = 'data';
            break;
    }
}
726
/**
 * Close tag open state: entered after "</".
 *
 * In RCDATA/CDATA the letters that follow are compared with the node name
 * of the last element on the tree constructor's stack; unless they match
 * and are followed by whitespace, ">" or "/", the "</" is emitted as text.
 * Otherwise the next character decides between an end tag token, an empty
 * "</>", end of input, or a bogus comment.
 */
private function closeTagOpenState()
{
    // Lowercased run of letters that follows the "</".
    $candidate = strtolower($this->characters('A-Za-z', $this->char + 1));
    $matches_open = count($this->tree->stack) > 0
        && $candidate === end($this->tree->stack)->nodeName;

    $raw_text = $this->content_model === self::RCDATA
        || $this->content_model === self::CDATA;

    $treat_as_text = false;
    if ($raw_text) {
        if (!$matches_open) {
            $treat_as_text = true;
        } else {
            // The name matches: it must also be followed by one of
            // TAB, LF, VT, FF, SPACE, ">" or "/".
            $after_name = $this->character($this->char + 1 + strlen($candidate));
            $treat_as_text = !preg_match('/[\t\n\x0b\x0c >\/]/', $after_name)
                || $this->EOF === $this->char;
        }
    }

    if ($treat_as_text) {
        // Parse error: emit "</" as characters and continue in the data
        // state with the next input character.
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => '</'
            )
        );

        $this->state = 'data';
        return;
    }

    // PCDATA, or a matching close tag: consume the next input character.
    $this->char++;
    $current = $this->char();

    if (preg_match('/^[A-Za-z]$/', $current)) {
        // A letter starts a new end tag token named after its lowercase
        // form; the token is emitted by a later state.
        $this->token = array(
            'name' => strtolower($current),
            'type' => self::ENDTAG
        );

        $this->state = 'tagName';
        return;
    }

    if ($current === '>') {
        // "</>" : parse error, nothing is emitted.
        $this->state = 'data';
        return;
    }

    if ($this->char === $this->EOF) {
        // End of input: parse error. Emit "</" as text and step back so
        // the data state reconsumes the EOF.
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => '</'
            )
        );

        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else: parse error, handled as a bogus comment.
    $this->state = 'bogusComment';
}
807
/**
 * Tag name state: extends the current tag token's name until whitespace,
 * "/", ">" or the end of the input is reached.
 *
 * The end-of-input test runs before the character is read: at EOF
 * character() yields null, and passing null to preg_match() is deprecated
 * as of PHP 8.1. The outcome of every branch is unchanged.
 */
private function tagNameState()
{
    // Consume the next input character:
    $this->char++;

    if ($this->char === $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF
        character in the data state. */
        $this->emitToken($this->token);

        $this->char--;
        $this->state = 'data';
        return;
    }

    $char = $this->character($this->char);

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* U+0009 CHARACTER TABULATION
        U+000A LINE FEED (LF)
        U+000B LINE TABULATION
        U+000C FORM FEED (FF)
        U+0020 SPACE
        Switch to the before attribute name state. */
        $this->state = 'beforeAttributeName';

    } elseif ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif ($char === '/') {
        /* U+002F SOLIDUS (/)
        Parse error unless this is a permitted slash. Switch to the before
        attribute name state. */
        $this->state = 'beforeAttributeName';

    } else {
        /* Anything else
        Append the lowercased input character to the current tag token's
        tag name. Stay in the tag name state. */
        $this->token['name'] .= strtolower($char);
        $this->state = 'tagName';
    }
}
852
/**
 * Before attribute name state: skips whitespace and slashes inside a tag,
 * then either finishes the tag or starts a new attribute.
 *
 * The end-of-input test runs before the character is read: at EOF
 * character() yields null, and passing null to preg_match() is deprecated
 * as of PHP 8.1. The outcome of every branch is unchanged.
 */
private function beforeAttributeNameState()
{
    // Consume the next input character:
    $this->char++;

    if ($this->char === $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF
        character in the data state. */
        $this->emitToken($this->token);

        $this->char--;
        $this->state = 'data';
        return;
    }

    $char = $this->character($this->char);

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* U+0009 CHARACTER TABULATION
        U+000A LINE FEED (LF)
        U+000B LINE TABULATION
        U+000C FORM FEED (FF)
        U+0020 SPACE
        Stay in the before attribute name state. */
        $this->state = 'beforeAttributeName';

    } elseif ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif ($char === '/') {
        /* U+002F SOLIDUS (/)
        Parse error unless this is a permitted slash. Stay in the before
        attribute name state. */
        $this->state = 'beforeAttributeName';

    } else {
        /* Anything else
        Start a new attribute in the current tag token. Its name is the
        lowercased input character; its value starts out as null.
        Switch to the attribute name state. */
        $this->token['attr'][] = array(
            'name' => strtolower($char),
            'value' => null
        );

        $this->state = 'attributeName';
    }
}
902
/**
 * Attribute name state: extends the name of the attribute most recently
 * added to the current tag token until whitespace, "=", ">", "/" or the
 * end of the input is reached.
 *
 * Fixes relative to the previous version:
 *  - a "/" always ends the attribute name. Before, a "/" directly in front
 *    of ">" (as in <a href/>) fell through to the default branch and was
 *    appended to the attribute name ("href/");
 *  - the end-of-input test runs before the character is read, so null is
 *    no longer passed to preg_match() at EOF (deprecated as of PHP 8.1).
 */
private function attributeNameState()
{
    // Consume the next input character:
    $this->char++;

    if ($this->char === $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF
        character in the data state. */
        $this->emitToken($this->token);

        $this->char--;
        $this->state = 'data';
        return;
    }

    $char = $this->character($this->char);

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* U+0009 CHARACTER TABULATION
        U+000A LINE FEED (LF)
        U+000B LINE TABULATION
        U+000C FORM FEED (FF)
        U+0020 SPACE
        Switch to the after attribute name state. */
        $this->state = 'afterAttributeName';

    } elseif ($char === '=') {
        /* U+003D EQUALS SIGN (=)
        Switch to the before attribute value state. */
        $this->state = 'beforeAttributeValue';

    } elseif ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif ($char === '/') {
        /* U+002F SOLIDUS (/)
        Parse error unless this is a permitted slash. Switch to the before
        attribute name state, which emits the tag on a following ">". */
        $this->state = 'beforeAttributeName';

    } else {
        /* Anything else
        Append the lowercased input character to the current attribute's
        name. Stay in the attribute name state. */
        $last = count($this->token['attr']) - 1;
        $this->token['attr'][$last]['name'] .= strtolower($char);

        $this->state = 'attributeName';
    }
}
954
/**
 * After attribute name state: entered when whitespace follows an attribute
 * name. Decides whether a value follows ("="), the tag ends, or another
 * attribute begins.
 *
 * Fixes relative to the previous version:
 *  - a "/" is never treated as the start of an attribute. Before, a "/"
 *    directly in front of ">" (as in <input disabled />) fell through to
 *    the default branch and created an attribute named "/";
 *  - the end-of-input test runs before the character is read, so null is
 *    no longer passed to preg_match() at EOF (deprecated as of PHP 8.1).
 */
private function afterAttributeNameState()
{
    // Consume the next input character:
    $this->char++;

    if ($this->char === $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF
        character in the data state. */
        $this->emitToken($this->token);

        $this->char--;
        $this->state = 'data';
        return;
    }

    $char = $this->character($this->char);

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* U+0009 CHARACTER TABULATION
        U+000A LINE FEED (LF)
        U+000B LINE TABULATION
        U+000C FORM FEED (FF)
        U+0020 SPACE
        Stay in the after attribute name state. */
        $this->state = 'afterAttributeName';

    } elseif ($char === '=') {
        /* U+003D EQUALS SIGN (=)
        Switch to the before attribute value state. */
        $this->state = 'beforeAttributeValue';

    } elseif ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif ($char === '/') {
        /* U+002F SOLIDUS (/)
        Parse error unless this is a permitted slash. Switch to the before
        attribute name state, which emits the tag on a following ">". */
        $this->state = 'beforeAttributeName';

    } else {
        /* Anything else
        Start a new attribute in the current tag token. Its name is the
        lowercased input character; its value starts out as null.
        Switch to the attribute name state. */
        $this->token['attr'][] = array(
            'name' => strtolower($char),
            'value' => null
        );

        $this->state = 'attributeName';
    }
}
1009
1010    private function beforeAttributeValueState()
1011    {
1012        // Consume the next input character:
1013        $this->char++;
1014        $char = $this->character($this->char);
1015
1016        if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1017            /* U+0009 CHARACTER TABULATION
1018            U+000A LINE FEED (LF)
1019            U+000B LINE TABULATION
1020            U+000C FORM FEED (FF)
1021            U+0020 SPACE
1022            Stay in the before attribute value state. */
1023            $this->state = 'beforeAttributeValue';
1024
1025        } elseif ($char === '"') {
1026            /* U+0022 QUOTATION MARK (")
1027            Switch to the attribute value (double-quoted) state. */
1028            $this->state = 'attributeValueDoubleQuoted';
1029
1030        } elseif ($char === '&') {
1031            /* U+0026 AMPERSAND (&)
1032            Switch to the attribute value (unquoted) state and reconsume
1033            this input character. */
1034            $this->char--;
1035            $this->state = 'attributeValueUnquoted';
1036
1037        } elseif ($char === '\'') {
1038            /* U+0027 APOSTROPHE (')
1039            Switch to the attribute value (single-quoted) state. */
1040            $this->state = 'attributeValueSingleQuoted';
1041
1042        } elseif ($char === '>') {
1043            /* U+003E GREATER-THAN SIGN (>)
1044            Emit the current tag token. Switch to the data state. */
1045            $this->emitToken($this->token);
1046            $this->state = 'data';
1047
1048        } else {
1049            /* Anything else
1050            Append the current input character to the current attribute's value.
1051            Switch to the attribute value (unquoted) state. */
1052            $last = count($this->token['attr']) - 1;
1053            $this->token['attr'][$last]['value'] .= $char;
1054
1055            $this->state = 'attributeValueUnquoted';
1056        }
1057    }
1058
1059    private function attributeValueDoubleQuotedState()
1060    {
1061        // Consume the next input character:
1062        $this->char++;
1063        $char = $this->character($this->char);
1064
1065        if ($char === '"') {
1066            /* U+0022 QUOTATION MARK (")
1067            Switch to the before attribute name state. */
1068            $this->state = 'beforeAttributeName';
1069
1070        } elseif ($char === '&') {
1071            /* U+0026 AMPERSAND (&)
1072            Switch to the entity in attribute value state. */
1073            $this->entityInAttributeValueState('double');
1074
1075        } elseif ($this->char === $this->EOF) {
1076            /* EOF
1077            Parse error. Emit the current tag token. Reconsume the character
1078            in the data state. */
1079            $this->emitToken($this->token);
1080
1081            $this->char--;
1082            $this->state = 'data';
1083
1084        } else {
1085            /* Anything else
1086            Append the current input character to the current attribute's value.
1087            Stay in the attribute value (double-quoted) state. */
1088            $last = count($this->token['attr']) - 1;
1089            $this->token['attr'][$last]['value'] .= $char;
1090
1091            $this->state = 'attributeValueDoubleQuoted';
1092        }
1093    }
1094
1095    private function attributeValueSingleQuotedState()
1096    {
1097        // Consume the next input character:
1098        $this->char++;
1099        $char = $this->character($this->char);
1100
1101        if ($char === '\'') {
1102            /* U+0022 QUOTATION MARK (')
1103            Switch to the before attribute name state. */
1104            $this->state = 'beforeAttributeName';
1105
1106        } elseif ($char === '&') {
1107            /* U+0026 AMPERSAND (&)
1108            Switch to the entity in attribute value state. */
1109            $this->entityInAttributeValueState('single');
1110
1111        } elseif ($this->char === $this->EOF) {
1112            /* EOF
1113            Parse error. Emit the current tag token. Reconsume the character
1114            in the data state. */
1115            $this->emitToken($this->token);
1116
1117            $this->char--;
1118            $this->state = 'data';
1119
1120        } else {
1121            /* Anything else
1122            Append the current input character to the current attribute's value.
1123            Stay in the attribute value (single-quoted) state. */
1124            $last = count($this->token['attr']) - 1;
1125            $this->token['attr'][$last]['value'] .= $char;
1126
1127            $this->state = 'attributeValueSingleQuoted';
1128        }
1129    }
1130
1131    private function attributeValueUnquotedState()
1132    {
1133        // Consume the next input character:
1134        $this->char++;
1135        $char = $this->character($this->char);
1136
1137        if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1138            /* U+0009 CHARACTER TABULATION
1139            U+000A LINE FEED (LF)
1140            U+000B LINE TABULATION
1141            U+000C FORM FEED (FF)
1142            U+0020 SPACE
1143            Switch to the before attribute name state. */
1144            $this->state = 'beforeAttributeName';
1145
1146        } elseif ($char === '&') {
1147            /* U+0026 AMPERSAND (&)
1148            Switch to the entity in attribute value state. */
1149            $this->entityInAttributeValueState();
1150
1151        } elseif ($char === '>') {
1152            /* U+003E GREATER-THAN SIGN (>)
1153            Emit the current tag token. Switch to the data state. */
1154            $this->emitToken($this->token);
1155            $this->state = 'data';
1156
1157        } else {
1158            /* Anything else
1159            Append the current input character to the current attribute's value.
1160            Stay in the attribute value (unquoted) state. */
1161            $last = count($this->token['attr']) - 1;
1162            $this->token['attr'][$last]['value'] .= $char;
1163
1164            $this->state = 'attributeValueUnquoted';
1165        }
1166    }
1167
1168    private function entityInAttributeValueState()
1169    {
1170        // Attempt to consume an entity.
1171        $entity = $this->entity();
1172
1173        // If nothing is returned, append a U+0026 AMPERSAND character to the
1174        // current attribute's value. Otherwise, emit the character token that
1175        // was returned.
1176        $char = (!$entity)
1177            ? '&'
1178            : $entity;
1179
1180        $last = count($this->token['attr']) - 1;
1181        $this->token['attr'][$last]['value'] .= $char;
1182    }
1183
1184    private function bogusCommentState()
1185    {
1186        /* Consume every character up to the first U+003E GREATER-THAN SIGN
1187        character (>) or the end of the file (EOF), whichever comes first. Emit
1188        a comment token whose data is the concatenation of all the characters
1189        starting from and including the character that caused the state machine
1190        to switch into the bogus comment state, up to and including the last
1191        consumed character before the U+003E character, if any, or up to the
1192        end of the file otherwise. (If the comment was started by the end of
1193        the file (EOF), the token is empty.) */
1194        $data = $this->characters('^>', $this->char);
1195        $this->emitToken(
1196            array(
1197                'data' => $data,
1198                'type' => self::COMMENT
1199            )
1200        );
1201
1202        $this->char += strlen($data);
1203
1204        /* Switch to the data state. */
1205        $this->state = 'data';
1206
1207        /* If the end of the file was reached, reconsume the EOF character. */
1208        if ($this->char === $this->EOF) {
1209            $this->char = $this->EOF - 1;
1210        }
1211    }
1212
1213    private function markupDeclarationOpenState()
1214    {
1215        /* If the next two characters are both U+002D HYPHEN-MINUS (-)
1216        characters, consume those two characters, create a comment token whose
1217        data is the empty string, and switch to the comment state. */
1218        if ($this->character($this->char + 1, 2) === '--') {
1219            $this->char += 2;
1220            $this->state = 'comment';
1221            $this->token = array(
1222                'data' => null,
1223                'type' => self::COMMENT
1224            );
1225
1226            /* Otherwise if the next seven chacacters are a case-insensitive match
1227            for the word "DOCTYPE", then consume those characters and switch to the
1228            DOCTYPE state. */
1229        } elseif (strtolower($this->character($this->char + 1, 7)) === 'doctype') {
1230            $this->char += 7;
1231            $this->state = 'doctype';
1232
1233            /* Otherwise, is is a parse error. Switch to the bogus comment state.
1234            The next character that is consumed, if any, is the first character
1235            that will be in the comment. */
1236        } else {
1237            $this->char++;
1238            $this->state = 'bogusComment';
1239        }
1240    }
1241
1242    private function commentState()
1243    {
1244        /* Consume the next input character: */
1245        $this->char++;
1246        $char = $this->char();
1247
1248        /* U+002D HYPHEN-MINUS (-) */
1249        if ($char === '-') {
1250            /* Switch to the comment dash state  */
1251            $this->state = 'commentDash';
1252
1253            /* EOF */
1254        } elseif ($this->char === $this->EOF) {
1255            /* Parse error. Emit the comment token. Reconsume the EOF character
1256            in the data state. */
1257            $this->emitToken($this->token);
1258            $this->char--;
1259            $this->state = 'data';
1260
1261            /* Anything else */
1262        } else {
1263            /* Append the input character to the comment token's data. Stay in
1264            the comment state. */
1265            $this->token['data'] .= $char;
1266        }
1267    }
1268
1269    private function commentDashState()
1270    {
1271        /* Consume the next input character: */
1272        $this->char++;
1273        $char = $this->char();
1274
1275        /* U+002D HYPHEN-MINUS (-) */
1276        if ($char === '-') {
1277            /* Switch to the comment end state  */
1278            $this->state = 'commentEnd';
1279
1280            /* EOF */
1281        } elseif ($this->char === $this->EOF) {
1282            /* Parse error. Emit the comment token. Reconsume the EOF character
1283            in the data state. */
1284            $this->emitToken($this->token);
1285            $this->char--;
1286            $this->state = 'data';
1287
1288            /* Anything else */
1289        } else {
1290            /* Append a U+002D HYPHEN-MINUS (-) character and the input
1291            character to the comment token's data. Switch to the comment state. */
1292            $this->token['data'] .= '-' . $char;
1293            $this->state = 'comment';
1294        }
1295    }
1296
1297    private function commentEndState()
1298    {
1299        /* Consume the next input character: */
1300        $this->char++;
1301        $char = $this->char();
1302
1303        if ($char === '>') {
1304            $this->emitToken($this->token);
1305            $this->state = 'data';
1306
1307        } elseif ($char === '-') {
1308            $this->token['data'] .= '-';
1309
1310        } elseif ($this->char === $this->EOF) {
1311            $this->emitToken($this->token);
1312            $this->char--;
1313            $this->state = 'data';
1314
1315        } else {
1316            $this->token['data'] .= '--' . $char;
1317            $this->state = 'comment';
1318        }
1319    }
1320
1321    private function doctypeState()
1322    {
1323        /* Consume the next input character: */
1324        $this->char++;
1325        $char = $this->char();
1326
1327        if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1328            $this->state = 'beforeDoctypeName';
1329
1330        } else {
1331            $this->char--;
1332            $this->state = 'beforeDoctypeName';
1333        }
1334    }
1335
1336    private function beforeDoctypeNameState()
1337    {
1338        /* Consume the next input character: */
1339        $this->char++;
1340        $char = $this->char();
1341
1342        if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1343            // Stay in the before DOCTYPE name state.
1344
1345        } elseif (preg_match('/^[a-z]$/', $char)) {
1346            $this->token = array(
1347                'name' => strtoupper($char),
1348                'type' => self::DOCTYPE,
1349                'error' => true
1350            );
1351
1352            $this->state = 'doctypeName';
1353
1354        } elseif ($char === '>') {
1355            $this->emitToken(
1356                array(
1357                    'name' => null,
1358                    'type' => self::DOCTYPE,
1359                    'error' => true
1360                )
1361            );
1362
1363            $this->state = 'data';
1364
1365        } elseif ($this->char === $this->EOF) {
1366            $this->emitToken(
1367                array(
1368                    'name' => null,
1369                    'type' => self::DOCTYPE,
1370                    'error' => true
1371                )
1372            );
1373
1374            $this->char--;
1375            $this->state = 'data';
1376
1377        } else {
1378            $this->token = array(
1379                'name' => $char,
1380                'type' => self::DOCTYPE,
1381                'error' => true
1382            );
1383
1384            $this->state = 'doctypeName';
1385        }
1386    }
1387
1388    private function doctypeNameState()
1389    {
1390        /* Consume the next input character: */
1391        $this->char++;
1392        $char = $this->char();
1393
1394        if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1395            $this->state = 'AfterDoctypeName';
1396
1397        } elseif ($char === '>') {
1398            $this->emitToken($this->token);
1399            $this->state = 'data';
1400
1401        } elseif (preg_match('/^[a-z]$/', $char)) {
1402            $this->token['name'] .= strtoupper($char);
1403
1404        } elseif ($this->char === $this->EOF) {
1405            $this->emitToken($this->token);
1406            $this->char--;
1407            $this->state = 'data';
1408
1409        } else {
1410            $this->token['name'] .= $char;
1411        }
1412
1413        $this->token['error'] = ($this->token['name'] === 'HTML')
1414            ? false
1415            : true;
1416    }
1417
1418    private function afterDoctypeNameState()
1419    {
1420        /* Consume the next input character: */
1421        $this->char++;
1422        $char = $this->char();
1423
1424        if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1425            // Stay in the DOCTYPE name state.
1426
1427        } elseif ($char === '>') {
1428            $this->emitToken($this->token);
1429            $this->state = 'data';
1430
1431        } elseif ($this->char === $this->EOF) {
1432            $this->emitToken($this->token);
1433            $this->char--;
1434            $this->state = 'data';
1435
1436        } else {
1437            $this->token['error'] = true;
1438            $this->state = 'bogusDoctype';
1439        }
1440    }
1441
1442    private function bogusDoctypeState()
1443    {
1444        /* Consume the next input character: */
1445        $this->char++;
1446        $char = $this->char();
1447
1448        if ($char === '>') {
1449            $this->emitToken($this->token);
1450            $this->state = 'data';
1451
1452        } elseif ($this->char === $this->EOF) {
1453            $this->emitToken($this->token);
1454            $this->char--;
1455            $this->state = 'data';
1456
1457        } else {
1458            // Stay in the bogus DOCTYPE state.
1459        }
1460    }
1461
1462    private function entity()
1463    {
1464        $start = $this->char;
1465
1466        // This section defines how to consume an entity. This definition is
1467        // used when parsing entities in text and in attributes.
1468
1469        // The behaviour depends on the identity of the next character (the
1470        // one immediately after the U+0026 AMPERSAND character):
1471
1472        switch ($this->character($this->char + 1)) {
1473            // U+0023 NUMBER SIGN (#)
1474            case '#':
1475
1476                // The behaviour further depends on the character after the
1477                // U+0023 NUMBER SIGN:
1478                switch ($this->character($this->char + 1)) {
1479                    // U+0078 LATIN SMALL LETTER X
1480                    // U+0058 LATIN CAPITAL LETTER X
1481                    case 'x':
1482                    case 'X':
1483                        // Follow the steps below, but using the range of
1484                        // characters U+0030 DIGIT ZERO through to U+0039 DIGIT
1485                        // NINE, U+0061 LATIN SMALL LETTER A through to U+0066
1486                        // LATIN SMALL LETTER F, and U+0041 LATIN CAPITAL LETTER
1487                        // A, through to U+0046 LATIN CAPITAL LETTER F (in other
1488                        // words, 0-9, A-F, a-f).
1489                        $char = 1;
1490                        $char_class = '0-9A-Fa-f';
1491                        break;
1492
1493                    // Anything else
1494                    default:
1495                        // Follow the steps below, but using the range of
1496                        // characters U+0030 DIGIT ZERO through to U+0039 DIGIT
1497                        // NINE (i.e. just 0-9).
1498                        $char = 0;
1499                        $char_class = '0-9';
1500                        break;
1501                }
1502
1503                // Consume as many characters as match the range of characters
1504                // given above.
1505                $this->char++;
1506                $e_name = $this->characters($char_class, $this->char + $char + 1);
1507                $entity = $this->character($start, $this->char);
1508                $cond = strlen($e_name) > 0;
1509
1510                // The rest of the parsing happens below.
1511                break;
1512
1513            // Anything else
1514            default:
1515                // Consume the maximum number of characters possible, with the
1516                // consumed characters case-sensitively matching one of the
1517                // identifiers in the first column of the entities table.
1518
1519                $e_name = $this->characters('0-9A-Za-z;', $this->char + 1);
1520                $len = strlen($e_name);
1521
1522                for ($c = 1; $c <= $len; $c++) {
1523                    $id = substr($e_name, 0, $c);
1524                    $this->char++;
1525
1526                    if (in_array($id, $this->entities)) {
1527                        if ($e_name[$c - 1] !== ';') {
1528                            if ($c < $len && $e_name[$c] == ';') {
1529                                $this->char++; // consume extra semicolon
1530                            }
1531                        }
1532                        $entity = $id;
1533                        break;
1534                    }
1535                }
1536
1537                $cond = isset($entity);
1538                // The rest of the parsing happens below.
1539                break;
1540        }
1541
1542        if (!$cond) {
1543            // If no match can be made, then this is a parse error. No
1544            // characters are consumed, and nothing is returned.
1545            $this->char = $start;
1546            return false;
1547        }
1548
1549        // Return a character token for the character corresponding to the
1550        // entity name (as given by the second column of the entities table).
1551        return html_entity_decode('&' . rtrim($entity, ';') . ';', ENT_QUOTES, 'UTF-8');
1552    }
1553
1554    private function emitToken($token)
1555    {
1556        $emit = $this->tree->emitToken($token);
1557
1558        if (is_int($emit)) {
1559            $this->content_model = $emit;
1560
1561        } elseif ($token['type'] === self::ENDTAG) {
1562            $this->content_model = self::PCDATA;
1563        }
1564    }
1565
1566    private function EOF()
1567    {
1568        $this->state = null;
1569        $this->tree->emitToken(
1570            array(
1571                'type' => self::EOF
1572            )
1573        );
1574    }
1575}
1576
1577class HTML5TreeConstructer
1578{
1579    public $stack = array();
1580
1581    private $phase;
1582    private $mode;
1583    private $dom;
1584    private $foster_parent = null;
1585    private $a_formatting = array();
1586
1587    private $head_pointer = null;
1588    private $form_pointer = null;
1589
1590    private $scoping = array('button', 'caption', 'html', 'marquee', 'object', 'table', 'td', 'th');
1591    private $formatting = array(
1592        'a',
1593        'b',
1594        'big',
1595        'em',
1596        'font',
1597        'i',
1598        'nobr',
1599        's',
1600        'small',
1601        'strike',
1602        'strong',
1603        'tt',
1604        'u'
1605    );
1606    private $special = array(
1607        'address',
1608        'area',
1609        'base',
1610        'basefont',
1611        'bgsound',
1612        'blockquote',
1613        'body',
1614        'br',
1615        'center',
1616        'col',
1617        'colgroup',
1618        'dd',
1619        'dir',
1620        'div',
1621        'dl',
1622        'dt',
1623        'embed',
1624        'fieldset',
1625        'form',
1626        'frame',
1627        'frameset',
1628        'h1',
1629        'h2',
1630        'h3',
1631        'h4',
1632        'h5',
1633        'h6',
1634        'head',
1635        'hr',
1636        'iframe',
1637        'image',
1638        'img',
1639        'input',
1640        'isindex',
1641        'li',
1642        'link',
1643        'listing',
1644        'menu',
1645        'meta',
1646        'noembed',
1647        'noframes',
1648        'noscript',
1649        'ol',
1650        'optgroup',
1651        'option',
1652        'p',
1653        'param',
1654        'plaintext',
1655        'pre',
1656        'script',
1657        'select',
1658        'spacer',
1659        'style',
1660        'tbody',
1661        'textarea',
1662        'tfoot',
1663        'thead',
1664        'title',
1665        'tr',
1666        'ul',
1667        'wbr'
1668    );
1669
1670    // The different phases.
1671    const INIT_PHASE = 0;
1672    const ROOT_PHASE = 1;
1673    const MAIN_PHASE = 2;
1674    const END_PHASE = 3;
1675
1676    // The different insertion modes for the main phase.
1677    const BEFOR_HEAD = 0;
1678    const IN_HEAD = 1;
1679    const AFTER_HEAD = 2;
1680    const IN_BODY = 3;
1681    const IN_TABLE = 4;
1682    const IN_CAPTION = 5;
1683    const IN_CGROUP = 6;
1684    const IN_TBODY = 7;
1685    const IN_ROW = 8;
1686    const IN_CELL = 9;
1687    const IN_SELECT = 10;
1688    const AFTER_BODY = 11;
1689    const IN_FRAME = 12;
1690    const AFTR_FRAME = 13;
1691
1692    // The different types of elements.
1693    const SPECIAL = 0;
1694    const SCOPING = 1;
1695    const FORMATTING = 2;
1696    const PHRASING = 3;
1697
1698    const MARKER = 0;
1699
1700    public function __construct()
1701    {
1702        $this->phase = self::INIT_PHASE;
1703        $this->mode = self::BEFOR_HEAD;
1704        $this->dom = new DOMDocument;
1705
1706        $this->dom->encoding = 'UTF-8';
1707        $this->dom->preserveWhiteSpace = true;
1708        $this->dom->substituteEntities = true;
1709        $this->dom->strictErrorChecking = false;
1710    }
1711
1712    // Process tag tokens
1713    public function emitToken($token)
1714    {
1715        switch ($this->phase) {
1716            case self::INIT_PHASE:
1717                return $this->initPhase($token);
1718                break;
1719            case self::ROOT_PHASE:
1720                return $this->rootElementPhase($token);
1721                break;
1722            case self::MAIN_PHASE:
1723                return $this->mainPhase($token);
1724                break;
1725            case self::END_PHASE :
1726                return $this->trailingEndPhase($token);
1727                break;
1728        }
1729    }
1730
1731    private function initPhase($token)
1732    {
1733        /* Initially, the tree construction stage must handle each token
1734        emitted from the tokenisation stage as follows: */
1735
1736        /* A DOCTYPE token that is marked as being in error
1737        A comment token
1738        A start tag token
1739        An end tag token
1740        A character token that is not one of one of U+0009 CHARACTER TABULATION,
1741            U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1742            or U+0020 SPACE
1743        An end-of-file token */
1744        if ((isset($token['error']) && $token['error']) ||
1745            $token['type'] === HTML5::COMMENT ||
1746            $token['type'] === HTML5::STARTTAG ||
1747            $token['type'] === HTML5::ENDTAG ||
1748            $token['type'] === HTML5::EOF ||
1749            ($token['type'] === HTML5::CHARACTR && isset($token['data']) &&
1750                !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']))
1751        ) {
1752            /* This specification does not define how to handle this case. In
1753            particular, user agents may ignore the entirety of this specification
1754            altogether for such documents, and instead invoke special parse modes
1755            with a greater emphasis on backwards compatibility. */
1756
1757            $this->phase = self::ROOT_PHASE;
1758            return $this->rootElementPhase($token);
1759
1760            /* A DOCTYPE token marked as being correct */
1761        } elseif (isset($token['error']) && !$token['error']) {
1762            /* Append a DocumentType node to the Document  node, with the name
1763            attribute set to the name given in the DOCTYPE token (which will be
1764            "HTML"), and the other attributes specific to DocumentType objects
1765            set to null, empty lists, or the empty string as appropriate. */
1766            $doctype = new DOMDocumentType(null, null, 'HTML');
1767
1768            /* Then, switch to the root element phase of the tree construction
1769            stage. */
1770            $this->phase = self::ROOT_PHASE;
1771
1772            /* A character token that is one of one of U+0009 CHARACTER TABULATION,
1773            U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1774            or U+0020 SPACE */
1775        } elseif (isset($token['data']) && preg_match(
1776                '/^[\t\n\x0b\x0c ]+$/',
1777                $token['data']
1778            )
1779        ) {
1780            /* Append that character  to the Document node. */
1781            $text = $this->dom->createTextNode($token['data']);
1782            $this->dom->appendChild($text);
1783        }
1784    }
1785
1786    private function rootElementPhase($token)
1787    {
1788        /* After the initial phase, as each token is emitted from the tokenisation
1789        stage, it must be processed as described in this section. */
1790
1791        /* A DOCTYPE token */
1792        if ($token['type'] === HTML5::DOCTYPE) {
1793            // Parse error. Ignore the token.
1794
1795            /* A comment token */
1796        } elseif ($token['type'] === HTML5::COMMENT) {
1797            /* Append a Comment node to the Document object with the data
1798            attribute set to the data given in the comment token. */
1799            $comment = $this->dom->createComment($token['data']);
1800            $this->dom->appendChild($comment);
1801
1802            /* A character token that is one of one of U+0009 CHARACTER TABULATION,
1803            U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1804            or U+0020 SPACE */
1805        } elseif ($token['type'] === HTML5::CHARACTR &&
1806            preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
1807        ) {
1808            /* Append that character  to the Document node. */
1809            $text = $this->dom->createTextNode($token['data']);
1810            $this->dom->appendChild($text);
1811
1812            /* A character token that is not one of U+0009 CHARACTER TABULATION,
1813                U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED
1814                (FF), or U+0020 SPACE
1815            A start tag token
1816            An end tag token
1817            An end-of-file token */
1818        } elseif (($token['type'] === HTML5::CHARACTR &&
1819                !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) ||
1820            $token['type'] === HTML5::STARTTAG ||
1821            $token['type'] === HTML5::ENDTAG ||
1822            $token['type'] === HTML5::EOF
1823        ) {
1824            /* Create an HTMLElement node with the tag name html, in the HTML
1825            namespace. Append it to the Document object. Switch to the main
1826            phase and reprocess the current token. */
1827            $html = $this->dom->createElement('html');
1828            $this->dom->appendChild($html);
1829            $this->stack[] = $html;
1830
1831            $this->phase = self::MAIN_PHASE;
1832            return $this->mainPhase($token);
1833        }
1834    }
1835
1836    private function mainPhase($token)
1837    {
1838        /* Tokens in the main phase must be handled as follows: */
1839
1840        /* A DOCTYPE token */
1841        if ($token['type'] === HTML5::DOCTYPE) {
1842            // Parse error. Ignore the token.
1843
1844            /* A start tag token with the tag name "html" */
1845        } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'html') {
1846            /* If this start tag token was not the first start tag token, then
1847            it is a parse error. */
1848
1849            /* For each attribute on the token, check to see if the attribute
1850            is already present on the top element of the stack of open elements.
1851            If it is not, add the attribute and its corresponding value to that
1852            element. */
1853            foreach ($token['attr'] as $attr) {
1854                if (!$this->stack[0]->hasAttribute($attr['name'])) {
1855                    $this->stack[0]->setAttribute($attr['name'], $attr['value']);
1856                }
1857            }
1858
1859            /* An end-of-file token */
1860        } elseif ($token['type'] === HTML5::EOF) {
1861            /* Generate implied end tags. */
1862            $this->generateImpliedEndTags();
1863
1864            /* Anything else. */
1865        } else {
1866            /* Depends on the insertion mode: */
1867            switch ($this->mode) {
1868                case self::BEFOR_HEAD:
1869                    return $this->beforeHead($token);
1870                    break;
1871                case self::IN_HEAD:
1872                    return $this->inHead($token);
1873                    break;
1874                case self::AFTER_HEAD:
1875                    return $this->afterHead($token);
1876                    break;
1877                case self::IN_BODY:
1878                    return $this->inBody($token);
1879                    break;
1880                case self::IN_TABLE:
1881                    return $this->inTable($token);
1882                    break;
1883                case self::IN_CAPTION:
1884                    return $this->inCaption($token);
1885                    break;
1886                case self::IN_CGROUP:
1887                    return $this->inColumnGroup($token);
1888                    break;
1889                case self::IN_TBODY:
1890                    return $this->inTableBody($token);
1891                    break;
1892                case self::IN_ROW:
1893                    return $this->inRow($token);
1894                    break;
1895                case self::IN_CELL:
1896                    return $this->inCell($token);
1897                    break;
1898                case self::IN_SELECT:
1899                    return $this->inSelect($token);
1900                    break;
1901                case self::AFTER_BODY:
1902                    return $this->afterBody($token);
1903                    break;
1904                case self::IN_FRAME:
1905                    return $this->inFrameset($token);
1906                    break;
1907                case self::AFTR_FRAME:
1908                    return $this->afterFrameset($token);
1909                    break;
1910                case self::END_PHASE:
1911                    return $this->trailingEndPhase($token);
1912                    break;
1913            }
1914        }
1915    }
1916
1917    private function beforeHead($token)
1918    {
1919        /* Handle the token as follows: */
1920
1921        /* A character token that is one of one of U+0009 CHARACTER TABULATION,
1922        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1923        or U+0020 SPACE */
1924        if ($token['type'] === HTML5::CHARACTR &&
1925            preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
1926        ) {
1927            /* Append the character to the current node. */
1928            $this->insertText($token['data']);
1929
1930            /* A comment token */
1931        } elseif ($token['type'] === HTML5::COMMENT) {
1932            /* Append a Comment node to the current node with the data attribute
1933            set to the data given in the comment token. */
1934            $this->insertComment($token['data']);
1935
1936            /* A start tag token with the tag name "head" */
1937        } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'head') {
1938            /* Create an element for the token, append the new element to the
1939            current node and push it onto the stack of open elements. */
1940            $element = $this->insertElement($token);
1941
1942            /* Set the head element pointer to this new element node. */
1943            $this->head_pointer = $element;
1944
1945            /* Change the insertion mode to "in head". */
1946            $this->mode = self::IN_HEAD;
1947
1948            /* A start tag token whose tag name is one of: "base", "link", "meta",
1949            "script", "style", "title". Or an end tag with the tag name "html".
1950            Or a character token that is not one of U+0009 CHARACTER TABULATION,
1951            U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1952            or U+0020 SPACE. Or any other start tag token */
1953        } elseif ($token['type'] === HTML5::STARTTAG ||
1954            ($token['type'] === HTML5::ENDTAG && $token['name'] === 'html') ||
1955            ($token['type'] === HTML5::CHARACTR && !preg_match(
1956                    '/^[\t\n\x0b\x0c ]$/',
1957                    $token['data']
1958                ))
1959        ) {
1960            /* Act as if a start tag token with the tag name "head" and no
1961            attributes had been seen, then reprocess the current token. */
1962            $this->beforeHead(
1963                array(
1964                    'name' => 'head',
1965                    'type' => HTML5::STARTTAG,
1966                    'attr' => array()
1967                )
1968            );
1969
1970            return $this->inHead($token);
1971
1972            /* Any other end tag */
1973        } elseif ($token['type'] === HTML5::ENDTAG) {
1974            /* Parse error. Ignore the token. */
1975        }
1976    }
1977
1978    private function inHead($token)
1979    {
1980        /* Handle the token as follows: */
1981
1982        /* A character token that is one of one of U+0009 CHARACTER TABULATION,
1983        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1984        or U+0020 SPACE.
1985
1986        THIS DIFFERS FROM THE SPEC: If the current node is either a title, style
1987        or script element, append the character to the current node regardless
1988        of its content. */
1989        if (($token['type'] === HTML5::CHARACTR &&
1990                preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) || (
1991                $token['type'] === HTML5::CHARACTR && in_array(
1992                    end($this->stack)->nodeName,
1993                    array('title', 'style', 'script')
1994                ))
1995        ) {
1996            /* Append the character to the current node. */
1997            $this->insertText($token['data']);
1998
1999            /* A comment token */
2000        } elseif ($token['type'] === HTML5::COMMENT) {
2001            /* Append a Comment node to the current node with the data attribute
2002            set to the data given in the comment token. */
2003            $this->insertComment($token['data']);
2004
2005        } elseif ($token['type'] === HTML5::ENDTAG &&
2006            in_array($token['name'], array('title', 'style', 'script'))
2007        ) {
2008            array_pop($this->stack);
2009            return HTML5::PCDATA;
2010
2011            /* A start tag with the tag name "title" */
2012        } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'title') {
2013            /* Create an element for the token and append the new element to the
2014            node pointed to by the head element pointer, or, if that is null
2015            (innerHTML case), to the current node. */
2016            if ($this->head_pointer !== null) {
2017                $element = $this->insertElement($token, false);
2018                $this->head_pointer->appendChild($element);
2019
2020            } else {
2021                $element = $this->insertElement($token);
2022            }
2023
2024            /* Switch the tokeniser's content model flag  to the RCDATA state. */
2025            return HTML5::RCDATA;
2026
2027            /* A start tag with the tag name "style" */
2028        } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'style') {
2029            /* Create an element for the token and append the new element to the
2030            node pointed to by the head element pointer, or, if that is null
2031            (innerHTML case), to the current node. */
2032            if ($this->head_pointer !== null) {
2033                $element = $this->insertElement($token, false);
2034                $this->head_pointer->appendChild($element);
2035
2036            } else {
2037                $this->insertElement($token);
2038            }
2039
2040            /* Switch the tokeniser's content model flag  to the CDATA state. */
2041            return HTML5::CDATA;
2042
2043            /* A start tag with the tag name "script" */
2044        } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'script') {
2045            /* Create an element for the token. */
2046            $element = $this->insertElement($token, false);
2047            $this->head_pointer->appendChild($element);
2048
2049            /* Switch the tokeniser's content model flag  to the CDATA state. */
2050            return HTML5::CDATA;
2051
2052            /* A start tag with the tag name "base", "link", or "meta" */
2053        } elseif ($token['type'] === HTML5::STARTTAG && in_array(
2054                $token['name'],
2055                array('base', 'link', 'meta')
2056            )
2057        ) {
2058            /* Create an element for the token and append the new element to the
2059            node pointed to by the head element pointer, or, if that is null
2060            (innerHTML case), to the current node. */
2061            if ($this->head_pointer !== null) {
2062                $element = $this->insertElement($token, false);
2063                $this->head_pointer->appendChild($element);
2064                array_pop($this->stack);
2065
2066            } else {
2067                $this->insertElement($token);
2068            }
2069
2070            /* An end tag with the tag name "head" */
2071        } elseif ($token['type'] === HTML5::ENDTAG && $token['name'] === 'head') {
2072            /* If the current node is a head element, pop the current node off
2073            the stack of open elements. */
2074            if ($this->head_pointer->isSameNode(end($this->stack))) {
2075                array_pop($this->stack);
2076
2077                /* Otherwise, this is a parse error. */
2078            } else {
2079                // k
2080            }
2081
2082            /* Change the insertion mode to "after head". */
2083            $this->mode = self::AFTER_HEAD;
2084
2085            /* A start tag with the tag name "head" or an end tag except "html". */
2086        } elseif (($token['type'] === HTML5::STARTTAG && $token['name'] === 'head') ||
2087            ($token['type'] === HTML5::ENDTAG && $token['name'] !== 'html')
2088        ) {
2089            // Parse error. Ignore the token.
2090
2091            /* Anything else */
2092        } else {
2093            /* If the current node is a head element, act as if an end tag
2094            token with the tag name "head" had been seen. */
2095            if ($this->head_pointer->isSameNode(end($this->stack))) {
2096                $this->inHead(
2097                    array(
2098                        'name' => 'head',
2099                        'type' => HTML5::ENDTAG
2100                    )
2101                );
2102
2103                /* Otherwise, change the insertion mode to "after head". */
2104            } else {
2105                $this->mode = self::AFTER_HEAD;
2106            }
2107
2108            /* Then, reprocess the current token. */
2109            return $this->afterHead($token);
2110        }
2111    }
2112
2113    private function afterHead($token)
2114    {
2115        /* Handle the token as follows: */
2116
2117        /* A character token that is one of one of U+0009 CHARACTER TABULATION,
2118        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
2119        or U+0020 SPACE */
2120        if ($token['type'] === HTML5::CHARACTR &&
2121            preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
2122        ) {
2123            /* Append the character to the current node. */
2124            $this->insertText($token['data']);
2125
2126            /* A comment token */
2127        } elseif ($token['type'] === HTML5::COMMENT) {
2128            /* Append a Comment node to the current node with the data attribute
2129            set to the data given in the comment token. */
2130            $this->insertComment($token['data']);
2131
2132            /* A start tag token with the tag name "body" */
2133        } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'body') {
2134            /* Insert a body element for the token. */
2135            $this->insertElement($token);
2136
2137            /* Change the insertion mode to "in body". */
2138            $this->mode = self::IN_BODY;
2139
2140            /* A start tag token with the tag name "frameset" */
2141        } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'frameset') {
2142            /* Insert a frameset element for the token. */
2143            $this->insertElement($token);
2144
2145            /* Change the insertion mode to "in frameset". */
2146            $this->mode = self::IN_FRAME;
2147
2148            /* A start tag token whose tag name is one of: "base", "link", "meta",
2149            "script", "style", "title" */
2150        } elseif ($token['type'] === HTML5::STARTTAG && in_array(
2151                $token['name'],
2152                array('base', 'link', 'meta', 'script', 'style', 'title')
2153            )
2154        ) {
2155            /* Parse error. Switch the insertion mode back to "in head" and
2156            reprocess the token. */
2157            $this->mode = self::IN_HEAD;
2158            return $this->inHead($token);
2159
2160            /* Anything else */
2161        } else {
2162            /* Act as if a start tag token with the tag name "body" and no
2163            attributes had been seen, and then reprocess the current token. */
2164            $this->afterHead(
2165                array(
2166                    'name' => 'body',
2167                    'type' => HTML5::STARTTAG,
2168                    'attr' => array()
2169                )
2170            );
2171
2172            return $this->inBody($token);
2173        }
2174    }
2175
2176    private function inBody($token)
2177    {
2178        /* Handle the token as follows: */
2179
2180        switch ($token['type']) {
2181            /* A character token */
2182            case HTML5::CHARACTR:
2183                /* Reconstruct the active formatting elements, if any. */
2184                $this->reconstructActiveFormattingElements();
2185
2186                /* Append the token's character to the current node. */
2187                $this->insertText($token['data']);
2188                break;
2189
2190            /* A comment token */
2191            case HTML5::COMMENT:
2192                /* Append a Comment node to the current node with the data
2193                attribute set to the data given in the comment token. */
2194                $this->insertComment($token['data']);
2195                break;
2196
2197            case HTML5::STARTTAG:
2198                switch ($token['name']) {
2199                    /* A start tag token whose tag name is one of: "script",
2200                    "style" */
2201                    case 'script':
2202                    case 'style':
2203                        /* Process the token as if the insertion mode had been "in
2204                        head". */
2205                        return $this->inHead($token);
2206                        break;
2207
2208                    /* A start tag token whose tag name is one of: "base", "link",
2209                    "meta", "title" */
2210                    case 'base':
2211                    case 'link':
2212                    case 'meta':
2213                    case 'title':
2214                        /* Parse error. Process the token as if the insertion mode
2215                        had    been "in head". */
2216                        return $this->inHead($token);
2217                        break;
2218
2219                    /* A start tag token with the tag name "body" */
2220                    case 'body':
2221                        /* Parse error. If the second element on the stack of open
2222                        elements is not a body element, or, if the stack of open
2223                        elements has only one node on it, then ignore the token.
2224                        (innerHTML case) */
2225                        if (count($this->stack) === 1 || $this->stack[1]->nodeName !== 'body') {
2226                            // Ignore
2227
2228                            /* Otherwise, for each attribute on the token, check to see
2229                            if the attribute is already present on the body element (the
2230                            second element)    on the stack of open elements. If it is not,
2231                            add the attribute and its corresponding value to that
2232                            element. */
2233                        } else {
2234                            foreach ($token['attr'] as $attr) {
2235                                if (!$this->stack[1]->hasAttribute($attr['name'])) {
2236                                    $this->stack[1]->setAttribute($attr['name'], $attr['value']);
2237                                }
2238                            }
2239                        }
2240                        break;
2241
2242                    /* A start tag whose tag name is one of: "address",
2243                    "blockquote", "center", "dir", "div", "dl", "fieldset",
2244                    "listing", "menu", "ol", "p", "ul" */
2245                    case 'address':
2246                    case 'blockquote':
2247                    case 'center':
2248                    case 'dir':
2249                    case 'div':
2250                    case 'dl':
2251                    case 'fieldset':
2252                    case 'listing':
2253                    case 'menu':
2254                    case 'ol':
2255                    case 'p':
2256                    case 'ul':
2257                        /* If the stack of open elements has a p element in scope,
2258                        then act as if an end tag with the tag name p had been
2259                        seen. */
2260                        if ($this->elementInScope('p')) {
2261                            $this->emitToken(
2262                                array(
2263                                    'name' => 'p',
2264                                    'type' => HTML5::ENDTAG
2265                                )
2266                            );
2267                        }
2268
2269                        /* Insert an HTML element for the token. */
2270                        $this->insertElement($token);
2271                        break;
2272
2273                    /* A start tag whose tag name is "form" */
2274                    case 'form':
2275                        /* If the form element pointer is not null, ignore the
2276                        token with a parse error. */
2277                        if ($this->form_pointer !== null) {
2278                            // Ignore.
2279
2280                            /* Otherwise: */
2281                        } else {
2282                            /* If the stack of open elements has a p element in
2283                            scope, then act as if an end tag with the tag name p
2284                            had been seen. */
2285                            if ($this->elementInScope('p')) {
2286                                $this->emitToken(
2287                                    array(
2288                                        'name' => 'p',
2289                                        'type' => HTML5::ENDTAG
2290                                    )
2291                                );
2292                            }
2293
2294                            /* Insert an HTML element for the token, and set the
2295                            form element pointer to point to the element created. */
2296                            $element = $this->insertElement($token);
2297                            $this->form_pointer = $element;
2298                        }
2299                        break;
2300
2301                    /* A start tag whose tag name is "li", "dd" or "dt" */
2302                    case 'li':
2303                    case 'dd':
2304                    case 'dt':
2305                        /* If the stack of open elements has a p  element in scope,
2306                        then act as if an end tag with the tag name p had been
2307                        seen. */
2308                        if ($this->elementInScope('p')) {
2309                            $this->emitToken(
2310                                array(
2311                                    'name' => 'p',
2312                                    'type' => HTML5::ENDTAG
2313                                )
2314                            );
2315                        }
2316
2317                        $stack_length = count($this->stack) - 1;
2318
2319                        for ($n = $stack_length; 0 <= $n; $n--) {
2320                            /* 1. Initialise node to be the current node (the
2321                            bottommost node of the stack). */
2322                            $stop = false;
2323                            $node = $this->stack[$n];
2324                            $cat = $this->getElementCategory($node->tagName);
2325
2326                            /* 2. If node is an li, dd or dt element, then pop all
2327                            the    nodes from the current node up to node, including
2328                            node, then stop this algorithm. */
2329                            if ($token['name'] === $node->tagName || ($token['name'] !== 'li'
2330                                    && ($node->tagName === 'dd' || $node->tagName === 'dt'))
2331                            ) {
2332                                for ($x = $stack_length; $x >= $n; $x--) {
2333                                    array_pop($this->stack);
2334                                }
2335
2336                                break;
2337                            }
2338
2339                            /* 3. If node is not in the formatting category, and is
2340                            not    in the phrasing category, and is not an address or
2341                            div element, then stop this algorithm. */
2342                            if ($cat !== self::FORMATTING && $cat !== self::PHRASING &&
2343                                $node->tagName !== 'address' && $node->tagName !== 'div'
2344                            ) {
2345                                break;
2346                            }
2347                        }
2348
2349                        /* Finally, insert an HTML element with the same tag
2350                        name as the    token's. */
2351                        $this->insertElement($token);
2352                        break;
2353
2354                    /* A start tag token whose tag name is "plaintext" */
2355                    case 'plaintext':
2356                        /* If the stack of open elements has a p  element in scope,
2357                        then act as if an end tag with the tag name p had been
2358                        seen. */
2359                        if ($this->elementInScope('p')) {
2360                            $this->emitToken(
2361                                array(
2362                                    'name' => 'p',
2363                                    'type' => HTML5::ENDTAG
2364                                )
2365                            );
2366                        }
2367
2368                        /* Insert an HTML element for the token. */
2369                        $this->insertElement($token);
2370
2371                        return HTML5::PLAINTEXT;
2372                        break;
2373
2374                    /* A start tag whose tag name is one of: "h1", "h2", "h3", "h4",
2375                    "h5", "h6" */
2376                    case 'h1':
2377                    case 'h2':
2378                    case 'h3':
2379                    case 'h4':
2380                    case 'h5':
2381                    case 'h6':
2382                        /* If the stack of open elements has a p  element in scope,
2383                        then act as if an end tag with the tag name p had been seen. */
2384                        if ($this->elementInScope('p')) {
2385                            $this->emitToken(
2386                                array(
2387                                    'name' => 'p',
2388                                    'type' => HTML5::ENDTAG
2389                                )
2390                            );
2391                        }
2392
2393                        /* If the stack of open elements has in scope an element whose
2394                        tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then
2395                        this is a parse error; pop elements from the stack until an
2396                        element with one of those tag names has been popped from the
2397                        stack. */
2398                        while ($this->elementInScope(array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))) {
2399                            array_pop($this->stack);
2400                        }
2401
2402                        /* Insert an HTML element for the token. */
2403                        $this->insertElement($token);
2404                        break;
2405
2406                    /* A start tag whose tag name is "a" */
2407                    case 'a':
2408                        /* If the list of active formatting elements contains
2409                        an element whose tag name is "a" between the end of the
2410                        list and the last marker on the list (or the start of
2411                        the list if there is no marker on the list), then this
2412                        is a parse error; act as if an end tag with the tag name
2413                        "a" had been seen, then remove that element from the list
2414                        of active formatting elements and the stack of open
2415                        elements if the end tag didn't already remove it (it
2416                        might not have if the element is not in table scope). */
2417                        $leng = count($this->a_formatting);
2418
2419                        for ($n = $leng - 1; $n >= 0; $n--) {
2420                            if ($this->a_formatting[$n] === self::MARKER) {
2421                                break;
2422
2423                            } elseif ($this->a_formatting[$n]->nodeName === 'a') {
2424                                $this->emitToken(
2425                                    array(
2426                                        'name' => 'a',
2427                                        'type' => HTML5::ENDTAG
2428                                    )
2429                                );
2430                                break;
2431                            }
2432                        }
2433
2434                        /* Reconstruct the active formatting elements, if any. */
2435                        $this->reconstructActiveFormattingElements();
2436
2437                        /* Insert an HTML element for the token. */
2438                        $el = $this->insertElement($token);
2439
2440                        /* Add that element to the list of active formatting
2441                        elements. */
2442                        $this->a_formatting[] = $el;
2443                        break;
2444
2445                    /* A start tag whose tag name is one of: "b", "big", "em", "font",
2446                    "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
2447                    case 'b':
2448                    case 'big':
2449                    case 'em':
2450                    case 'font':
2451                    case 'i':
2452                    case 'nobr':
2453                    case 's':
2454                    case 'small':
2455                    case 'strike':
2456                    case 'strong':
2457                    case 'tt':
2458                    case 'u':
2459                        /* Reconstruct the active formatting elements, if any. */
2460                        $this->reconstructActiveFormattingElements();
2461
2462                        /* Insert an HTML element for the token. */
2463                        $el = $this->insertElement($token);
2464
2465                        /* Add that element to the list of active formatting
2466                        elements. */
2467                        $this->a_formatting[] = $el;
2468                        break;
2469
2470                    /* A start tag token whose tag name is "button" */
2471                    case 'button':
2472                        /* If the stack of open elements has a button element in scope,
2473                        then this is a parse error; act as if an end tag with the tag
2474                        name "button" had been seen, then reprocess the token. (We don't
2475                        do that. Unnecessary.) */
2476                        if ($this->elementInScope('button')) {
2477                            $this->inBody(
2478                                array(
2479                                    'name' => 'button',
2480                                    'type' => HTML5::ENDTAG
2481                                )
2482                            );
2483                        }
2484
2485                        /* Reconstruct the active formatting elements, if any. */
2486                        $this->reconstructActiveFormattingElements();
2487
2488                        /* Insert an HTML element for the token. */
2489                        $this->insertElement($token);
2490
2491                        /* Insert a marker at the end of the list of active
2492                        formatting elements. */
2493                        $this->a_formatting[] = self::MARKER;
2494                        break;
2495
2496                    /* A start tag token whose tag name is one of: "marquee", "object" */
2497                    case 'marquee':
2498                    case 'object':
2499                        /* Reconstruct the active formatting elements, if any. */
2500                        $this->reconstructActiveFormattingElements();
2501
2502                        /* Insert an HTML element for the token. */
2503                        $this->insertElement($token);
2504
2505                        /* Insert a marker at the end of the list of active
2506                        formatting elements. */
2507                        $this->a_formatting[] = self::MARKER;
2508                        break;
2509
2510                    /* A start tag token whose tag name is "xmp" */
2511                    case 'xmp':
2512                        /* Reconstruct the active formatting elements, if any. */
2513                        $this->reconstructActiveFormattingElements();
2514
2515                        /* Insert an HTML element for the token. */
2516                        $this->insertElement($token);
2517
2518                        /* Switch the content model flag to the CDATA state. */
2519                        return HTML5::CDATA;
2520                        break;
2521
2522                    /* A start tag whose tag name is "table" */
2523                    case 'table':
2524                        /* If the stack of open elements has a p element in scope,
2525                        then act as if an end tag with the tag name p had been seen. */
2526                        if ($this->elementInScope('p')) {
2527                            $this->emitToken(
2528                                array(
2529                                    'name' => 'p',
2530                                    'type' => HTML5::ENDTAG
2531                                )
2532                            );
2533                        }
2534
2535                        /* Insert an HTML element for the token. */
2536                        $this->insertElement($token);
2537
2538                        /* Change the insertion mode to "in table". */
2539                        $this->mode = self::IN_TABLE;
2540                        break;
2541
2542                    /* A start tag whose tag name is one of: "area", "basefont",
2543                    "bgsound", "br", "embed", "img", "param", "spacer", "wbr" */
2544                    case 'area':
2545                    case 'basefont':
2546                    case 'bgsound':
2547                    case 'br':
2548                    case 'embed':
2549                    case 'img':
2550                    case 'param':
2551                    case 'spacer':
2552                    case 'wbr':
2553                        /* Reconstruct the active formatting elements, if any. */
2554                        $this->reconstructActiveFormattingElements();
2555
2556                        /* Insert an HTML element for the token. */
2557                        $this->insertElement($token);
2558
2559                        /* Immediately pop the current node off the stack of open elements. */
2560                        array_pop($this->stack);
2561                        break;
2562
2563                    /* A start tag whose tag name is "hr" */
2564                    case 'hr':
2565                        /* If the stack of open elements has a p element in scope,
2566                        then act as if an end tag with the tag name p had been seen. */
2567                        if ($this->elementInScope('p')) {
2568                            $this->emitToken(
2569                                array(
2570                                    'name' => 'p',
2571                                    'type' => HTML5::ENDTAG
2572                                )
2573                            );
2574                        }
2575
2576                        /* Insert an HTML element for the token. */
2577                        $this->insertElement($token);
2578
2579                        /* Immediately pop the current node off the stack of open elements. */
2580                        array_pop($this->stack);
2581                        break;
2582
2583                    /* A start tag whose tag name is "image" */
2584                    case 'image':
2585                        /* Parse error. Change the token's tag name to "img" and
2586                        reprocess it. (Don't ask.) */
2587                        $token['name'] = 'img';
2588                        return $this->inBody($token);
2589                        break;
2590
2591                    /* A start tag whose tag name is "input" */
2592                    case 'input':
2593                        /* Reconstruct the active formatting elements, if any. */
2594                        $this->reconstructActiveFormattingElements();
2595
2596                        /* Insert an input element for the token. */
2597                        $element = $this->insertElement($token, false);
2598
2599                        /* If the form element pointer is not null, then associate the
2600                        input element with the form element pointed to by the form
2601                        element pointer. */
2602                        $this->form_pointer !== null
2603                            ? $this->form_pointer->appendChild($element)
2604                            : end($this->stack)->appendChild($element);
2605
2606                        /* Pop that input element off the stack of open elements. */
2607                        array_pop($this->stack);
2608                        break;
2609
2610                    /* A start tag whose tag name is "isindex" */
2611                    case 'isindex':
2612                        /* Parse error. */
2613                        // w/e
2614
2615                        /* If the form element pointer is not null,
2616                        then ignore the token. */
2617                        if ($this->form_pointer === null) {
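                            /* Act as if a start tag token with the tag name "form" had
                            been seen.
                            NOTE(review): the token built just below is named 'body',
                            not 'form' as this step describes -- looks like a bug;
                            confirm against the spec before relying on it. */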
2620                            $this->inBody(
2621                                array(
2622                                    'name' => 'body',
2623                                    'type' => HTML5::STARTTAG,
2624                                    'attr' => array()
2625                                )
2626                            );
2627
2628                            /* Act as if a start tag token with the tag name "hr" had
2629                            been seen. */
2630                            $this->inBody(
2631                                array(
2632                                    'name' => 'hr',
2633                                    'type' => HTML5::STARTTAG,
2634                                    'attr' => array()
2635                                )
2636                            );
2637
2638                            /* Act as if a start tag token with the tag name "p" had
2639                            been seen. */
2640                            $this->inBody(
2641                                array(
2642                                    'name' => 'p',
2643                                    'type' => HTML5::STARTTAG,
2644                                    'attr' => array()
2645                                )
2646                            );
2647
2648                            /* Act as if a start tag token with the tag name "label"
2649                            had been seen. */
2650                            $this->inBody(
2651                                array(
2652                                    'name' => 'label',
2653                                    'type' => HTML5::STARTTAG,
2654                                    'attr' => array()
2655                                )
2656                            );
2657
2658                            /* Act as if a stream of character tokens had been seen. */
2659                            $this->insertText(
2660                                'This is a searchable index. ' .
2661                                'Insert your search keywords here: '
2662                            );
2663
2664                            /* Act as if a start tag token with the tag name "input"
2665                            had been seen, with all the attributes from the "isindex"
2666                            token, except with the "name" attribute set to the value
2667                            "isindex" (ignoring any explicit "name" attribute). */
2668                            $attr = $token['attr'];
2669                            $attr[] = array('name' => 'name', 'value' => 'isindex');
2670
2671                            $this->inBody(
2672                                array(
2673                                    'name' => 'input',
2674                                    'type' => HTML5::STARTTAG,
2675                                    'attr' => $attr
2676                                )
2677                            );
2678
2679                            /* Act as if a stream of character tokens had been seen
2680                            (see below for what they should say). */
2681                            $this->insertText(
2682                                'This is a searchable index. ' .
2683                                'Insert your search keywords here: '
2684                            );
2685
2686                            /* Act as if an end tag token with the tag name "label"
2687                            had been seen. */
2688                            $this->inBody(
2689                                array(
2690                                    'name' => 'label',
2691                                    'type' => HTML5::ENDTAG
2692                                )
2693                            );
2694
2695                            /* Act as if an end tag token with the tag name "p" had
2696                            been seen. */
2697                            $this->inBody(
2698                                array(
2699                                    'name' => 'p',
2700                                    'type' => HTML5::ENDTAG
2701                                )
2702                            );
2703
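                            /* Act as if a start tag token with the tag name "hr" had
                            been seen.
                            NOTE(review): the token built just below has type
                            HTML5::ENDTAG, not HTML5::STARTTAG as this step
                            describes -- confirm which one is intended. */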
2706                            $this->inBody(
2707                                array(
2708                                    'name' => 'hr',
2709                                    'type' => HTML5::ENDTAG
2710                                )
2711                            );
2712
2713                            /* Act as if an end tag token with the tag name "form" had
2714                            been seen. */
2715                            $this->inBody(
2716                                array(
2717                                    'name' => 'form',
2718                                    'type' => HTML5::ENDTAG
2719                                )
2720                            );
2721                        }
2722                        break;
2723
2724                    /* A start tag whose tag name is "textarea" */
2725                    case 'textarea':
2726                        $this->insertElement($token);
2727
2728                        /* Switch the tokeniser's content model flag to the
2729                        RCDATA state. */
2730                        return HTML5::RCDATA;
2731                        break;
2732
2733                    /* A start tag whose tag name is one of: "iframe", "noembed",
2734                    "noframes" */
2735                    case 'iframe':
2736                    case 'noembed':
2737                    case 'noframes':
2738                        $this->insertElement($token);
2739
2740                        /* Switch the tokeniser's content model flag to the CDATA state. */
2741                        return HTML5::CDATA;
2742                        break;
2743
2744                    /* A start tag whose tag name is "select" */
2745                    case 'select':
2746                        /* Reconstruct the active formatting elements, if any. */
2747                        $this->reconstructActiveFormattingElements();
2748
2749                        /* Insert an HTML element for the token. */
2750                        $this->insertElement($token);
2751
2752                        /* Change the insertion mode to "in select". */
2753                        $this->mode = self::IN_SELECT;
2754                        break;
2755
2756                    /* A start or end tag whose tag name is one of: "caption", "col",
2757                    "colgroup", "frame", "frameset", "head", "option", "optgroup",
2758                    "tbody", "td", "tfoot", "th", "thead", "tr". */
2759                    case 'caption':
2760                    case 'col':
2761                    case 'colgroup':
2762                    case 'frame':
2763                    case 'frameset':
2764                    case 'head':
2765                    case 'option':
2766                    case 'optgroup':
2767                    case 'tbody':
2768                    case 'td':
2769                    case 'tfoot':
2770                    case 'th':
2771                    case 'thead':
2772                    case 'tr':
2773                        // Parse error. Ignore the token.
2774                        break;
2775
2776                    /* A start or end tag whose tag name is one of: "event-source",
2777                    "section", "nav", "article", "aside", "header", "footer",
2778                    "datagrid", "command" */
2779                    case 'event-source':
2780                    case 'section':
2781                    case 'nav':
2782                    case 'article':
2783                    case 'aside':
2784                    case 'header':
2785                    case 'footer':
2786                    case 'datagrid':
2787                    case 'command':
2788                        // Work in progress!
2789                        break;
2790
2791                    /* A start tag token not covered by the previous entries */
2792                    default:
2793                        /* Reconstruct the active formatting elements, if any. */
2794                        $this->reconstructActiveFormattingElements();
2795
2796                        $this->insertElement($token, true, true);
2797                        break;
2798                }
2799                break;
2800
2801            case HTML5::ENDTAG:
2802                switch ($token['name']) {
2803                    /* An end tag with the tag name "body" */
2804                    case 'body':
2805                        /* If the second element in the stack of open elements is
2806                        not a body element, this is a parse error. Ignore the token.
2807                        (innerHTML case) */
2808                        if (count($this->stack) < 2 || $this->stack[1]->nodeName !== 'body') {
2809                            // Ignore.
2810
2811                            /* If the current node is not the body element, then this
2812                            is a parse error. */
2813                        } elseif (end($this->stack)->nodeName !== 'body') {
2814                            // Parse error.
2815                        }
2816
2817                        /* Change the insertion mode to "after body". */
2818                        $this->mode = self::AFTER_BODY;
2819                        break;
2820
2821                    /* An end tag with the tag name "html" */
2822                    case 'html':
2823                        /* Act as if an end tag with tag name "body" had been seen,
2824                        then, if that token wasn't ignored, reprocess the current
2825                        token. */
2826                        $this->inBody(
2827                            array(
2828                                'name' => 'body',
2829                                'type' => HTML5::ENDTAG
2830                            )
2831                        );
2832
2833                        return $this->afterBody($token);
2834                        break;
2835
2836                    /* An end tag whose tag name is one of: "address", "blockquote",
2837                    "center", "dir", "div", "dl", "fieldset", "listing", "menu",
2838                    "ol", "pre", "ul" */
2839                    case 'address':
2840                    case 'blockquote':
2841                    case 'center':
2842                    case 'dir':
2843                    case 'div':
2844                    case 'dl':
2845                    case 'fieldset':
2846                    case 'listing':
2847                    case 'menu':
2848                    case 'ol':
2849                    case 'pre':
2850                    case 'ul':
2851                        /* If the stack of open elements has an element in scope
2852                        with the same tag name as that of the token, then generate
2853                        implied end tags. */
2854                        if ($this->elementInScope($token['name'])) {
2855                            $this->generateImpliedEndTags();
2856
2857                            /* Now, if the current node is not an element with
2858                            the same tag name as that of the token, then this
2859                            is a parse error. */
2860                            // w/e
2861
2862                            /* If the stack of open elements has an element in
2863                            scope with the same tag name as that of the token,
2864                            then pop elements from this stack until an element
2865                            with that tag name has been popped from the stack. */
2866                            for ($n = count($this->stack) - 1; $n >= 0; $n--) {
2867                                if ($this->stack[$n]->nodeName === $token['name']) {
2868                                    $n = -1;
2869                                }
2870
2871                                array_pop($this->stack);
2872                            }
2873                        }
2874                        break;
2875
2876                    /* An end tag whose tag name is "form" */
2877                    case 'form':
                        /* If the stack of open elements has an element in scope
                        with the same tag name as that of the token, then generate
                        implied end tags. */
2881                        if ($this->elementInScope($token['name'])) {
2882                            $this->generateImpliedEndTags();
2883
2884                        }
2885
2886                        if (end($this->stack)->nodeName !== $token['name']) {
2887                            /* Now, if the current node is not an element with the
2888                            same tag name as that of the token, then this is a parse
2889                            error. */
2890                            // w/e
2891
2892                        } else {
2893                            /* Otherwise, if the current node is an element with
2894                            the same tag name as that of the token pop that element
2895                            from the stack. */
2896                            array_pop($this->stack);
2897                        }
2898
2899                        /* In any case, set the form element pointer to null. */
2900                        $this->form_pointer = null;
2901                        break;
2902
2903                    /* An end tag whose tag name is "p" */
2904                    case 'p':
2905                        /* If the stack of open elements has a p element in scope,
2906                        then generate implied end tags, except for p elements. */
2907                        if ($this->elementInScope('p')) {
2908                            $this->generateImpliedEndTags(array('p'));
2909
2910                            /* If the current node is not a p element, then this is
2911                            a parse error. */
2912                            // k
2913
2914                            /* If the stack of open elements has a p element in
2915                            scope, then pop elements from this stack until the stack
2916                            no longer has a p element in scope. */
2917                            for ($n = count($this->stack) - 1; $n >= 0; $n--) {
2918                                if ($this->elementInScope('p')) {
2919                                    array_pop($this->stack);
2920
2921                                } else {
2922                                    break;
2923                                }
2924                            }
2925                        }
2926                        break;
2927
2928                    /* An end tag whose tag name is "dd", "dt", or "li" */
2929                    case 'dd':
2930                    case 'dt':
2931                    case 'li':
2932                        /* If the stack of open elements has an element in scope
2933                        whose tag name matches the tag name of the token, then
2934                        generate implied end tags, except for elements with the
2935                        same tag name as the token. */
2936                        if ($this->elementInScope($token['name'])) {
2937                            $this->generateImpliedEndTags(array($token['name']));
2938
2939                            /* If the current node is not an element with the same
2940                            tag name as the token, then this is a parse error. */
2941                            // w/e
2942
2943                            /* If the stack of open elements has an element in scope
2944                            whose tag name matches the tag name of the token, then
2945                            pop elements from this stack until an element with that
2946                            tag name has been popped from the stack. */
2947                            for ($n = count($this->stack) - 1; $n >= 0; $n--) {
2948                                if ($this->stack[$n]->nodeName === $token['name']) {
2949                                    $n = -1;
2950                                }
2951
2952                                array_pop($this->stack);
2953                            }
2954                        }
2955                        break;
2956
2957                    /* An end tag whose tag name is one of: "h1", "h2", "h3", "h4",
2958                    "h5", "h6" */
2959                    case 'h1':
2960                    case 'h2':
2961                    case 'h3':
2962                    case 'h4':
2963                    case 'h5':
2964                    case 'h6':
2965                        $elements = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6');
2966
2967                        /* If the stack of open elements has in scope an element whose
2968                        tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then
2969                        generate implied end tags. */
2970                        if ($this->elementInScope($elements)) {
2971                            $this->generateImpliedEndTags();
2972
2973                            /* Now, if the current node is not an element with the same
2974                            tag name as that of the token, then this is a parse error. */
2975                            // w/e
2976
2977                            /* If the stack of open elements has in scope an element
2978                            whose tag name is one of "h1", "h2", "h3", "h4", "h5", or
2979                            "h6", then pop elements from the stack until an element
2980                            with one of those tag names has been popped from the stack. */
2981                            while ($this->elementInScope($elements)) {
2982                                array_pop($this->stack);
2983                            }
2984                        }
2985                        break;
2986
2987                    /* An end tag whose tag name is one of: "a", "b", "big", "em",
2988                    "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
2989                    case 'a':
2990                    case 'b':
2991                    case 'big':
2992                    case 'em':
2993                    case 'font':
2994                    case 'i':
2995                    case 'nobr':
2996                    case 's':
2997                    case 'small':
2998                    case 'strike':
2999                    case 'strong':
3000                    case 'tt':
3001                    case 'u':
3002                        /* 1. Let the formatting element be the last element in
3003                        the list of active formatting elements that:
3004                            * is between the end of the list and the last scope
3005                            marker in the list, if any, or the start of the list
3006                            otherwise, and
3007                            * has the same tag name as the token.
3008                        */
3009                        while (true) {
3010                            for ($a = count($this->a_formatting) - 1; $a >= 0; $a--) {
3011                                if ($this->a_formatting[$a] === self::MARKER) {
3012                                    break;
3013
3014                                } elseif ($this->a_formatting[$a]->tagName === $token['name']) {
3015                                    $formatting_element = $this->a_formatting[$a];
3016                                    $in_stack = in_array($formatting_element, $this->stack, true);
3017                                    $fe_af_pos = $a;
3018                                    break;
3019                                }
3020                            }
3021
3022                            /* If there is no such node, or, if that node is
3023                            also in the stack of open elements but the element
3024                            is not in scope, then this is a parse error. Abort
3025                            these steps. The token is ignored. */
3026                            if (!isset($formatting_element) || ($in_stack &&
3027                                    !$this->elementInScope($token['name']))
3028                            ) {
3029                                break;
3030
3031                                /* Otherwise, if there is such a node, but that node
3032                                is not in the stack of open elements, then this is a
3033                                parse error; remove the element from the list, and
3034                                abort these steps. */
3035                            } elseif (isset($formatting_element) && !$in_stack) {
3036                                unset($this->a_formatting[$fe_af_pos]);
3037                                $this->a_formatting = array_merge($this->a_formatting);
3038                                break;
3039                            }
3040
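                            /* 2. Let the furthest block be the topmost node in the
                            stack of open elements that is lower in the stack
                            than the formatting element, and is not an element in
                            the phrasing or formatting categories. There might
                            not be one.
                            NOTE(review): the loop below does not stop at the first
                            match, so it ends up holding the last (lowest) matching
                            node rather than the first one after the formatting
                            element -- confirm this matches "topmost". */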
3046                            $fe_s_pos = array_search($formatting_element, $this->stack, true);
3047                            $length = count($this->stack);
3048
3049                            for ($s = $fe_s_pos + 1; $s < $length; $s++) {
3050                                $category = $this->getElementCategory($this->stack[$s]->nodeName);
3051
3052                                if ($category !== self::PHRASING && $category !== self::FORMATTING) {
3053                                    $furthest_block = $this->stack[$s];
3054                                }
3055                            }
3056
3057                            /* 3. If there is no furthest block, then the UA must
3058                            skip the subsequent steps and instead just pop all
3059                            the nodes from the bottom of the stack of open
3060                            elements, from the current node up to the formatting
3061                            element, and remove the formatting element from the
3062                            list of active formatting elements. */
3063                            if (!isset($furthest_block)) {
3064                                for ($n = $length - 1; $n >= $fe_s_pos; $n--) {
3065                                    array_pop($this->stack);
3066                                }
3067
3068                                unset($this->a_formatting[$fe_af_pos]);
3069                                $this->a_formatting = array_merge($this->a_formatting);
3070                                break;
3071                            }
3072
3073                            /* 4. Let the common ancestor be the element
3074                            immediately above the formatting element in the stack
3075                            of open elements. */
3076                            $common_ancestor = $this->stack[$fe_s_pos - 1];
3077
3078                            /* 5. If the furthest block has a parent node, then
3079                            remove the furthest block from its parent node. */
3080                            if ($furthest_block->parentNode !== null) {
3081                                $furthest_block->parentNode->removeChild($furthest_block);
3082                            }
3083
3084                            /* 6. Let a bookmark note the position of the
3085                            formatting element in the list of active formatting
3086                            elements relative to the elements on either side
3087                            of it in the list. */
3088                            $bookmark = $fe_af_pos;
3089
                            /* 7. Let node and last node be the furthest block.
                            Follow these steps: */
3092                            $node = $furthest_block;
3093                            $last_node = $furthest_block;
3094
3095                            while (true) {
3096                                for ($n = array_search($node, $this->stack, true) - 1; $n >= 0; $n--) {
3097                                    /* 7.1 Let node be the element immediately
3098                                    prior to node in the stack of open elements. */
3099                                    $node = $this->stack[$n];
3100
3101                                    /* 7.2 If node is not in the list of active
3102                                    formatting elements, then remove node from
3103                                    the stack of open elements and then go back
3104                                    to step 1. */
3105                                    if (!in_array($node, $this->a_formatting, true)) {
3106                                        unset($this->stack[$n]);
3107                                        $this->stack = array_merge($this->stack);
3108
3109                                    } else {
3110                                        break;
3111                                    }
3112                                }
3113
3114                                /* 7.3 Otherwise, if node is the formatting
3115                                element, then go to the next step in the overall
3116                                algorithm. */
3117                                if ($node === $formatting_element) {
3118                                    break;
3119
3120                                    /* 7.4 Otherwise, if last node is the furthest
3121                                    block, then move the aforementioned bookmark to
3122                                    be immediately after the node in the list of
3123                                    active formatting elements. */
3124                                } elseif ($last_node === $furthest_block) {
3125                                    $bookmark = array_search($node, $this->a_formatting, true) + 1;
3126                                }
3127
3128                                /* 7.5 If node has any children, perform a
3129                                shallow clone of node, replace the entry for
3130                                node in the list of active formatting elements
3131                                with an entry for the clone, replace the entry
3132                                for node in the stack of open elements with an
3133                                entry for the clone, and let node be the clone. */
3134                                if ($node->hasChildNodes()) {
3135                                    $clone = $node->cloneNode();
3136                                    $s_pos = array_search($node, $this->stack, true);
3137                                    $a_pos = array_search($node, $this->a_formatting, true);
3138
3139                                    $this->stack[$s_pos] = $clone;
3140                                    $this->a_formatting[$a_pos] = $clone;
3141                                    $node = $clone;
3142                                }
3143
3144                                /* 7.6 Insert last node into node, first removing
3145                                it from its previous parent node if any. */
3146                                if ($last_node->parentNode !== null) {
3147                                    $last_node->parentNode->removeChild($last_node);
3148                                }
3149
3150                                $node->appendChild($last_node);
3151
3152                                /* 7.7 Let last node be node. */
3153                                $last_node = $node;
3154                            }
3155
3156                            /* 8. Insert whatever last node ended up being in
3157                            the previous step into the common ancestor node,
3158                            first removing it from its previous parent node if
3159                            any. */
3160                            if ($last_node->parentNode !== null) {
3161                                $last_node->parentNode->removeChild($last_node);
3162                            }
3163
3164                            $common_ancestor->appendChild($last_node);
3165
3166                            /* 9. Perform a shallow clone of the formatting
3167                            element. */
3168                            $clone = $formatting_element->cloneNode();
3169
3170                            /* 10. Take all of the child nodes of the furthest
3171                            block and append them to the clone created in the
3172                            last step. */
3173                            while ($furthest_block->hasChildNodes()) {
3174                                $child = $furthest_block->firstChild;
3175                                $furthest_block->removeChild($child);
3176                                $clone->appendChild($child);
3177                            }
3178
3179                            /* 11. Append that clone to the furthest block. */
3180                            $furthest_block->appendChild($clone);
3181
3182                            /* 12. Remove the formatting element from the list
3183                            of active formatting elements, and insert the clone
3184                            into the list of active formatting elements at the
3185                            position of the aforementioned bookmark. */
3186                            $fe_af_pos = array_search($formatting_element, $this->a_formatting, true);
3187                            unset($this->a_formatting[$fe_af_pos]);
3188                            $this->a_formatting = array_merge($this->a_formatting);
3189
3190                            $af_part1 = array_slice($this->a_formatting, 0, $bookmark - 1);
3191                            $af_part2 = array_slice($this->a_formatting, $bookmark, count($this->a_formatting));
3192                            $this->a_formatting = array_merge($af_part1, array($clone), $af_part2);
3193
3194                            /* 13. Remove the formatting element from the stack
3195                            of open elements, and insert the clone into the stack
3196                            of open elements immediately after (i.e. in a more
3197                            deeply nested position than) the position of the
3198                            furthest block in that stack. */
3199                            $fe_s_pos = array_search($formatting_element, $this->stack, true);
3200                            $fb_s_pos = array_search($furthest_block, $this->stack, true);
3201                            unset($this->stack[$fe_s_pos]);
3202
3203                            $s_part1 = array_slice($this->stack, 0, $fb_s_pos);
3204                            $s_part2 = array_slice($this->stack, $fb_s_pos + 1, count($this->stack));
3205                            $this->stack = array_merge($s_part1, array($clone), $s_part2);
3206
3207                            /* 14. Jump back to step 1 in this series of steps. */
3208                            unset($formatting_element, $fe_af_pos, $fe_s_pos, $furthest_block);
3209                        }
3210                        break;
3211
3212                    /* An end tag token whose tag name is one of: "button",
3213                    "marquee", "object" */
3214                    case 'button':
3215                    case 'marquee':
3216                    case 'object':
3217                        /* If the stack of open elements has an element in scope whose
3218                        tag name matches the tag name of the token, then generate implied
3219                        tags. */
3220                        if ($this->elementInScope($token['name'])) {
3221                            $this->generateImpliedEndTags();
3222
3223                            /* Now, if the current node is not an element with the same
3224                            tag name as the token, then this is a parse error. */
3225                            // k
3226
3227                            /* Now, if the stack of open elements has an element in scope
3228                            whose tag name matches the tag name of the token, then pop
3229                            elements from the stack until that element has been popped from
3230                            the stack, and clear the list of active formatting elements up
3231                            to the last marker. */
3232                            for ($n = count($this->stack) - 1; $n >= 0; $n--) {
3233                                if ($this->stack[$n]->nodeName === $token['name']) {
3234                                    $n = -1;
3235                                }
3236
3237                                array_pop($this->stack);
3238                            }
3239
3240                            $marker = end(array_keys($this->a_formatting, self::MARKER, true));
3241
3242                            for ($n = count($this->a_formatting) - 1; $n > $marker; $n--) {
3243                                array_pop($this->a_formatting);
3244                            }
3245                        }
3246                        break;
3247
3248                    /* Or an end tag whose tag name is one of: "area", "basefont",
3249                    "bgsound", "br", "embed", "hr", "iframe", "image", "img",
3250                    "input", "isindex", "noembed", "noframes", "param", "select",
3251                    "spacer", "table", "textarea", "wbr" */
3252                    case 'area':
3253                    case 'basefont':
3254                    case 'bgsound':
3255                    case 'br':
3256                    case 'embed':
3257                    case 'hr':
3258                    case 'iframe':
3259                    case 'image':
3260                    case 'img':
3261                    case 'input':
3262                    case 'isindex':
3263                    case 'noembed':
3264                    case 'noframes':
3265                    case 'param':
3266                    case 'select':
3267                    case 'spacer':
3268                    case 'table':
3269                    case 'textarea':
3270                    case 'wbr':
3271                        // Parse error. Ignore the token.
3272                        break;
3273
3274                    /* An end tag token not covered by the previous entries */
3275                    default:
3276                        for ($n = count($this->stack) - 1; $n >= 0; $n--) {
3277                            /* Initialise node to be the current node (the bottommost
3278                            node of the stack). */
3279                            $node = end($this->stack);
3280
3281                            /* If node has the same tag name as the end tag token,
3282                            then: */
3283                            if ($token['name'] === $node->nodeName) {
3284                                /* Generate implied end tags. */
3285                                $this->generateImpliedEndTags();
3286
3287                                /* If the tag name of the end tag token does not
3288                                match the tag name of the current node, this is a
3289                                parse error. */
3290                                // k
3291
3292                                /* Pop all the nodes from the current node up to
3293                                node, including node, then stop this algorithm. */
3294                                for ($x = count($this->stack) - $n; $x >= $n; $x--) {
3295                                    array_pop($this->stack);
3296                                }
3297
3298                            } else {
3299                                $category = $this->getElementCategory($node);
3300
3301                                if ($category !== self::SPECIAL && $category !== self::SCOPING) {
3302                                    /* Otherwise, if node is in neither the formatting
3303                                    category nor the phrasing category, then this is a
3304                                    parse error. Stop this algorithm. The end tag token
3305                                    is ignored. */
3306                                    return false;
3307                                }
3308                            }
3309                        }
3310                        break;
3311                }
3312                break;
3313        }
3314    }
3315
3316    private function inTable($token)
3317    {
3318        $clear = array('html', 'table');
3319
3320        /* A character token that is one of one of U+0009 CHARACTER TABULATION,
3321        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
3322        or U+0020 SPACE */
3323        if ($token['type'] === HTML5::CHARACTR &&
3324            preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
3325        ) {
3326            /* Append the character to the current node. */
3327            $text = $this->dom->createTextNode($token['data']);
3328            end($this->stack)->appendChild($text);
3329
3330            /* A comment token */
3331        } elseif ($token['type'] === HTML5::COMMENT) {
3332            /* Append a Comment node to the current node with the data
3333            attribute set to the data given in the comment token. */
3334            $comment = $this->dom->createComment($token['data']);
3335            end($this->stack)->appendChild($comment);
3336
3337            /* A start tag whose tag name is "caption" */
3338        } elseif ($token['type'] === HTML5::STARTTAG &&
3339            $token['name'] === 'caption'
3340        ) {
3341            /* Clear the stack back to a table context. */
3342            $this->clearStackToTableContext($clear);
3343
3344            /* Insert a marker at the end of the list of active
3345            formatting elements. */
3346            $this->a_formatting[] = self::MARKER;
3347
3348            /* Insert an HTML element for the token, then switch the
3349            insertion mode to "in caption". */
3350            $this->insertElement($token);
3351            $this->mode = self::IN_CAPTION;
3352
3353            /* A start tag whose tag name is "colgroup" */
3354        } elseif ($token['type'] === HTML5::STARTTAG &&
3355            $token['name'] === 'colgroup'
3356        ) {
3357            /* Clear the stack back to a table context. */
3358            $this->clearStackToTableContext($clear);
3359
3360            /* Insert an HTML element for the token, then switch the
3361            insertion mode to "in column group". */
3362            $this->insertElement($token);
3363            $this->mode = self::IN_CGROUP;
3364
3365            /* A start tag whose tag name is "col" */
3366        } elseif ($token['type'] === HTML5::STARTTAG &&
3367            $token['name'] === 'col'
3368        ) {
3369            $this->inTable(
3370                array(
3371                    'name' => 'colgroup',
3372                    'type' => HTML5::STARTTAG,
3373                    'attr' => array()
3374                )
3375            );
3376
3377            $this->inColumnGroup($token);
3378
3379            /* A start tag whose tag name is one of: "tbody", "tfoot", "thead" */
3380        } elseif ($token['type'] === HTML5::STARTTAG && in_array(
3381                $token['name'],
3382                array('tbody', 'tfoot', 'thead')
3383            )
3384        ) {
3385            /* Clear the stack back to a table context. */
3386            $this->clearStackToTableContext($clear);
3387
3388            /* Insert an HTML element for the token, then switch the insertion
3389            mode to "in table body". */
3390            $this->insertElement($token);
3391            $this->mode = self::IN_TBODY;
3392
3393            /* A start tag whose tag name is one of: "td", "th", "tr" */
3394        } elseif ($token['type'] === HTML5::STARTTAG &&
3395            in_array($token['name'], array('td', 'th', 'tr'))
3396        ) {
3397            /* Act as if a start tag token with the tag name "tbody" had been
3398            seen, then reprocess the current token. */
3399            $this->inTable(
3400                array(
3401                    'name' => 'tbody',
3402                    'type' => HTML5::STARTTAG,
3403                    'attr' => array()
3404                )
3405            );
3406
3407            return $this->inTableBody($token);
3408
3409            /* A start tag whose tag name is "table" */
3410        } elseif ($token['type'] === HTML5::STARTTAG &&
3411            $token['name'] === 'table'
3412        ) {
3413            /* Parse error. Act as if an end tag token with the tag name "table"
3414            had been seen, then, if that token wasn't ignored, reprocess the
3415            current token. */
3416            $this->inTable(
3417                array(
3418                    'name' => 'table',
3419                    'type' => HTML5::ENDTAG
3420                )
3421            );
3422
3423            return $this->mainPhase($token);
3424
3425            /* An end tag whose tag name is "table" */
3426        } elseif ($token['type'] === HTML5::ENDTAG &&
3427            $token['name'] === 'table'
3428        ) {
3429            /* If the stack of open elements does not have an element in table
3430            scope with the same tag name as the token, this is a parse error.
3431            Ignore the token. (innerHTML case) */
3432            if (!$this->elementInScope($token['name'], true)) {
3433                return false;
3434
3435                /* Otherwise: */
3436            } else {
3437                /* Generate implied end tags. */
3438                $this->generateImpliedEndTags();
3439
3440                /* Now, if the current node is not a table element, then this
3441                is a parse error. */
3442                // w/e
3443
3444                /* Pop elements from this stack until a table element has been
3445                popped from the stack. */
3446                while (true) {
3447                    $current = end($this->stack)->nodeName;
3448                    array_pop($this->stack);
3449
3450                    if ($current === 'table') {
3451                        break;
3452                    }
3453                }
3454
3455                /* Reset the insertion mode appropriately. */
3456                $this->resetInsertionMode();
3457            }
3458
3459            /* An end tag whose tag name is one of: "body", "caption", "col",
3460            "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
3461        } elseif ($token['type'] === HTML5::ENDTAG && in_array(
3462                $token['name'],
3463                array(
3464                    'body',
3465                    'caption',
3466                    'col',
3467                    'colgroup',
3468                    'html',
3469                    'tbody',
3470                    'td',
3471                    'tfoot',
3472                    'th',
3473                    'thead',
3474                    'tr'
3475                )
3476            )
3477        ) {
3478            // Parse error. Ignore the token.
3479
3480            /* Anything else */
3481        } else {
3482            /* Parse error. Process the token as if the insertion mode was "in
3483            body", with the following exception: */
3484
3485            /* If the current node is a table, tbody, tfoot, thead, or tr
3486            element, then, whenever a node would be inserted into the current
3487            node, it must instead be inserted into the foster parent element. */
3488            if (in_array(
3489                end($this->stack)->nodeName,
3490                array('table', 'tbody', 'tfoot', 'thead', 'tr')
3491            )
3492            ) {
3493                /* The foster parent element is the parent element of the last
3494                table element in the stack of open elements, if there is a
3495                table element and it has such a parent element. If there is no
3496                table element in the stack of open elements (innerHTML case),
3497                then the foster parent element is the first element in the
3498                stack of open elements (the html  element). Otherwise, if there
3499                is a table element in the stack of open elements, but the last
3500                table element in the stack of open elements has no parent, or
3501                its parent node is not an element, then the foster parent
3502                element is the element before the last table element in the
3503                stack of open elements. */
3504                for ($n = count($this->stack) - 1; $n >= 0; $n--) {
3505                    if ($this->stack[$n]->nodeName === 'table') {
3506                        $table = $this->stack[$n];
3507                        break;
3508                    }
3509                }
3510
3511                if (isset($table) && $table->parentNode !== null) {
3512                    $this->foster_parent = $table->parentNode;
3513
3514                } elseif (!isset($table)) {
3515                    $this->foster_parent = $this->stack[0];
3516
3517                } elseif (isset($table) && ($table->parentNode === null ||
3518                        $table->parentNode->nodeType !== XML_ELEMENT_NODE)
3519                ) {
3520                    $this->foster_parent = $this->stack[$n - 1];
3521                }
3522            }
3523
3524            $this->inBody($token);
3525        }
3526    }
3527
3528    private function inCaption($token)
3529    {
3530        /* An end tag whose tag name is "caption" */
3531        if ($token['type'] === HTML5::ENDTAG && $token['name'] === 'caption') {
3532            /* If the stack of open elements does not have an element in table
3533            scope with the same tag name as the token, this is a parse error.
3534            Ignore the token. (innerHTML case) */
3535            if (!$this->elementInScope($token['name'], true)) {
3536                // Ignore
3537
3538                /* Otherwise: */
3539            } else {
3540                /* Generate implied end tags. */
3541                $this->generateImpliedEndTags();
3542
3543                /* Now, if the current node is not a caption element, then this
3544                is a parse error. */
3545                // w/e
3546
3547                /* Pop elements from this stack until a caption element has
3548                been popped from the stack. */
3549                while (true) {
3550                    $node = end($this->stack)->nodeName;
3551                    array_pop($this->stack);
3552
3553                    if ($node === 'caption') {
3554                        break;
3555                    }
3556                }
3557
3558                /* Clear the list of active formatting elements up to the last
3559                marker. */
3560                $this->clearTheActiveFormattingElementsUpToTheLastMarker();
3561
3562                /* Switch the insertion mode to "in table". */
3563                $this->mode = self::IN_TABLE;
3564            }
3565
3566            /* A start tag whose tag name is one of: "caption", "col", "colgroup",
3567            "tbody", "td", "tfoot", "th", "thead", "tr", or an end tag whose tag
3568            name is "table" */
3569        } elseif (($token['type'] === HTML5::STARTTAG && in_array(
3570                    $token['name'],
3571                    array(
3572                        'caption',
3573                        'col',
3574                        'colgroup',
3575                        'tbody',
3576                        'td',
3577                        'tfoot',
3578                        'th',
3579                        'thead',
3580                        'tr'
3581                    )
3582                )) || ($token['type'] === HTML5::ENDTAG &&
3583                $token['name'] === 'table')
3584        ) {
3585            /* Parse error. Act as if an end tag with the tag name "caption"
3586            had been seen, then, if that token wasn't ignored, reprocess the
3587            current token. */
3588            $this->inCaption(
3589                array(
3590                    'name' => 'caption',
3591                    'type' => HTML5::ENDTAG
3592                )
3593            );
3594
3595            return $this->inTable($token);
3596
3597            /* An end tag whose tag name is one of: "body", "col", "colgroup",
3598            "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
3599        } elseif ($token['type'] === HTML5::ENDTAG && in_array(
3600                $token['name'],
3601                array(
3602                    'body',
3603                    'col',
3604                    'colgroup',
3605                    'html',
3606                    'tbody',
3607                    'tfoot',
3608                    'th',
3609                    'thead',
3610                    'tr'
3611                )
3612            )
3613        ) {
3614            // Parse error. Ignore the token.
3615
3616            /* Anything else */
3617        } else {
3618            /* Process the token as if the insertion mode was "in body". */
3619            $this->inBody($token);
3620        }
3621    }
3622
3623    private function inColumnGroup($token)
3624    {
3625        /* A character token that is one of one of U+0009 CHARACTER TABULATION,
3626        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
3627        or U+0020 SPACE */
3628        if ($token['type'] === HTML5::CHARACTR &&
3629            preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
3630        ) {
3631            /* Append the character to the current node. */
3632            $text = $this->dom->createTextNode($token['data']);
3633            end($this->stack)->appendChild($text);
3634
3635            /* A comment token */
3636        } elseif ($token['type'] === HTML5::COMMENT) {
3637            /* Append a Comment node to the current node with the data
3638            attribute set to the data given in the comment token. */
3639            $comment = $this->dom->createComment($token['data']);
3640            end($this->stack)->appendChild($comment);
3641
3642            /* A start tag whose tag name is "col" */
3643        } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'col') {
3644            /* Insert a col element for the token. Immediately pop the current
3645            node off the stack of open elements. */
3646            $this->insertElement($token);
3647            array_pop($this->stack);
3648
3649            /* An end tag whose tag name is "colgroup" */
3650        } elseif ($token['type'] === HTML5::ENDTAG &&
3651            $token['name'] === 'colgroup'
3652        ) {
3653            /* If the current node is the root html element, then this is a
3654            parse error, ignore the token. (innerHTML case) */
3655            if (end($this->stack)->nodeName === 'html') {
3656                // Ignore
3657
3658                /* Otherwise, pop the current node (which will be a colgroup
3659                element) from the stack of open elements. Switch the insertion
3660                mode to "in table". */
3661            } else {
3662                array_pop($this->stack);
3663                $this->mode = self::IN_TABLE;
3664            }
3665
3666            /* An end tag whose tag name is "col" */
3667        } elseif ($token['type'] === HTML5::ENDTAG && $token['name'] === 'col') {
3668            /* Parse error. Ignore the token. */
3669
3670            /* Anything else */
3671        } else {
3672            /* Act as if an end tag with the tag name "colgroup" had been seen,
3673            and then, if that token wasn't ignored, reprocess the current token. */
3674            $this->inColumnGroup(
3675                array(
3676                    'name' => 'colgroup',
3677                    'type' => HTML5::ENDTAG
3678                )
3679            );
3680
3681            return $this->inTable($token);
3682        }
3683    }
3684
3685    private function inTableBody($token)
3686    {
3687        $clear = array('tbody', 'tfoot', 'thead', 'html');
3688
3689        /* A start tag whose tag name is "tr" */
3690        if ($token['type'] === HTML5::STARTTAG && $token['name'] === 'tr') {
3691            /* Clear the stack back to a table body context. */
3692            $this->clearStackToTableContext($clear);
3693
3694            /* Insert a tr element for the token, then switch the insertion
3695            mode to "in row". */
3696            $this->insertElement($token);
3697            $this->mode = self::IN_ROW;
3698
3699            /* A start tag whose tag name is one of: "th", "td" */
3700        } elseif ($token['type'] === HTML5::STARTTAG &&
3701            ($token['name'] === 'th' || $token['name'] === 'td')
3702        ) {
3703            /* Parse error. Act as if a start tag with the tag name "tr" had
3704            been seen, then reprocess the current token. */
3705            $this->inTableBody(
3706                array(
3707                    'name' => 'tr',
3708                    'type' => HTML5::STARTTAG,
3709                    'attr' => array()
3710                )
3711            );
3712
3713            return $this->inRow($token);
3714
3715            /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
3716        } elseif ($token['type'] === HTML5::ENDTAG &&
3717            in_array($token['name'], array('tbody', 'tfoot', 'thead'))
3718        ) {
3719            /* If the stack of open elements does not have an element in table
3720            scope with the same tag name as the token, this is a parse error.
3721            Ignore the token. */
3722            if (!$this->elementInScope($token['name'], true)) {
3723                // Ignore
3724
3725                /* Otherwise: */
3726            } else {
3727                /* Clear the stack back to a table body context. */
3728                $this->clearStackToTableContext($clear);
3729
3730                /* Pop the current node from the stack of open elements. Switch
3731                the insertion mode to "in table". */
3732                array_pop($this->stack);
3733                $this->mode = self::IN_TABLE;
3734            }
3735
3736            /* A start tag whose tag name is one of: "caption", "col", "colgroup",
3737            "tbody", "tfoot", "thead", or an end tag whose tag name is "table" */
3738        } elseif (($token['type'] === HTML5::STARTTAG && in_array(
3739                    $token['name'],
3740                    array('caption', 'col', 'colgroup', 'tbody', 'tfoor', 'thead')
3741                )) ||
3742            ($token['type'] === HTML5::STARTTAG && $token['name'] === 'table')
3743        ) {
3744            /* If the stack of open elements does not have a tbody, thead, or
3745            tfoot element in table scope, this is a parse error. Ignore the
3746            token. (innerHTML case) */
3747            if (!$this->elementInScope(array('tbody', 'thead', 'tfoot'), true)) {
3748                // Ignore.
3749
3750                /* Otherwise: */
3751            } else {
3752                /* Clear the stack back to a table body context. */
3753                $this->clearStackToTableContext($clear);
3754
3755                /* Act as if an end tag with the same tag name as the current
3756                node ("tbody", "tfoot", or "thead") had been seen, then
3757                reprocess the current token. */
3758                $this->inTableBody(
3759                    array(
3760                        'name' => end($this->stack)->nodeName,
3761                        'type' => HTML5::ENDTAG
3762                    )
3763                );
3764
3765                return $this->mainPhase($token);
3766            }
3767
3768            /* An end tag whose tag name is one of: "body", "caption", "col",
3769            "colgroup", "html", "td", "th", "tr" */
3770        } elseif ($token['type'] === HTML5::ENDTAG && in_array(
3771                $token['name'],
3772                array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr')
3773            )
3774        ) {
3775            /* Parse error. Ignore the token. */
3776
3777            /* Anything else */
3778        } else {
3779            /* Process the token as if the insertion mode was "in table". */
3780            $this->inTable($token);
3781        }
3782    }
3783
3784    private function inRow($token)
3785    {
3786        $clear = array('tr', 'html');
3787
3788        /* A start tag whose tag name is one of: "th", "td" */
3789        if ($token['type'] === HTML5::STARTTAG &&
3790            ($token['name'] === 'th' || $token['name'] === 'td')
3791        ) {
3792            /* Clear the stack back to a table row context. */
3793            $this->clearStackToTableContext($clear);
3794
3795            /* Insert an HTML element for the token, then switch the insertion
3796            mode to "in cell". */
3797            $this->insertElement($token);
3798            $this->mode = self::IN_CELL;
3799
3800            /* Insert a marker at the end of the list of active formatting
3801            elements. */
3802            $this->a_formatting[] = self::MARKER;
3803
3804            /* An end tag whose tag name is "tr" */
3805        } elseif ($token['type'] === HTML5::ENDTAG && $token['name'] === 'tr') {
3806            /* If the stack of open elements does not have an element in table
3807            scope with the same tag name as the token, this is a parse error.
3808            Ignore the token. (innerHTML case) */
3809            if (!$this->elementInScope($token['name'], true)) {
3810                // Ignore.
3811
3812                /* Otherwise: */
3813            } else {
3814                /* Clear the stack back to a table row context. */
3815                $this->clearStackToTableContext($clear);
3816
3817                /* Pop the current node (which will be a tr element) from the
3818                stack of open elements. Switch the insertion mode to "in table
3819                body". */
3820                array_pop($this->stack);
3821                $this->mode = self::IN_TBODY;
3822            }
3823
3824            /* A start tag whose tag name is one of: "caption", "col", "colgroup",
3825            "tbody", "tfoot", "thead", "tr" or an end tag whose tag name is "table" */
3826        } elseif ($token['type'] === HTML5::STARTTAG && in_array(
3827                $token['name'],
3828                array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr')
3829            )
3830        ) {
3831            /* Act as if an end tag with the tag name "tr" had been seen, then,
3832            if that token wasn't ignored, reprocess the current token. */
3833            $this->inRow(
3834                array(
3835                    'name' => 'tr',
3836                    'type' => HTML5::ENDTAG
3837                )
3838            );
3839
3840            return $this->inCell($token);
3841
3842            /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
3843        } elseif ($token['type'] === HTML5::ENDTAG &&
3844            in_array($token['name'], array('tbody', 'tfoot', 'thead'))
3845        ) {
3846            /* If the stack of open elements does not have an element in table
3847            scope with the same tag name as the token, this is a parse error.
3848            Ignore the token. */
3849            if (!$this->elementInScope($token['name'], true)) {
3850                // Ignore.
3851
3852                /* Otherwise: */
3853            } else {
3854                /* Otherwise, act as if an end tag with the tag name "tr" had
3855                been seen, then reprocess the current token. */
3856                $this->inRow(
3857                    array(
3858                        'name' => 'tr',
3859                        'type' => HTML5::ENDTAG
3860                    )
3861                );
3862
3863                return $this->inCell($token);
3864            }
3865
3866            /* An end tag whose tag name is one of: "body", "caption", "col",
3867            "colgroup", "html", "td", "th" */
3868        } elseif ($token['type'] === HTML5::ENDTAG && in_array(
3869                $token['name'],
3870                array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr')
3871            )
3872        ) {
3873            /* Parse error. Ignore the token. */
3874
3875            /* Anything else */
3876        } else {
3877            /* Process the token as if the insertion mode was "in table". */
3878            $this->inTable($token);
3879        }
3880    }
3881
3882    private function inCell($token)
3883    {
3884        /* An end tag whose tag name is one of: "td", "th" */
3885        if ($token['type'] === HTML5::ENDTAG &&
3886            ($token['name'] === 'td' || $token['name'] === 'th')
3887        ) {
3888            /* If the stack of open elements does not have an element in table
3889            scope with the same tag name as that of the token, then this is a
3890            parse error and the token must be ignored. */
3891            if (!$this->elementInScope($token['name'], true)) {
3892                // Ignore.
3893
3894                /* Otherwise: */
3895            } else {
3896                /* Generate implied end tags, except for elements with the same
3897                tag name as the token. */
3898                $this->generateImpliedEndTags(array($token['name']));
3899
3900                /* Now, if the current node is not an element with the same tag
3901                name as the token, then this is a parse error. */
3902                // k
3903
3904                /* Pop elements from this stack until an element with the same
3905                tag name as the token has been popped from the stack. */
3906                while (true) {
3907                    $node = end($this->stack)->nodeName;
3908                    array_pop($this->stack);
3909
3910                    if ($node === $token['name']) {
3911                        break;
3912                    }
3913                }
3914
3915                /* Clear the list of active formatting elements up to the last
3916                marker. */
3917                $this->clearTheActiveFormattingElementsUpToTheLastMarker();
3918
3919                /* Switch the insertion mode to "in row". (The current node
3920                will be a tr element at this point.) */
3921                $this->mode = self::IN_ROW;
3922            }
3923
3924            /* A start tag whose tag name is one of: "caption", "col", "colgroup",
3925            "tbody", "td", "tfoot", "th", "thead", "tr" */
3926        } elseif ($token['type'] === HTML5::STARTTAG && in_array(
3927                $token['name'],
3928                array(
3929                    'caption',
3930                    'col',
3931                    'colgroup',
3932                    'tbody',
3933                    'td',
3934                    'tfoot',
3935                    'th',
3936                    'thead',
3937                    'tr'
3938                )
3939            )
3940        ) {
3941            /* If the stack of open elements does not have a td or th element
3942            in table scope, then this is a parse error; ignore the token.
3943            (innerHTML case) */
3944            if (!$this->elementInScope(array('td', 'th'), true)) {
3945                // Ignore.
3946
3947                /* Otherwise, close the cell (see below) and reprocess the current
3948                token. */
3949            } else {
3950                $this->closeCell();
3951                return $this->inRow($token);
3952            }
3953
3954            /* A start tag whose tag name is one of: "caption", "col", "colgroup",
3955            "tbody", "td", "tfoot", "th", "thead", "tr" */
3956        } elseif ($token['type'] === HTML5::STARTTAG && in_array(
3957                $token['name'],
3958                array(
3959                    'caption',
3960                    'col',
3961                    'colgroup',
3962                    'tbody',
3963                    'td',
3964                    'tfoot',
3965                    'th',
3966                    'thead',
3967                    'tr'
3968                )
3969            )
3970        ) {
3971            /* If the stack of open elements does not have a td or th element
3972            in table scope, then this is a parse error; ignore the token.
3973            (innerHTML case) */
3974            if (!$this->elementInScope(array('td', 'th'), true)) {
3975                // Ignore.
3976
3977                /* Otherwise, close the cell (see below) and reprocess the current
3978                token. */
3979            } else {
3980                $this->closeCell();
3981                return $this->inRow($token);
3982            }
3983
3984            /* An end tag whose tag name is one of: "body", "caption", "col",
3985            "colgroup", "html" */
3986        } elseif ($token['type'] === HTML5::ENDTAG && in_array(
3987                $token['name'],
3988                array('body', 'caption', 'col', 'colgroup', 'html')
3989            )
3990        ) {
3991            /* Parse error. Ignore the token. */
3992
3993            /* An end tag whose tag name is one of: "table", "tbody", "tfoot",
3994            "thead", "tr" */
3995        } elseif ($token['type'] === HTML5::ENDTAG && in_array(
3996                $token['name'],
3997                array('table', 'tbody', 'tfoot', 'thead', 'tr')
3998            )
3999        ) {
4000            /* If the stack of open elements does not have an element in table
4001            scope with the same tag name as that of the token (which can only
4002            happen for "tbody", "tfoot" and "thead", or, in the innerHTML case),
4003            then this is a parse error and the token must be ignored. */
4004            if (!$this->elementInScope($token['name'], true)) {
4005                // Ignore.
4006
4007                /* Otherwise, close the cell (see below) and reprocess the current
4008                token. */
4009            } else {
4010                $this->closeCell();
4011                return $this->inRow($token);
4012            }
4013
4014            /* Anything else */
4015        } else {
4016            /* Process the token as if the insertion mode was "in body". */
4017            $this->inBody($token);
4018        }
4019    }
4020
4021    private function inSelect($token)
4022    {
4023        /* Handle the token as follows: */
4024
4025        /* A character token */
4026        if ($token['type'] === HTML5::CHARACTR) {
4027            /* Append the token's character to the current node. */
4028            $this->insertText($token['data']);
4029
4030            /* A comment token */
4031        } elseif ($token['type'] === HTML5::COMMENT) {
4032            /* Append a Comment node to the current node with the data
4033            attribute set to the data given in the comment token. */
4034            $this->insertComment($token['data']);
4035
4036            /* A start tag token whose tag name is "option" */
4037        } elseif ($token['type'] === HTML5::STARTTAG &&
4038            $token['name'] === 'option'
4039        ) {
4040            /* If the current node is an option element, act as if an end tag
4041            with the tag name "option" had been seen. */
4042            if (end($this->stack)->nodeName === 'option') {
4043                $this->inSelect(
4044                    array(
4045                        'name' => 'option',
4046                        'type' => HTML5::ENDTAG
4047                    )
4048                );
4049            }
4050
4051            /* Insert an HTML element for the token. */
4052            $this->insertElement($token);
4053
4054            /* A start tag token whose tag name is "optgroup" */
4055        } elseif ($token['type'] === HTML5::STARTTAG &&
4056            $token['name'] === 'optgroup'
4057        ) {
4058            /* If the current node is an option element, act as if an end tag
4059            with the tag name "option" had been seen. */
4060            if (end($this->stack)->nodeName === 'option') {
4061                $this->inSelect(
4062                    array(
4063                        'name' => 'option',
4064                        'type' => HTML5::ENDTAG
4065                    )
4066                );
4067            }
4068
4069            /* If the current node is an optgroup element, act as if an end tag
4070            with the tag name "optgroup" had been seen. */
4071            if (end($this->stack)->nodeName === 'optgroup') {
4072                $this->inSelect(
4073                    array(
4074                        'name' => 'optgroup',
4075                        'type' => HTML5::ENDTAG
4076                    )
4077                );
4078            }
4079
4080            /* Insert an HTML element for the token. */
4081            $this->insertElement($token);
4082
4083            /* An end tag token whose tag name is "optgroup" */
4084        } elseif ($token['type'] === HTML5::ENDTAG &&
4085            $token['name'] === 'optgroup'
4086        ) {
4087            /* First, if the current node is an option element, and the node
4088            immediately before it in the stack of open elements is an optgroup
4089            element, then act as if an end tag with the tag name "option" had
4090            been seen. */
4091            $elements_in_stack = count($this->stack);
4092
4093            if ($this->stack[$elements_in_stack - 1]->nodeName === 'option' &&
4094                $this->stack[$elements_in_stack - 2]->nodeName === 'optgroup'
4095            ) {
4096                $this->inSelect(
4097                    array(
4098                        'name' => 'option',
4099                        'type' => HTML5::ENDTAG
4100                    )
4101                );
4102            }
4103
4104            /* If the current node is an optgroup element, then pop that node
4105            from the stack of open elements. Otherwise, this is a parse error,
4106            ignore the token. */
4107            if ($this->stack[$elements_in_stack - 1] === 'optgroup') {
4108                array_pop($this->stack);
4109            }
4110
4111            /* An end tag token whose tag name is "option" */
4112        } elseif ($token['type'] === HTML5::ENDTAG &&
4113            $token['name'] === 'option'
4114        ) {
4115            /* If the current node is an option element, then pop that node
4116            from the stack of open elements. Otherwise, this is a parse error,
4117            ignore the token. */
4118            if (end($this->stack)->nodeName === 'option') {
4119                array_pop($this->stack);
4120            }
4121
4122            /* An end tag whose tag name is "select" */
4123        } elseif ($token['type'] === HTML5::ENDTAG &&
4124            $token['name'] === 'select'
4125        ) {
4126            /* If the stack of open elements does not have an element in table
4127            scope with the same tag name as the token, this is a parse error.
4128            Ignore the token. (innerHTML case) */
4129            if (!$this->elementInScope($token['name'], true)) {
4130                // w/e
4131
4132                /* Otherwise: */
4133            } else {
4134                /* Pop elements from the stack of open elements until a select
4135                element has been popped from the stack. */
4136                while (true) {
4137                    $current = end($this->stack)->nodeName;
4138                    array_pop($this->stack);
4139
4140                    if ($current === 'select') {
4141                        break;
4142                    }
4143                }
4144
4145                /* Reset the insertion mode appropriately. */
4146                $this->resetInsertionMode();
4147            }
4148
4149            /* A start tag whose tag name is "select" */
4150        } elseif ($token['name'] === 'select' &&
4151            $token['type'] === HTML5::STARTTAG
4152        ) {
4153            /* Parse error. Act as if the token had been an end tag with the
4154            tag name "select" instead. */
4155            $this->inSelect(
4156                array(
4157                    'name' => 'select',
4158                    'type' => HTML5::ENDTAG
4159                )
4160            );
4161
4162            /* An end tag whose tag name is one of: "caption", "table", "tbody",
4163            "tfoot", "thead", "tr", "td", "th" */
4164        } elseif (in_array(
4165                $token['name'],
4166                array(
4167                    'caption',
4168                    'table',
4169                    'tbody',
4170                    'tfoot',
4171                    'thead',
4172                    'tr',
4173                    'td',
4174                    'th'
4175                )
4176            ) && $token['type'] === HTML5::ENDTAG
4177        ) {
4178            /* Parse error. */
4179            // w/e
4180
4181            /* If the stack of open elements has an element in table scope with
4182            the same tag name as that of the token, then act as if an end tag
4183            with the tag name "select" had been seen, and reprocess the token.
4184            Otherwise, ignore the token. */
4185            if ($this->elementInScope($token['name'], true)) {
4186                $this->inSelect(
4187                    array(
4188                        'name' => 'select',
4189                        'type' => HTML5::ENDTAG
4190                    )
4191                );
4192
4193                $this->mainPhase($token);
4194            }
4195
4196            /* Anything else */
4197        } else {
4198            /* Parse error. Ignore the token. */
4199        }
4200    }
4201
4202    private function afterBody($token)
4203    {
4204        /* Handle the token as follows: */
4205
4206        /* A character token that is one of one of U+0009 CHARACTER TABULATION,
4207        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
4208        or U+0020 SPACE */
4209        if ($token['type'] === HTML5::CHARACTR &&
4210            preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
4211        ) {
4212            /* Process the token as it would be processed if the insertion mode
4213            was "in body". */
4214            $this->inBody($token);
4215
4216            /* A comment token */
4217        } elseif ($token['type'] === HTML5::COMMENT) {
4218            /* Append a Comment node to the first element in the stack of open
4219            elements (the html element), with the data attribute set to the
4220            data given in the comment token. */
4221            $comment = $this->dom->createComment($token['data']);
4222            $this->stack[0]->appendChild($comment);
4223
4224            /* An end tag with the tag name "html" */
4225        } elseif ($token['type'] === HTML5::ENDTAG && $token['name'] === 'html') {
4226            /* If the parser was originally created in order to handle the
4227            setting of an element's innerHTML attribute, this is a parse error;
4228            ignore the token. (The element will be an html element in this
4229            case.) (innerHTML case) */
4230
4231            /* Otherwise, switch to the trailing end phase. */
4232            $this->phase = self::END_PHASE;
4233
4234            /* Anything else */
4235        } else {
4236            /* Parse error. Set the insertion mode to "in body" and reprocess
4237            the token. */
4238            $this->mode = self::IN_BODY;
4239            return $this->inBody($token);
4240        }
4241    }
4242
4243    private function inFrameset($token)
4244    {
4245        /* Handle the token as follows: */
4246
4247        /* A character token that is one of one of U+0009 CHARACTER TABULATION,
4248        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
4249        U+000D CARRIAGE RETURN (CR), or U+0020 SPACE */
4250        if ($token['type'] === HTML5::CHARACTR &&
4251            preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
4252        ) {
4253            /* Append the character to the current node. */
4254            $this->insertText($token['data']);
4255
4256            /* A comment token */
4257        } elseif ($token['type'] === HTML5::COMMENT) {
4258            /* Append a Comment node to the current node with the data
4259            attribute set to the data given in the comment token. */
4260            $this->insertComment($token['data']);
4261
4262            /* A start tag with the tag name "frameset" */
4263        } elseif ($token['name'] === 'frameset' &&
4264            $token['type'] === HTML5::STARTTAG
4265        ) {
4266            $this->insertElement($token);
4267
4268            /* An end tag with the tag name "frameset" */
4269        } elseif ($token['name'] === 'frameset' &&
4270            $token['type'] === HTML5::ENDTAG
4271        ) {
4272            /* If the current node is the root html element, then this is a
4273            parse error; ignore the token. (innerHTML case) */
4274            if (end($this->stack)->nodeName === 'html') {
4275                // Ignore
4276
4277            } else {
4278                /* Otherwise, pop the current node from the stack of open
4279                elements. */
4280                array_pop($this->stack);
4281
4282                /* If the parser was not originally created in order to handle
4283                the setting of an element's innerHTML attribute (innerHTML case),
4284                and the current node is no longer a frameset element, then change
4285                the insertion mode to "after frameset". */
4286                $this->mode = self::AFTR_FRAME;
4287            }
4288
4289            /* A start tag with the tag name "frame" */
4290        } elseif ($token['name'] === 'frame' &&
4291            $token['type'] === HTML5::STARTTAG
4292        ) {
4293            /* Insert an HTML element for the token. */
4294            $this->insertElement($token);
4295
4296            /* Immediately pop the current node off the stack of open elements. */
4297            array_pop($this->stack);
4298
4299            /* A start tag with the tag name "noframes" */
4300        } elseif ($token['name'] === 'noframes' &&
4301            $token['type'] === HTML5::STARTTAG
4302        ) {
4303            /* Process the token as if the insertion mode had been "in body". */
4304            $this->inBody($token);
4305
4306            /* Anything else */
4307        } else {
4308            /* Parse error. Ignore the token. */
4309        }
4310    }
4311
4312    private function afterFrameset($token)
4313    {
4314        /* Handle the token as follows: */
4315
4316        /* A character token that is one of one of U+0009 CHARACTER TABULATION,
4317        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
4318        U+000D CARRIAGE RETURN (CR), or U+0020 SPACE */
4319        if ($token['type'] === HTML5::CHARACTR &&
4320            preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
4321        ) {
4322            /* Append the character to the current node. */
4323            $this->insertText($token['data']);
4324
4325            /* A comment token */
4326        } elseif ($token['type'] === HTML5::COMMENT) {
4327            /* Append a Comment node to the current node with the data
4328            attribute set to the data given in the comment token. */
4329            $this->insertComment($token['data']);
4330
4331            /* An end tag with the tag name "html" */
4332        } elseif ($token['name'] === 'html' &&
4333            $token['type'] === HTML5::ENDTAG
4334        ) {
4335            /* Switch to the trailing end phase. */
4336            $this->phase = self::END_PHASE;
4337
4338            /* A start tag with the tag name "noframes" */
4339        } elseif ($token['name'] === 'noframes' &&
4340            $token['type'] === HTML5::STARTTAG
4341        ) {
4342            /* Process the token as if the insertion mode had been "in body". */
4343            $this->inBody($token);
4344
4345            /* Anything else */
4346        } else {
4347            /* Parse error. Ignore the token. */
4348        }
4349    }
4350
4351    private function trailingEndPhase($token)
4352    {
4353        /* After the main phase, as each token is emitted from the tokenisation
4354        stage, it must be processed as described in this section. */
4355
4356        /* A DOCTYPE token */
4357        if ($token['type'] === HTML5::DOCTYPE) {
4358            // Parse error. Ignore the token.
4359
4360            /* A comment token */
4361        } elseif ($token['type'] === HTML5::COMMENT) {
4362            /* Append a Comment node to the Document object with the data
4363            attribute set to the data given in the comment token. */
4364            $comment = $this->dom->createComment($token['data']);
4365            $this->dom->appendChild($comment);
4366
4367            /* A character token that is one of one of U+0009 CHARACTER TABULATION,
4368            U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
4369            or U+0020 SPACE */
4370        } elseif ($token['type'] === HTML5::CHARACTR &&
4371            preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
4372        ) {
4373            /* Process the token as it would be processed in the main phase. */
4374            $this->mainPhase($token);
4375
4376            /* A character token that is not one of U+0009 CHARACTER TABULATION,
4377            U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
4378            or U+0020 SPACE. Or a start tag token. Or an end tag token. */
4379        } elseif (($token['type'] === HTML5::CHARACTR &&
4380                preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) ||
4381            $token['type'] === HTML5::STARTTAG || $token['type'] === HTML5::ENDTAG
4382        ) {
4383            /* Parse error. Switch back to the main phase and reprocess the
4384            token. */
4385            $this->phase = self::MAIN_PHASE;
4386            return $this->mainPhase($token);
4387
4388            /* An end-of-file token */
4389        } elseif ($token['type'] === HTML5::EOF) {
4390            /* OMG DONE!! */
4391        }
4392    }
4393
4394    private function insertElement($token, $append = true, $check = false)
4395    {
4396        // Proprietary workaround for libxml2's limitations with tag names
4397        if ($check) {
4398            // Slightly modified HTML5 tag-name modification,
4399            // removing anything that's not an ASCII letter, digit, or hyphen
4400            $token['name'] = preg_replace('/[^a-z0-9-]/i', '', $token['name']);
4401            // Remove leading hyphens and numbers
4402            $token['name'] = ltrim($token['name'], '-0..9');
4403            // In theory, this should ever be needed, but just in case
4404            if ($token['name'] === '') {
4405                $token['name'] = 'span';
4406            } // arbitrary generic choice
4407        }
4408
4409        $el = $this->dom->createElement($token['name']);
4410
4411        foreach ($token['attr'] as $attr) {
4412            if (!$el->hasAttribute($attr['name'])) {
4413                $el->setAttribute($attr['name'], $attr['value']);
4414            }
4415        }
4416
4417        $this->appendToRealParent($el);
4418        $this->stack[] = $el;
4419
4420        return $el;
4421    }
4422
4423    private function insertText($data)
4424    {
4425        $text = $this->dom->createTextNode($data);
4426        $this->appendToRealParent($text);
4427    }
4428
4429    private function insertComment($data)
4430    {
4431        $comment = $this->dom->createComment($data);
4432        $this->appendToRealParent($comment);
4433    }
4434
4435    private function appendToRealParent($node)
4436    {
4437        if ($this->foster_parent === null) {
4438            end($this->stack)->appendChild($node);
4439
4440        } elseif ($this->foster_parent !== null) {
4441            /* If the foster parent element is the parent element of the
4442            last table element in the stack of open elements, then the new
4443            node must be inserted immediately before the last table element
4444            in the stack of open elements in the foster parent element;
4445            otherwise, the new node must be appended to the foster parent
4446            element. */
4447            for ($n = count($this->stack) - 1; $n >= 0; $n--) {
4448                if ($this->stack[$n]->nodeName === 'table' &&
4449                    $this->stack[$n]->parentNode !== null
4450                ) {
4451                    $table = $this->stack[$n];
4452                    break;
4453                }
4454            }
4455
4456            if (isset($table) && $this->foster_parent->isSameNode($table->parentNode)) {
4457                $this->foster_parent->insertBefore($node, $table);
4458            } else {
4459                $this->foster_parent->appendChild($node);
4460            }
4461
4462            $this->foster_parent = null;
4463        }
4464    }
4465
4466    private function elementInScope($el, $table = false)
4467    {
4468        if (is_array($el)) {
4469            foreach ($el as $element) {
4470                if ($this->elementInScope($element, $table)) {
4471                    return true;
4472                }
4473            }
4474
4475            return false;
4476        }
4477
4478        $leng = count($this->stack);
4479
4480        for ($n = 0; $n < $leng; $n++) {
4481            /* 1. Initialise node to be the current node (the bottommost node of
4482            the stack). */
4483            $node = $this->stack[$leng - 1 - $n];
4484
4485            if ($node->tagName === $el) {
4486                /* 2. If node is the target node, terminate in a match state. */
4487                return true;
4488
4489            } elseif ($node->tagName === 'table') {
4490                /* 3. Otherwise, if node is a table element, terminate in a failure
4491                state. */
4492                return false;
4493
4494            } elseif ($table === true && in_array(
4495                    $node->tagName,
4496                    array(
4497                        'caption',
4498                        'td',
4499                        'th',
4500                        'button',
4501                        'marquee',
4502                        'object'
4503                    )
4504                )
4505            ) {
4506                /* 4. Otherwise, if the algorithm is the "has an element in scope"
4507                variant (rather than the "has an element in table scope" variant),
4508                and node is one of the following, terminate in a failure state. */
4509                return false;
4510
4511            } elseif ($node === $node->ownerDocument->documentElement) {
4512                /* 5. Otherwise, if node is an html element (root element), terminate
4513                in a failure state. (This can only happen if the node is the topmost
4514                node of the    stack of open elements, and prevents the next step from
4515                being invoked if there are no more elements in the stack.) */
4516                return false;
4517            }
4518
4519            /* Otherwise, set node to the previous entry in the stack of open
4520            elements and return to step 2. (This will never fail, since the loop
4521            will always terminate in the previous step if the top of the stack
4522            is reached.) */
4523        }
4524    }
4525
4526    private function reconstructActiveFormattingElements()
4527    {
4528        /* 1. If there are no entries in the list of active formatting elements,
4529        then there is nothing to reconstruct; stop this algorithm. */
4530        $formatting_elements = count($this->a_formatting);
4531
4532        if ($formatting_elements === 0) {
4533            return false;
4534        }
4535
4536        /* 3. Let entry be the last (most recently added) element in the list
4537        of active formatting elements. */
4538        $entry = end($this->a_formatting);
4539
4540        /* 2. If the last (most recently added) entry in the list of active
4541        formatting elements is a marker, or if it is an element that is in the
4542        stack of open elements, then there is nothing to reconstruct; stop this
4543        algorithm. */
4544        if ($entry === self::MARKER || in_array($entry, $this->stack, true)) {
4545            return false;
4546        }
4547
4548        for ($a = $formatting_elements - 1; $a >= 0; true) {
4549            /* 4. If there are no entries before entry in the list of active
4550            formatting elements, then jump to step 8. */
4551            if ($a === 0) {
4552                $step_seven = false;
4553                break;
4554            }
4555
4556            /* 5. Let entry be the entry one earlier than entry in the list of
4557            active formatting elements. */
4558            $a--;
4559            $entry = $this->a_formatting[$a];
4560
4561            /* 6. If entry is neither a marker nor an element that is also in
4562            thetack of open elements, go to step 4. */
4563            if ($entry === self::MARKER || in_array($entry, $this->stack, true)) {
4564                break;
4565            }
4566        }
4567
4568        while (true) {
4569            /* 7. Let entry be the element one later than entry in the list of
4570            active formatting elements. */
4571            if (isset($step_seven) && $step_seven === true) {
4572                $a++;
4573                $entry = $this->a_formatting[$a];
4574            }
4575
4576            /* 8. Perform a shallow clone of the element entry to obtain clone. */
4577            $clone = $entry->cloneNode();
4578
4579            /* 9. Append clone to the current node and push it onto the stack
4580            of open elements  so that it is the new current node. */
4581            end($this->stack)->appendChild($clone);
4582            $this->stack[] = $clone;
4583
4584            /* 10. Replace the entry for entry in the list with an entry for
4585            clone. */
4586            $this->a_formatting[$a] = $clone;
4587
4588            /* 11. If the entry for clone in the list of active formatting
4589            elements is not the last entry in the list, return to step 7. */
4590            if (end($this->a_formatting) !== $clone) {
4591                $step_seven = true;
4592            } else {
4593                break;
4594            }
4595        }
4596    }
4597
4598    private function clearTheActiveFormattingElementsUpToTheLastMarker()
4599    {
4600        /* When the steps below require the UA to clear the list of active
4601        formatting elements up to the last marker, the UA must perform the
4602        following steps: */
4603
4604        while (true) {
4605            /* 1. Let entry be the last (most recently added) entry in the list
4606            of active formatting elements. */
4607            $entry = end($this->a_formatting);
4608
4609            /* 2. Remove entry from the list of active formatting elements. */
4610            array_pop($this->a_formatting);
4611
4612            /* 3. If entry was a marker, then stop the algorithm at this point.
4613            The list has been cleared up to the last marker. */
4614            if ($entry === self::MARKER) {
4615                break;
4616            }
4617        }
4618    }
4619
4620    private function generateImpliedEndTags($exclude = array())
4621    {
4622        /* When the steps below require the UA to generate implied end tags,
4623        then, if the current node is a dd element, a dt element, an li element,
4624        a p element, a td element, a th  element, or a tr element, the UA must
4625        act as if an end tag with the respective tag name had been seen and
4626        then generate implied end tags again. */
4627        $node = end($this->stack);
4628        $elements = array_diff(array('dd', 'dt', 'li', 'p', 'td', 'th', 'tr'), $exclude);
4629
4630        while (in_array(end($this->stack)->nodeName, $elements)) {
4631            array_pop($this->stack);
4632        }
4633    }
4634
4635    private function getElementCategory($node)
4636    {
4637        $name = $node->tagName;
4638        if (in_array($name, $this->special)) {
4639            return self::SPECIAL;
4640        } elseif (in_array($name, $this->scoping)) {
4641            return self::SCOPING;
4642        } elseif (in_array($name, $this->formatting)) {
4643            return self::FORMATTING;
4644        } else {
4645            return self::PHRASING;
4646        }
4647    }
4648
4649    private function clearStackToTableContext($elements)
4650    {
4651        /* When the steps above require the UA to clear the stack back to a
4652        table context, it means that the UA must, while the current node is not
4653        a table element or an html element, pop elements from the stack of open
4654        elements. If this causes any elements to be popped from the stack, then
4655        this is a parse error. */
4656        while (true) {
4657            $node = end($this->stack)->nodeName;
4658
4659            if (in_array($node, $elements)) {
4660                break;
4661            } else {
4662                array_pop($this->stack);
4663            }
4664        }
4665    }
4666
4667    private function resetInsertionMode()
4668    {
4669        /* 1. Let last be false. */
4670        $last = false;
4671        $leng = count($this->stack);
4672
4673        for ($n = $leng - 1; $n >= 0; $n--) {
4674            /* 2. Let node be the last node in the stack of open elements. */
4675            $node = $this->stack[$n];
4676
4677            /* 3. If node is the first node in the stack of open elements, then
4678            set last to true. If the element whose innerHTML  attribute is being
4679            set is neither a td  element nor a th element, then set node to the
4680            element whose innerHTML  attribute is being set. (innerHTML  case) */
4681            if ($this->stack[0]->isSameNode($node)) {
4682                $last = true;
4683            }
4684
4685            /* 4. If node is a select element, then switch the insertion mode to
4686            "in select" and abort these steps. (innerHTML case) */
4687            if ($node->nodeName === 'select') {
4688                $this->mode = self::IN_SELECT;
4689                break;
4690
4691                /* 5. If node is a td or th element, then switch the insertion mode
4692                to "in cell" and abort these steps. */
4693            } elseif ($node->nodeName === 'td' || $node->nodeName === 'th') {
4694                $this->mode = self::IN_CELL;
4695                break;
4696
4697                /* 6. If node is a tr element, then switch the insertion mode to
4698                "in    row" and abort these steps. */
4699            } elseif ($node->nodeName === 'tr') {
4700                $this->mode = self::IN_ROW;
4701                break;
4702
4703                /* 7. If node is a tbody, thead, or tfoot element, then switch the
4704                insertion mode to "in table body" and abort these steps. */
4705            } elseif (in_array($node->nodeName, array('tbody', 'thead', 'tfoot'))) {
4706                $this->mode = self::IN_TBODY;
4707                break;
4708
4709                /* 8. If node is a caption element, then switch the insertion mode
4710                to "in caption" and abort these steps. */
4711            } elseif ($node->nodeName === 'caption') {
4712                $this->mode = self::IN_CAPTION;
4713                break;
4714
4715                /* 9. If node is a colgroup element, then switch the insertion mode
4716                to "in column group" and abort these steps. (innerHTML case) */
4717            } elseif ($node->nodeName === 'colgroup') {
4718                $this->mode = self::IN_CGROUP;
4719                break;
4720
4721                /* 10. If node is a table element, then switch the insertion mode
4722                to "in table" and abort these steps. */
4723            } elseif ($node->nodeName === 'table') {
4724                $this->mode = self::IN_TABLE;
4725                break;
4726
4727                /* 11. If node is a head element, then switch the insertion mode
4728                to "in body" ("in body"! not "in head"!) and abort these steps.
4729                (innerHTML case) */
4730            } elseif ($node->nodeName === 'head') {
4731                $this->mode = self::IN_BODY;
4732                break;
4733
4734                /* 12. If node is a body element, then switch the insertion mode to
4735                "in body" and abort these steps. */
4736            } elseif ($node->nodeName === 'body') {
4737                $this->mode = self::IN_BODY;
4738                break;
4739
4740                /* 13. If node is a frameset element, then switch the insertion
4741                mode to "in frameset" and abort these steps. (innerHTML case) */
4742            } elseif ($node->nodeName === 'frameset') {
4743                $this->mode = self::IN_FRAME;
4744                break;
4745
4746                /* 14. If node is an html element, then: if the head element
4747                pointer is null, switch the insertion mode to "before head",
4748                otherwise, switch the insertion mode to "after head". In either
4749                case, abort these steps. (innerHTML case) */
4750            } elseif ($node->nodeName === 'html') {
4751                $this->mode = ($this->head_pointer === null)
4752                    ? self::BEFOR_HEAD
4753                    : self::AFTER_HEAD;
4754
4755                break;
4756
4757                /* 15. If last is true, then set the insertion mode to "in body"
4758                and    abort these steps. (innerHTML case) */
4759            } elseif ($last) {
4760                $this->mode = self::IN_BODY;
4761                break;
4762            }
4763        }
4764    }
4765
4766    private function closeCell()
4767    {
4768        /* If the stack of open elements has a td or th element in table scope,
4769        then act as if an end tag token with that tag name had been seen. */
4770        foreach (array('td', 'th') as $cell) {
4771            if ($this->elementInScope($cell, true)) {
4772                $this->inCell(
4773                    array(
4774                        'name' => $cell,
4775                        'type' => HTML5::ENDTAG
4776                    )
4777                );
4778
4779                break;
4780            }
4781        }
4782    }
4783
4784    public function save()
4785    {
4786        return $this->dom;
4787    }
4788}
4789