1<?php
2
3/*
4
5Copyright 2007 Jeroen van der Meer <http://jero.net/>
6Copyright 2008 Edward Z. Yang <http://htmlpurifier.org/>
7Copyright 2009 Geoffrey Sneddon <http://gsnedders.com/>
8
9Permission is hereby granted, free of charge, to any person obtaining a
10copy of this software and associated documentation files (the
11"Software"), to deal in the Software without restriction, including
12without limitation the rights to use, copy, modify, merge, publish,
13distribute, sublicense, and/or sell copies of the Software, and to
14permit persons to whom the Software is furnished to do so, subject to
15the following conditions:
16
17The above copyright notice and this permission notice shall be included
18in all copies or substantial portions of the Software.
19
20THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
21OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
23IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
24CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
25TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
26SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27
28*/
29
30// Some conventions:
31// /* */ indicates verbatim text from the HTML 5 specification
32// // indicates regular comments
33
34// all flags are in hyphenated form
35
36class HTML5_Tokenizer {
37    /**
38     * @var HTML5_InputStream
39     *
40     * Points to an InputStream object.
41     */
42    protected $stream;
43
44    /**
45     * @var HTML5_TreeBuilder
46     *
     * Tree builder that the tokenizer emits tokens to.
48     */
49    private $tree;
50
51    /**
52     * @var int
53     *
54     * Current content model we are parsing as.
55     */
56    protected $content_model;
57
58    /**
59     * Current token that is being built, but not yet emitted. Also
60     * is the last token emitted, if applicable.
61     */
62    protected $token;
63
64    // These are constants describing the content model
65    const PCDATA    = 0;
66    const RCDATA    = 1;
67    const CDATA     = 2;
68    const PLAINTEXT = 3;
69
70    // These are constants describing tokens
71    // XXX should probably be moved somewhere else, probably the
72    // HTML5 class.
73    const DOCTYPE        = 0;
74    const STARTTAG       = 1;
75    const ENDTAG         = 2;
76    const COMMENT        = 3;
77    const CHARACTER      = 4;
78    const SPACECHARACTER = 5;
79    const EOF            = 6;
80    const PARSEERROR     = 7;
81
82    // These are constants representing bunches of characters.
83    const ALPHA       = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
84    const UPPER_ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
85    const LOWER_ALPHA = 'abcdefghijklmnopqrstuvwxyz';
86    const DIGIT       = '0123456789';
87    const HEX         = '0123456789ABCDEFabcdef';
88    const WHITESPACE  = "\t\n\x0c ";
89
90    /**
91     * @param $data | Data to parse
92     * @param HTML5_TreeBuilder|null $builder
93     */
94    public function __construct($data, $builder = null) {
95        $this->stream = new HTML5_InputStream($data);
96        if (!$builder) {
97            $this->tree = new HTML5_TreeBuilder;
98        } else {
99            $this->tree = $builder;
100        }
101        $this->content_model = self::PCDATA;
102    }
103
104    /**
105     * @param null $context
106     */
107    public function parseFragment($context = null) {
108        $this->tree->setupContext($context);
109        if ($this->tree->content_model) {
110            $this->content_model = $this->tree->content_model;
111            $this->tree->content_model = null;
112        }
113        $this->parse();
114    }
115
116    // XXX maybe convert this into an iterator? regardless, this function
117    // and the save function should go into a Parser facade of some sort
118    /**
119     * Performs the actual parsing of the document.
120     */
121    public function parse() {
122        // Current state
123        $state = 'data';
124        // This is used to avoid having to have look-behind in the data state.
125        $lastFourChars = '';
126        /**
127         * Escape flag as specified by the HTML5 specification: "used to
128         * control the behavior of the tokeniser. It is either true or
129         * false, and initially must be set to the false state."
130         */
131        $escape = false;
132        //echo "\n\n";
133        while($state !== null) {
134
135            /*echo $state . ' ';
136            switch ($this->content_model) {
137                case self::PCDATA: echo 'PCDATA'; break;
138                case self::RCDATA: echo 'RCDATA'; break;
139                case self::CDATA: echo 'CDATA'; break;
140                case self::PLAINTEXT: echo 'PLAINTEXT'; break;
141            }
142            if ($escape) echo " escape";
143            echo "\n";*/
144
145            switch($state) {
146                case 'data':
147
148                    /* Consume the next input character */
149                    $char = $this->stream->char();
150                    $lastFourChars .= $char;
151                    if (strlen($lastFourChars) > 4) {
152                        $lastFourChars = substr($lastFourChars, -4);
153                    }
154
155                    // see below for meaning
156                    $hyp_cond =
157                        !$escape &&
158                        (
159                            $this->content_model === self::RCDATA ||
160                            $this->content_model === self::CDATA
161                        );
162                    $amp_cond =
163                        !$escape &&
164                        (
165                            $this->content_model === self::PCDATA ||
166                            $this->content_model === self::RCDATA
167                        );
168                    $lt_cond =
169                        $this->content_model === self::PCDATA ||
170                        (
171                            (
172                                $this->content_model === self::RCDATA ||
173                                $this->content_model === self::CDATA
174                             ) &&
175                             !$escape
176                        );
177                    $gt_cond =
178                        $escape &&
179                        (
180                            $this->content_model === self::RCDATA ||
181                            $this->content_model === self::CDATA
182                        );
183
184                    if ($char === '&' && $amp_cond === true) {
185                        /* U+0026 AMPERSAND (&)
186                        When the content model flag is set to one of the PCDATA or RCDATA
187                        states and the escape flag is false: switch to the
188                        character reference data state. Otherwise: treat it as per
189                        the "anything else" entry below. */
190                        $state = 'character reference data';
191
192                    } elseif (
193                        $char === '-' &&
194                        $hyp_cond === true &&
195                        $lastFourChars === '<!--'
196                    ) {
197                        /*
198                        U+002D HYPHEN-MINUS (-)
199                        If the content model flag is set to either the RCDATA state or
200                        the CDATA state, and the escape flag is false, and there are at
201                        least three characters before this one in the input stream, and the
202                        last four characters in the input stream, including this one, are
203                        U+003C LESS-THAN SIGN, U+0021 EXCLAMATION MARK, U+002D HYPHEN-MINUS,
204                        and U+002D HYPHEN-MINUS ("<!--"), then set the escape flag to true. */
205                        $escape = true;
206
207                        /* In any case, emit the input character as a character token. Stay
208                        in the data state. */
209                        $this->emitToken(array(
210                            'type' => self::CHARACTER,
211                            'data' => '-'
212                        ));
213                        // We do the "any case" part as part of "anything else".
214
215                    /* U+003C LESS-THAN SIGN (<) */
216                    } elseif ($char === '<' && $lt_cond === true) {
217                        /* When the content model flag is set to the PCDATA state: switch
218                        to the tag open state.
219
220                        When the content model flag is set to either the RCDATA state or
221                        the CDATA state and the escape flag is false: switch to the tag
222                        open state.
223
224                        Otherwise: treat it as per the "anything else" entry below. */
225                        $state = 'tag open';
226
227                    /* U+003E GREATER-THAN SIGN (>) */
228                    } elseif (
229                        $char === '>' &&
230                        $gt_cond === true &&
231                        substr($lastFourChars, 1) === '-->'
232                    ) {
233                        /* If the content model flag is set to either the RCDATA state or
234                        the CDATA state, and the escape flag is true, and the last three
235                        characters in the input stream including this one are U+002D
236                        HYPHEN-MINUS, U+002D HYPHEN-MINUS, U+003E GREATER-THAN SIGN ("-->"),
237                        set the escape flag to false. */
238                        $escape = false;
239
240                        /* In any case, emit the input character as a character token.
241                        Stay in the data state. */
242                        $this->emitToken(array(
243                            'type' => self::CHARACTER,
244                            'data' => '>'
245                        ));
246                        // We do the "any case" part as part of "anything else".
247
248                    } elseif ($char === false) {
249                        /* EOF
250                        Emit an end-of-file token. */
251                        $state = null;
252                        $this->tree->emitToken(array(
253                            'type' => self::EOF
254                        ));
255
256                    } elseif ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
257                        // Directly after emitting a token you switch back to the "data
258                        // state". At that point spaceCharacters are important so they are
259                        // emitted separately.
260                        $chars = $this->stream->charsWhile(self::WHITESPACE);
261                        $this->emitToken(array(
262                            'type' => self::SPACECHARACTER,
263                            'data' => $char . $chars
264                        ));
265                        $lastFourChars .= $chars;
266                        if (strlen($lastFourChars) > 4) {
267                            $lastFourChars = substr($lastFourChars, -4);
268                        }
269                    } else {
270                        /* Anything else
271                        THIS IS AN OPTIMIZATION: Get as many character that
272                        otherwise would also be treated as a character token and emit it
273                        as a single character token. Stay in the data state. */
274
275                        $mask = '';
276                        if ($hyp_cond === true) {
277                            $mask .= '-';
278                        }
279                        if ($amp_cond === true) {
280                            $mask .= '&';
281                        }
282                        if ($lt_cond === true) {
283                            $mask .= '<';
284                        }
285                        if ($gt_cond === true) {
286                            $mask .= '>';
287                        }
288
289                        if ($mask === '') {
290                            $chars = $this->stream->remainingChars();
291                        } else {
292                            $chars = $this->stream->charsUntil($mask);
293                        }
294
295                        $this->emitToken(array(
296                            'type' => self::CHARACTER,
297                            'data' => $char . $chars
298                        ));
299
300                        $lastFourChars .= $chars;
301                        if (strlen($lastFourChars) > 4) {
302                            $lastFourChars = substr($lastFourChars, -4);
303                        }
304
305                        $state = 'data';
306                    }
307                break;
308
309                case 'character reference data':
310                    /* (This cannot happen if the content model flag
311                    is set to the CDATA state.) */
312
313                    /* Attempt to consume a character reference, with no
314                    additional allowed character. */
315                    $entity = $this->consumeCharacterReference();
316
317                    /* If nothing is returned, emit a U+0026 AMPERSAND
318                    character token. Otherwise, emit the character token that
319                    was returned. */
320                    // This is all done when consuming the character reference.
321                    $this->emitToken(array(
322                        'type' => self::CHARACTER,
323                        'data' => $entity
324                    ));
325
326                    /* Finally, switch to the data state. */
327                    $state = 'data';
328                break;
329
330                case 'tag open':
331                    $char = $this->stream->char();
332
333                    switch ($this->content_model) {
334                        case self::RCDATA:
335                        case self::CDATA:
336                            /* Consume the next input character. If it is a
337                            U+002F SOLIDUS (/) character, switch to the close
338                            tag open state. Otherwise, emit a U+003C LESS-THAN
339                            SIGN character token and reconsume the current input
340                            character in the data state. */
341                            // We consumed above.
342
343                            if ($char === '/') {
344                                $state = 'close tag open';
345                            } else {
346                                $this->emitToken(array(
347                                    'type' => self::CHARACTER,
348                                    'data' => '<'
349                                ));
350
351                                $this->stream->unget();
352
353                                $state = 'data';
354                            }
355                        break;
356
357                        case self::PCDATA:
358                            /* If the content model flag is set to the PCDATA state
359                            Consume the next input character: */
360                            // We consumed above.
361
362                            if ($char === '!') {
363                                /* U+0021 EXCLAMATION MARK (!)
364                                Switch to the markup declaration open state. */
365                                $state = 'markup declaration open';
366
367                            } elseif ($char === '/') {
368                                /* U+002F SOLIDUS (/)
369                                Switch to the close tag open state. */
370                                $state = 'close tag open';
371
372                            } elseif ('A' <= $char && $char <= 'Z') {
373                                /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
374                                Create a new start tag token, set its tag name to the lowercase
375                                version of the input character (add 0x0020 to the character's code
376                                point), then switch to the tag name state. (Don't emit the token
377                                yet; further details will be filled in before it is emitted.) */
378                                $this->token = array(
379                                    'name'  => strtolower($char),
380                                    'type'  => self::STARTTAG,
381                                    'attr'  => array()
382                                );
383
384                                $state = 'tag name';
385
386                            } elseif ('a' <= $char && $char <= 'z') {
387                                /* U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL LETTER Z
388                                Create a new start tag token, set its tag name to the input
389                                character, then switch to the tag name state. (Don't emit
390                                the token yet; further details will be filled in before it
391                                is emitted.) */
392                                $this->token = array(
393                                    'name'  => $char,
394                                    'type'  => self::STARTTAG,
395                                    'attr'  => array()
396                                );
397
398                                $state = 'tag name';
399
400                            } elseif ($char === '>') {
401                                /* U+003E GREATER-THAN SIGN (>)
402                                Parse error. Emit a U+003C LESS-THAN SIGN character token and a
403                                U+003E GREATER-THAN SIGN character token. Switch to the data state. */
404                                $this->emitToken(array(
405                                    'type' => self::PARSEERROR,
406                                    'data' => 'expected-tag-name-but-got-right-bracket'
407                                ));
408                                $this->emitToken(array(
409                                    'type' => self::CHARACTER,
410                                    'data' => '<>'
411                                ));
412
413                                $state = 'data';
414
415                            } elseif ($char === '?') {
416                                /* U+003F QUESTION MARK (?)
417                                Parse error. Switch to the bogus comment state. */
418                                $this->emitToken(array(
419                                    'type' => self::PARSEERROR,
420                                    'data' => 'expected-tag-name-but-got-question-mark'
421                                ));
422                                $this->token = array(
423                                    'data' => '?',
424                                    'type' => self::COMMENT
425                                );
426                                $state = 'bogus comment';
427
428                            } else {
429                                /* Anything else
430                                Parse error. Emit a U+003C LESS-THAN SIGN character token and
431                                reconsume the current input character in the data state. */
432                                $this->emitToken(array(
433                                    'type' => self::PARSEERROR,
434                                    'data' => 'expected-tag-name'
435                                ));
436                                $this->emitToken(array(
437                                    'type' => self::CHARACTER,
438                                    'data' => '<'
439                                ));
440
441                                $state = 'data';
442                                $this->stream->unget();
443                            }
444                        break;
445                    }
446                break;
447
448                case 'close tag open':
449                    if (
450                        $this->content_model === self::RCDATA ||
451                        $this->content_model === self::CDATA
452                    ) {
453                        /* If the content model flag is set to the RCDATA or CDATA
454                        states... */
455                        $name = strtolower($this->stream->charsWhile(self::ALPHA));
456                        $following = $this->stream->char();
457                        $this->stream->unget();
458                        if (
459                            !$this->token ||
460                            $this->token['name'] !== $name ||
461                            $this->token['name'] === $name && !in_array($following, array("\x09", "\x0A", "\x0C", "\x20", "\x3E", "\x2F", false))
462                        ) {
463                            /* if no start tag token has ever been emitted by this instance
464                            of the tokenizer (fragment case), or, if the next few
465                            characters do not match the tag name of the last start tag
466                            token emitted (compared in an ASCII case-insensitive manner),
467                            or if they do but they are not immediately followed by one of
468                            the following characters:
469
470                                * U+0009 CHARACTER TABULATION
471                                * U+000A LINE FEED (LF)
472                                * U+000C FORM FEED (FF)
473                                * U+0020 SPACE
474                                * U+003E GREATER-THAN SIGN (>)
475                                * U+002F SOLIDUS (/)
476                                * EOF
477
478                            ...then emit a U+003C LESS-THAN SIGN character token, a
479                            U+002F SOLIDUS character token, and switch to the data
480                            state to process the next input character. */
481                            // XXX: Probably ought to replace in_array with $following === x ||...
482
483                            // We also need to emit $name now we've consumed that, as we
484                            // know it'll just be emitted as a character token.
485                            $this->emitToken(array(
486                                'type' => self::CHARACTER,
487                                'data' => '</' . $name
488                            ));
489
490                            $state = 'data';
491                        } else {
492                            // This matches what would happen if we actually did the
493                            // otherwise below (but we can't because we've consumed too
494                            // much).
495
496                            // Start the end tag token with the name we already have.
497                            $this->token = array(
498                                'name'  => $name,
499                                'type'  => self::ENDTAG
500                            );
501
502                            // Change to tag name state.
503                            $state = 'tag name';
504                        }
505                    } elseif ($this->content_model === self::PCDATA) {
506                        /* Otherwise, if the content model flag is set to the PCDATA
507                        state [...]: */
508                        $char = $this->stream->char();
509
510                        if ('A' <= $char && $char <= 'Z') {
511                            /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
512                            Create a new end tag token, set its tag name to the lowercase version
513                            of the input character (add 0x0020 to the character's code point), then
514                            switch to the tag name state. (Don't emit the token yet; further details
515                            will be filled in before it is emitted.) */
516                            $this->token = array(
517                                'name'  => strtolower($char),
518                                'type'  => self::ENDTAG
519                            );
520
521                            $state = 'tag name';
522
523                        } elseif ('a' <= $char && $char <= 'z') {
524                            /* U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL LETTER Z
525                            Create a new end tag token, set its tag name to the
526                            input character, then switch to the tag name state.
527                            (Don't emit the token yet; further details will be
528                            filled in before it is emitted.) */
529                            $this->token = array(
530                                'name'  => $char,
531                                'type'  => self::ENDTAG
532                            );
533
534                            $state = 'tag name';
535
536                        } elseif ($char === '>') {
537                            /* U+003E GREATER-THAN SIGN (>)
538                            Parse error. Switch to the data state. */
539                            $this->emitToken(array(
540                                'type' => self::PARSEERROR,
541                                'data' => 'expected-closing-tag-but-got-right-bracket'
542                            ));
543                            $state = 'data';
544
545                        } elseif ($char === false) {
546                            /* EOF
547                            Parse error. Emit a U+003C LESS-THAN SIGN character token and a U+002F
548                            SOLIDUS character token. Reconsume the EOF character in the data state. */
549                            $this->emitToken(array(
550                                'type' => self::PARSEERROR,
551                                'data' => 'expected-closing-tag-but-got-eof'
552                            ));
553                            $this->emitToken(array(
554                                'type' => self::CHARACTER,
555                                'data' => '</'
556                            ));
557
558                            $this->stream->unget();
559                            $state = 'data';
560
561                        } else {
562                            /* Parse error. Switch to the bogus comment state. */
563                            $this->emitToken(array(
564                                'type' => self::PARSEERROR,
565                                'data' => 'expected-closing-tag-but-got-char'
566                            ));
567                            $this->token = array(
568                                'data' => $char,
569                                'type' => self::COMMENT
570                            );
571                            $state = 'bogus comment';
572                        }
573                    }
574                break;
575
576                case 'tag name':
577                    /* Consume the next input character: */
578                    $char = $this->stream->char();
579
580                    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
581                        /* U+0009 CHARACTER TABULATION
582                        U+000A LINE FEED (LF)
583                        U+000C FORM FEED (FF)
584                        U+0020 SPACE
585                        Switch to the before attribute name state. */
586                        $state = 'before attribute name';
587
588                    } elseif ($char === '/') {
589                        /* U+002F SOLIDUS (/)
590                        Switch to the self-closing start tag state. */
591                        $state = 'self-closing start tag';
592
593                    } elseif ($char === '>') {
594                        /* U+003E GREATER-THAN SIGN (>)
595                        Emit the current tag token. Switch to the data state. */
596                        $this->emitToken($this->token);
597                        $state = 'data';
598
599                    } elseif ('A' <= $char && $char <= 'Z') {
600                        /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
601                        Append the lowercase version of the current input
602                        character (add 0x0020 to the character's code point) to
603                        the current tag token's tag name. Stay in the tag name state. */
604                        $chars = $this->stream->charsWhile(self::UPPER_ALPHA);
605
606                        $this->token['name'] .= strtolower($char . $chars);
607                        $state = 'tag name';
608
609                    } elseif ($char === false) {
610                        /* EOF
611                        Parse error. Reconsume the EOF character in the data state. */
612                        $this->emitToken(array(
613                            'type' => self::PARSEERROR,
614                            'data' => 'eof-in-tag-name'
615                        ));
616
617                        $this->stream->unget();
618                        $state = 'data';
619
620                    } else {
621                        /* Anything else
622                        Append the current input character to the current tag token's tag name.
623                        Stay in the tag name state. */
624                        $chars = $this->stream->charsUntil("\t\n\x0C />" . self::UPPER_ALPHA);
625
626                        $this->token['name'] .= $char . $chars;
627                        $state = 'tag name';
628                    }
629                break;
630
631                case 'before attribute name':
632                    /* Consume the next input character: */
633                    $char = $this->stream->char();
634
635                    // this conditional is optimized, check bottom
636                    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
637                        /* U+0009 CHARACTER TABULATION
638                        U+000A LINE FEED (LF)
639                        U+000C FORM FEED (FF)
640                        U+0020 SPACE
641                        Stay in the before attribute name state. */
642                        $state = 'before attribute name';
643
644                    } elseif ($char === '/') {
645                        /* U+002F SOLIDUS (/)
646                        Switch to the self-closing start tag state. */
647                        $state = 'self-closing start tag';
648
649                    } elseif ($char === '>') {
650                        /* U+003E GREATER-THAN SIGN (>)
651                        Emit the current tag token. Switch to the data state. */
652                        $this->emitToken($this->token);
653                        $state = 'data';
654
655                    } elseif ('A' <= $char && $char <= 'Z') {
656                        /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
657                        Start a new attribute in the current tag token. Set that
658                        attribute's name to the lowercase version of the current
659                        input character (add 0x0020 to the character's code
660                        point), and its value to the empty string. Switch to the
661                        attribute name state.*/
662                        $this->token['attr'][] = array(
663                            'name'  => strtolower($char),
664                            'value' => ''
665                        );
666
667                        $state = 'attribute name';
668
669                    } elseif ($char === false) {
670                        /* EOF
671                        Parse error. Reconsume the EOF character in the data state. */
672                        $this->emitToken(array(
673                            'type' => self::PARSEERROR,
674                            'data' => 'expected-attribute-name-but-got-eof'
675                        ));
676
677                        $this->stream->unget();
678                        $state = 'data';
679
680                    } else {
681                        /* U+0022 QUOTATION MARK (")
682                           U+0027 APOSTROPHE (')
683                           U+003C LESS-THAN SIGN (<)
684                           U+003D EQUALS SIGN (=)
685                        Parse error. Treat it as per the "anything else" entry
686                        below. */
687                        if ($char === '"' || $char === "'" || $char === '<' || $char === '=') {
688                            $this->emitToken(array(
689                                'type' => self::PARSEERROR,
690                                'data' => 'invalid-character-in-attribute-name'
691                            ));
692                        }
693
694                        /* Anything else
695                        Start a new attribute in the current tag token. Set that attribute's
696                        name to the current input character, and its value to the empty string.
697                        Switch to the attribute name state. */
698                        $this->token['attr'][] = array(
699                            'name'  => $char,
700                            'value' => ''
701                        );
702
703                        $state = 'attribute name';
704                    }
705                break;
706
707                case 'attribute name':
708                    // Consume the next input character:
709                    $char = $this->stream->char();
710
711                    // this conditional is optimized, check bottom
712                    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
713                        /* U+0009 CHARACTER TABULATION
714                        U+000A LINE FEED (LF)
715                        U+000C FORM FEED (FF)
716                        U+0020 SPACE
717                        Switch to the after attribute name state. */
718                        $state = 'after attribute name';
719
720                    } elseif ($char === '/') {
721                        /* U+002F SOLIDUS (/)
722                        Switch to the self-closing start tag state. */
723                        $state = 'self-closing start tag';
724
725                    } elseif ($char === '=') {
726                        /* U+003D EQUALS SIGN (=)
727                        Switch to the before attribute value state. */
728                        $state = 'before attribute value';
729
730                    } elseif ($char === '>') {
731                        /* U+003E GREATER-THAN SIGN (>)
732                        Emit the current tag token. Switch to the data state. */
733                        $this->emitToken($this->token);
734                        $state = 'data';
735
736                    } elseif ('A' <= $char && $char <= 'Z') {
737                        /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
738                        Append the lowercase version of the current input
739                        character (add 0x0020 to the character's code point) to
740                        the current attribute's name. Stay in the attribute name
741                        state. */
742                        $chars = $this->stream->charsWhile(self::UPPER_ALPHA);
743
744                        $last = count($this->token['attr']) - 1;
745                        $this->token['attr'][$last]['name'] .= strtolower($char . $chars);
746
747                        $state = 'attribute name';
748
749                    } elseif ($char === false) {
750                        /* EOF
751                        Parse error. Reconsume the EOF character in the data state. */
752                        $this->emitToken(array(
753                            'type' => self::PARSEERROR,
754                            'data' => 'eof-in-attribute-name'
755                        ));
756
757                        $this->stream->unget();
758                        $state = 'data';
759
760                    } else {
761                        /* U+0022 QUOTATION MARK (")
762                           U+0027 APOSTROPHE (')
763                           U+003C LESS-THAN SIGN (<)
764                        Parse error. Treat it as per the "anything else"
765                        entry below. */
766                        if ($char === '"' || $char === "'" || $char === '<') {
767                            $this->emitToken(array(
768                                'type' => self::PARSEERROR,
769                                'data' => 'invalid-character-in-attribute-name'
770                            ));
771                        }
772
773                        /* Anything else
774                        Append the current input character to the current attribute's name.
775                        Stay in the attribute name state. */
776                        $chars = $this->stream->charsUntil("\t\n\x0C /=>\"'" . self::UPPER_ALPHA);
777
778                        $last = count($this->token['attr']) - 1;
779                        $this->token['attr'][$last]['name'] .= $char . $chars;
780
781                        $state = 'attribute name';
782                    }
783
784                    /* When the user agent leaves the attribute name state
785                    (and before emitting the tag token, if appropriate), the
786                    complete attribute's name must be compared to the other
787                    attributes on the same token; if there is already an
788                    attribute on the token with the exact same name, then this
789                    is a parse error and the new attribute must be dropped, along
790                    with the value that gets associated with it (if any). */
791                    // this might be implemented in the emitToken method
792                break;
793
794                case 'after attribute name':
795                    // Consume the next input character:
796                    $char = $this->stream->char();
797
798                    // this is an optimized conditional, check the bottom
799                    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
800                        /* U+0009 CHARACTER TABULATION
801                        U+000A LINE FEED (LF)
802                        U+000C FORM FEED (FF)
803                        U+0020 SPACE
804                        Stay in the after attribute name state. */
805                        $state = 'after attribute name';
806
807                    } elseif ($char === '/') {
808                        /* U+002F SOLIDUS (/)
809                        Switch to the self-closing start tag state. */
810                        $state = 'self-closing start tag';
811
812                    } elseif ($char === '=') {
813                        /* U+003D EQUALS SIGN (=)
814                        Switch to the before attribute value state. */
815                        $state = 'before attribute value';
816
817                    } elseif ($char === '>') {
818                        /* U+003E GREATER-THAN SIGN (>)
819                        Emit the current tag token. Switch to the data state. */
820                        $this->emitToken($this->token);
821                        $state = 'data';
822
823                    } elseif ('A' <= $char && $char <= 'Z') {
824                        /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
825                        Start a new attribute in the current tag token. Set that
826                        attribute's name to the lowercase version of the current
827                        input character (add 0x0020 to the character's code
828                        point), and its value to the empty string. Switch to the
829                        attribute name state. */
830                        $this->token['attr'][] = array(
831                            'name'  => strtolower($char),
832                            'value' => ''
833                        );
834
835                        $state = 'attribute name';
836
837                    } elseif ($char === false) {
838                        /* EOF
839                        Parse error. Reconsume the EOF character in the data state. */
840                        $this->emitToken(array(
841                            'type' => self::PARSEERROR,
842                            'data' => 'expected-end-of-tag-but-got-eof'
843                        ));
844
845                        $this->stream->unget();
846                        $state = 'data';
847
848                    } else {
849                        /* U+0022 QUOTATION MARK (")
850                           U+0027 APOSTROPHE (')
851                           U+003C LESS-THAN SIGN (<)
852                        Parse error. Treat it as per the "anything else"
853                        entry below. */
854                        if ($char === '"' || $char === "'" || $char === "<") {
855                            $this->emitToken(array(
856                                'type' => self::PARSEERROR,
857                                'data' => 'invalid-character-after-attribute-name'
858                            ));
859                        }
860
861                        /* Anything else
862                        Start a new attribute in the current tag token. Set that attribute's
863                        name to the current input character, and its value to the empty string.
864                        Switch to the attribute name state. */
865                        $this->token['attr'][] = array(
866                            'name'  => $char,
867                            'value' => ''
868                        );
869
870                        $state = 'attribute name';
871                    }
872                break;
873
874                case 'before attribute value':
875                    // Consume the next input character:
876                    $char = $this->stream->char();
877
878                    // this is an optimized conditional
879                    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
880                        /* U+0009 CHARACTER TABULATION
881                        U+000A LINE FEED (LF)
882                        U+000C FORM FEED (FF)
883                        U+0020 SPACE
884                        Stay in the before attribute value state. */
885                        $state = 'before attribute value';
886
887                    } elseif ($char === '"') {
888                        /* U+0022 QUOTATION MARK (")
889                        Switch to the attribute value (double-quoted) state. */
890                        $state = 'attribute value (double-quoted)';
891
892                    } elseif ($char === '&') {
893                        /* U+0026 AMPERSAND (&)
894                        Switch to the attribute value (unquoted) state and reconsume
895                        this input character. */
896                        $this->stream->unget();
897                        $state = 'attribute value (unquoted)';
898
899                    } elseif ($char === '\'') {
900                        /* U+0027 APOSTROPHE (')
901                        Switch to the attribute value (single-quoted) state. */
902                        $state = 'attribute value (single-quoted)';
903
904                    } elseif ($char === '>') {
905                        /* U+003E GREATER-THAN SIGN (>)
906                        Parse error. Emit the current tag token. Switch to the data state. */
907                        $this->emitToken(array(
908                            'type' => self::PARSEERROR,
909                            'data' => 'expected-attribute-value-but-got-right-bracket'
910                        ));
911                        $this->emitToken($this->token);
912                        $state = 'data';
913
914                    } elseif ($char === false) {
915                        /* EOF
916                        Parse error. Reconsume the EOF character in the data state. */
917                        $this->emitToken(array(
918                            'type' => self::PARSEERROR,
919                            'data' => 'expected-attribute-value-but-got-eof'
920                        ));
921                        $this->stream->unget();
922                        $state = 'data';
923
924                    } else {
925                        /* U+003D EQUALS SIGN (=)
926                           U+003C LESS-THAN SIGN (<)
927                        Parse error. Treat it as per the "anything else" entry below. */
928                        if ($char === '=' || $char === '<') {
929                            $this->emitToken(array(
930                                'type' => self::PARSEERROR,
931                                'data' => 'equals-in-unquoted-attribute-value'
932                            ));
933                        }
934
935                        /* Anything else
936                        Append the current input character to the current attribute's value.
937                        Switch to the attribute value (unquoted) state. */
938                        $last = count($this->token['attr']) - 1;
939                        $this->token['attr'][$last]['value'] .= $char;
940
941                        $state = 'attribute value (unquoted)';
942                    }
943                break;
944
945                case 'attribute value (double-quoted)':
946                    // Consume the next input character:
947                    $char = $this->stream->char();
948
949                    if ($char === '"') {
950                        /* U+0022 QUOTATION MARK (")
951                        Switch to the after attribute value (quoted) state. */
952                        $state = 'after attribute value (quoted)';
953
954                    } elseif ($char === '&') {
955                        /* U+0026 AMPERSAND (&)
956                        Switch to the character reference in attribute value
957                        state, with the additional allowed character
958                        being U+0022 QUOTATION MARK ("). */
959                        $this->characterReferenceInAttributeValue('"');
960
961                    } elseif ($char === false) {
962                        /* EOF
963                        Parse error. Reconsume the EOF character in the data state. */
964                        $this->emitToken(array(
965                            'type' => self::PARSEERROR,
966                            'data' => 'eof-in-attribute-value-double-quote'
967                        ));
968
969                        $this->stream->unget();
970                        $state = 'data';
971
972                    } else {
973                        /* Anything else
974                        Append the current input character to the current attribute's value.
975                        Stay in the attribute value (double-quoted) state. */
976                        $chars = $this->stream->charsUntil('"&');
977
978                        $last = count($this->token['attr']) - 1;
979                        $this->token['attr'][$last]['value'] .= $char . $chars;
980
981                        $state = 'attribute value (double-quoted)';
982                    }
983                break;
984
985                case 'attribute value (single-quoted)':
986                    // Consume the next input character:
987                    $char = $this->stream->char();
988
989                    if ($char === "'") {
990                        /* U+0027 APOSTROPHE (')
991                        Switch to the after attribute value (quoted) state. */
992                        $state = 'after attribute value (quoted)';
993
994                    } elseif ($char === '&') {
995                        /* U+0026 AMPERSAND (&)
996                        Switch to the character reference in attribute value state, with the additional allowed character being U+0027 APOSTROPHE ('). */
997                        $this->characterReferenceInAttributeValue("'");
998
999                    } elseif ($char === false) {
1000                        /* EOF
1001                        Parse error. Reconsume the EOF character in the data state. */
1002                        $this->emitToken(array(
1003                            'type' => self::PARSEERROR,
1004                            'data' => 'eof-in-attribute-value-single-quote'
1005                        ));
1006
1007                        $this->stream->unget();
1008                        $state = 'data';
1009
1010                    } else {
1011                        /* Anything else
1012                        Append the current input character to the current attribute's value.
1013                        Stay in the attribute value (single-quoted) state. */
1014                        $chars = $this->stream->charsUntil("'&");
1015
1016                        $last = count($this->token['attr']) - 1;
1017                        $this->token['attr'][$last]['value'] .= $char . $chars;
1018
1019                        $state = 'attribute value (single-quoted)';
1020                    }
1021                break;
1022
1023                case 'attribute value (unquoted)':
1024                    // Consume the next input character:
1025                    $char = $this->stream->char();
1026
1027                    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1028                        /* U+0009 CHARACTER TABULATION
1029                        U+000A LINE FEED (LF)
1030                        U+000C FORM FEED (FF)
1031                        U+0020 SPACE
1032                        Switch to the before attribute name state. */
1033                        $state = 'before attribute name';
1034
1035                    } elseif ($char === '&') {
1036                        /* U+0026 AMPERSAND (&)
1037                        Switch to the character reference in attribute value state,
1038                        with the additional allowed character being U+003E
1039                        GREATER-THAN SIGN (>). */
1040                        $this->characterReferenceInAttributeValue('>');
1041
1042                    } elseif ($char === '>') {
1043                        /* U+003E GREATER-THAN SIGN (>)
1044                        Emit the current tag token. Switch to the data state. */
1045                        $this->emitToken($this->token);
1046                        $state = 'data';
1047
1048                    } elseif ($char === false) {
1049                        /* EOF
1050                        Parse error. Reconsume the EOF character in the data state. */
1051                        $this->emitToken(array(
1052                            'type' => self::PARSEERROR,
1053                            'data' => 'eof-in-attribute-value-no-quotes'
1054                        ));
1055                        $this->stream->unget();
1056                        $state = 'data';
1057
1058                    } else {
1059                        /* U+0022 QUOTATION MARK (")
1060                           U+0027 APOSTROPHE (')
1061                           U+003C LESS-THAN SIGN (<)
1062                           U+003D EQUALS SIGN (=)
1063                        Parse error. Treat it as per the "anything else"
1064                        entry below. */
1065                        if ($char === '"' || $char === "'" || $char === '=' || $char == '<') {
1066                            $this->emitToken(array(
1067                                'type' => self::PARSEERROR,
1068                                'data' => 'unexpected-character-in-unquoted-attribute-value'
1069                            ));
1070                        }
1071
1072                        /* Anything else
1073                        Append the current input character to the current attribute's value.
1074                        Stay in the attribute value (unquoted) state. */
1075                        $chars = $this->stream->charsUntil("\t\n\x0c &>\"'=");
1076
1077                        $last = count($this->token['attr']) - 1;
1078                        $this->token['attr'][$last]['value'] .= $char . $chars;
1079
1080                        $state = 'attribute value (unquoted)';
1081                    }
1082                break;
1083
1084                case 'after attribute value (quoted)':
1085                    /* Consume the next input character: */
1086                    $char = $this->stream->char();
1087
1088                    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1089                        /* U+0009 CHARACTER TABULATION
1090                           U+000A LINE FEED (LF)
1091                           U+000C FORM FEED (FF)
1092                           U+0020 SPACE
1093                        Switch to the before attribute name state. */
1094                        $state = 'before attribute name';
1095
1096                    } elseif ($char === '/') {
1097                        /* U+002F SOLIDUS (/)
1098                        Switch to the self-closing start tag state. */
1099                        $state = 'self-closing start tag';
1100
1101                    } elseif ($char === '>') {
1102                        /* U+003E GREATER-THAN SIGN (>)
1103                        Emit the current tag token. Switch to the data state. */
1104                        $this->emitToken($this->token);
1105                        $state = 'data';
1106
1107                    } elseif ($char === false) {
1108                        /* EOF
1109                        Parse error. Reconsume the EOF character in the data state. */
1110                        $this->emitToken(array(
1111                            'type' => self::PARSEERROR,
1112                            'data' => 'unexpected-EOF-after-attribute-value'
1113                        ));
1114                        $this->stream->unget();
1115                        $state = 'data';
1116
1117                    } else {
1118                        /* Anything else
1119                        Parse error. Reconsume the character in the before attribute
1120                        name state. */
1121                        $this->emitToken(array(
1122                            'type' => self::PARSEERROR,
1123                            'data' => 'unexpected-character-after-attribute-value'
1124                        ));
1125                        $this->stream->unget();
1126                        $state = 'before attribute name';
1127                    }
1128                break;
1129
1130                case 'self-closing start tag':
1131                    /* Consume the next input character: */
1132                    $char = $this->stream->char();
1133
1134                    if ($char === '>') {
1135                        /* U+003E GREATER-THAN SIGN (>)
1136                        Set the self-closing flag of the current tag token.
1137                        Emit the current tag token. Switch to the data state. */
1138                        // not sure if this is the name we want
1139                        $this->token['self-closing'] = true;
1140                        $this->emitToken($this->token);
1141                        $state = 'data';
1142
1143                    } elseif ($char === false) {
1144                        /* EOF
1145                        Parse error. Reconsume the EOF character in the data state. */
1146                        $this->emitToken(array(
1147                            'type' => self::PARSEERROR,
1148                            'data' => 'unexpected-eof-after-self-closing'
1149                        ));
1150                        $this->stream->unget();
1151                        $state = 'data';
1152
1153                    } else {
1154                        /* Anything else
1155                        Parse error. Reconsume the character in the before attribute name state. */
1156                        $this->emitToken(array(
1157                            'type' => self::PARSEERROR,
1158                            'data' => 'unexpected-character-after-self-closing'
1159                        ));
1160                        $this->stream->unget();
1161                        $state = 'before attribute name';
1162                    }
1163                break;
1164
1165                case 'bogus comment':
1166                    /* (This can only happen if the content model flag is set to the PCDATA state.) */
1167                    /* Consume every character up to the first U+003E GREATER-THAN SIGN
1168                    character (>) or the end of the file (EOF), whichever comes first. Emit
1169                    a comment token whose data is the concatenation of all the characters
1170                    starting from and including the character that caused the state machine
1171                    to switch into the bogus comment state, up to and including the last
1172                    consumed character before the U+003E character, if any, or up to the
1173                    end of the file otherwise. (If the comment was started by the end of
1174                    the file (EOF), the token is empty.) */
1175                    $this->token['data'] .= (string) $this->stream->charsUntil('>');
1176                    $this->stream->char();
1177
1178                    $this->emitToken($this->token);
1179
1180                    /* Switch to the data state. */
1181                    $state = 'data';
1182                break;
1183
1184                case 'markup declaration open':
1185                    // Consume for below
1186                    $hyphens = $this->stream->charsWhile('-', 2);
1187                    if ($hyphens === '-') {
1188                        $this->stream->unget();
1189                    }
1190                    if ($hyphens !== '--') {
1191                        $alpha = $this->stream->charsWhile(self::ALPHA, 7);
1192                    }
1193
1194                    /* If the next two characters are both U+002D HYPHEN-MINUS (-)
1195                    characters, consume those two characters, create a comment token whose
1196                    data is the empty string, and switch to the comment state. */
1197                    if ($hyphens === '--') {
1198                        $state = 'comment start';
1199                        $this->token = array(
1200                            'data' => '',
1201                            'type' => self::COMMENT
1202                        );
1203
1204                    /* Otherwise if the next seven characters are a case-insensitive match
1205                    for the word "DOCTYPE", then consume those characters and switch to the
1206                    DOCTYPE state. */
1207                    } elseif (strtoupper($alpha) === 'DOCTYPE') {
1208                        $state = 'DOCTYPE';
1209
1210                    // XXX not implemented
1211                    /* Otherwise, if the insertion mode is "in foreign content"
1212                    and the current node is not an element in the HTML namespace
1213                    and the next seven characters are an ASCII case-sensitive
1214                    match for the string "[CDATA[" (the five uppercase letters
1215                    "CDATA" with a U+005B LEFT SQUARE BRACKET character before
1216                    and after), then consume those characters and switch to the
1217                    CDATA section state (which is unrelated to the content model
1218                    flag's CDATA state). */
1219
1220                    /* Otherwise, this is a parse error. Switch to the bogus comment state.
1221                    The next character that is consumed, if any, is the first character
1222                    that will be in the comment. */
1223                    } else {
1224                        $this->emitToken(array(
1225                            'type' => self::PARSEERROR,
1226                            'data' => 'expected-dashes-or-doctype'
1227                        ));
1228                        $this->token = array(
1229                            'data' => (string) $alpha,
1230                            'type' => self::COMMENT
1231                        );
1232                        $state = 'bogus comment';
1233                    }
1234                break;
1235
1236                case 'comment start':
1237                    /* Consume the next input character: */
1238                    $char = $this->stream->char();
1239
1240                    if ($char === '-') {
1241                        /* U+002D HYPHEN-MINUS (-)
1242                        Switch to the comment start dash state. */
1243                        $state = 'comment start dash';
1244                    } elseif ($char === '>') {
1245                        /* U+003E GREATER-THAN SIGN (>)
1246                        Parse error. Emit the comment token. Switch to the
1247                        data state. */
1248                        $this->emitToken(array(
1249                            'type' => self::PARSEERROR,
1250                            'data' => 'incorrect-comment'
1251                        ));
1252                        $this->emitToken($this->token);
1253                        $state = 'data';
1254                    } elseif ($char === false) {
1255                        /* EOF
1256                        Parse error. Emit the comment token. Reconsume the
1257                        EOF character in the data state. */
1258                        $this->emitToken(array(
1259                            'type' => self::PARSEERROR,
1260                            'data' => 'eof-in-comment'
1261                        ));
1262                        $this->emitToken($this->token);
1263                        $this->stream->unget();
1264                        $state = 'data';
1265                    } else {
1266                        /* Anything else
1267                        Append the input character to the comment token's
1268                        data. Switch to the comment state. */
1269                        $this->token['data'] .= $char;
1270                        $state = 'comment';
1271                    }
1272                break;
1273
1274                case 'comment start dash':
1275                    /* Consume the next input character: */
1276                    $char = $this->stream->char();
1277                    if ($char === '-') {
1278                        /* U+002D HYPHEN-MINUS (-)
1279                        Switch to the comment end state */
1280                        $state = 'comment end';
1281                    } elseif ($char === '>') {
1282                        /* U+003E GREATER-THAN SIGN (>)
1283                        Parse error. Emit the comment token. Switch to the
1284                        data state. */
1285                        $this->emitToken(array(
1286                            'type' => self::PARSEERROR,
1287                            'data' => 'incorrect-comment'
1288                        ));
1289                        $this->emitToken($this->token);
1290                        $state = 'data';
1291                    } elseif ($char === false) {
1292                        /* Parse error. Emit the comment token. Reconsume the
1293                        EOF character in the data state. */
1294                        $this->emitToken(array(
1295                            'type' => self::PARSEERROR,
1296                            'data' => 'eof-in-comment'
1297                        ));
1298                        $this->emitToken($this->token);
1299                        $this->stream->unget();
1300                        $state = 'data';
1301                    } else {
1302                        $this->token['data'] .= '-' . $char;
1303                        $state = 'comment';
1304                    }
1305                break;
1306
1307                case 'comment':
1308                    /* Consume the next input character: */
1309                    $char = $this->stream->char();
1310
1311                    if ($char === '-') {
1312                        /* U+002D HYPHEN-MINUS (-)
1313                        Switch to the comment end dash state */
1314                        $state = 'comment end dash';
1315
1316                    } elseif ($char === false) {
1317                        /* EOF
1318                        Parse error. Emit the comment token. Reconsume the EOF character
1319                        in the data state. */
1320                        $this->emitToken(array(
1321                            'type' => self::PARSEERROR,
1322                            'data' => 'eof-in-comment'
1323                        ));
1324                        $this->emitToken($this->token);
1325                        $this->stream->unget();
1326                        $state = 'data';
1327
1328                    } else {
1329                        /* Anything else
1330                        Append the input character to the comment token's data. Stay in
1331                        the comment state. */
1332                        $chars = $this->stream->charsUntil('-');
1333
1334                        $this->token['data'] .= $char . $chars;
1335                    }
1336                break;
1337
1338                case 'comment end dash':
1339                    /* Consume the next input character: */
1340                    $char = $this->stream->char();
1341
1342                    if ($char === '-') {
1343                        /* U+002D HYPHEN-MINUS (-)
1344                        Switch to the comment end state  */
1345                        $state = 'comment end';
1346
1347                    } elseif ($char === false) {
1348                        /* EOF
1349                        Parse error. Emit the comment token. Reconsume the EOF character
1350                        in the data state. */
1351                        $this->emitToken(array(
1352                            'type' => self::PARSEERROR,
1353                            'data' => 'eof-in-comment-end-dash'
1354                        ));
1355                        $this->emitToken($this->token);
1356                        $this->stream->unget();
1357                        $state = 'data';
1358
1359                    } else {
1360                        /* Anything else
1361                        Append a U+002D HYPHEN-MINUS (-) character and the input
1362                        character to the comment token's data. Switch to the comment state. */
1363                        $this->token['data'] .= '-'.$char;
1364                        $state = 'comment';
1365                    }
1366                break;
1367
1368                case 'comment end':
1369                    /* Consume the next input character: */
1370                    $char = $this->stream->char();
1371
1372                    if ($char === '>') {
1373                        /* U+003E GREATER-THAN SIGN (>)
1374                        Emit the comment token. Switch to the data state. */
1375                        $this->emitToken($this->token);
1376                        $state = 'data';
1377
1378                    } elseif ($char === '-') {
1379                        /* U+002D HYPHEN-MINUS (-)
1380                        Parse error. Append a U+002D HYPHEN-MINUS (-) character
1381                        to the comment token's data. Stay in the comment end
1382                        state. */
1383                        $this->emitToken(array(
1384                            'type' => self::PARSEERROR,
1385                            'data' => 'unexpected-dash-after-double-dash-in-comment'
1386                        ));
1387                        $this->token['data'] .= '-';
1388
1389                    } elseif ($char === "\t" || $char === "\n" || $char === "\x0a" || $char === ' ') {
1390                        $this->emitToken(array(
1391                            'type' => self::PARSEERROR,
1392                            'data' => 'unexpected-space-after-double-dash-in-comment'
1393                        ));
1394                        $this->token['data'] .= '--' . $char;
1395                        $state = 'comment end space';
1396
1397                    } elseif ($char === '!') {
1398                        $this->emitToken(array(
1399                            'type' => self::PARSEERROR,
1400                            'data' => 'unexpected-bang-after-double-dash-in-comment'
1401                        ));
1402                        $state = 'comment end bang';
1403
1404                    } elseif ($char === false) {
1405                        /* EOF
1406                        Parse error. Emit the comment token. Reconsume the
1407                        EOF character in the data state. */
1408                        $this->emitToken(array(
1409                            'type' => self::PARSEERROR,
1410                            'data' => 'eof-in-comment-double-dash'
1411                        ));
1412                        $this->emitToken($this->token);
1413                        $this->stream->unget();
1414                        $state = 'data';
1415
1416                    } else {
1417                        /* Anything else
1418                        Parse error. Append two U+002D HYPHEN-MINUS (-)
1419                        characters and the input character to the comment token's
1420                        data. Switch to the comment state. */
1421                        $this->emitToken(array(
1422                            'type' => self::PARSEERROR,
1423                            'data' => 'unexpected-char-in-comment'
1424                        ));
1425                        $this->token['data'] .= '--'.$char;
1426                        $state = 'comment';
1427                    }
1428                break;
1429
1430                case 'comment end bang':
1431                    $char = $this->stream->char();
1432                    if ($char === '>') {
1433                        $this->emitToken($this->token);
1434                        $state = 'data';
1435                    } elseif ($char === "-") {
1436                        $this->token['data'] .= '--!';
1437                        $state = 'comment end dash';
1438                    } elseif ($char === false) {
1439                        $this->emitToken(array(
1440                            'type' => self::PARSEERROR,
1441                            'data' => 'eof-in-comment-end-bang'
1442                        ));
1443                        $this->emitToken($this->token);
1444                        $this->stream->unget();
1445                        $state = 'data';
1446                    } else {
1447                        $this->token['data'] .= '--!' . $char;
1448                        $state = 'comment';
1449                    }
1450                break;
1451
1452                case 'comment end space':
1453                    $char = $this->stream->char();
1454                    if ($char === '>') {
1455                        $this->emitToken($this->token);
1456                        $state = 'data';
1457                    } elseif ($char === '-') {
1458                        $state = 'comment end dash';
1459                    } elseif ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1460                        $this->token['data'] .= $char;
1461                    } elseif ($char === false) {
1462                        $this->emitToken(array(
1463                            'type' => self::PARSEERROR,
1464                            'data' => 'unexpected-eof-in-comment-end-space',
1465                        ));
1466                        $this->emitToken($this->token);
1467                        $this->stream->unget();
1468                        $state = 'data';
1469                    } else {
1470                        $this->token['data'] .= $char;
1471                        $state = 'comment';
1472                    }
1473                break;
1474
1475                case 'DOCTYPE':
1476                    /* Consume the next input character: */
1477                    $char = $this->stream->char();
1478
1479                    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1480                        /* U+0009 CHARACTER TABULATION
1481                           U+000A LINE FEED (LF)
1482                           U+000C FORM FEED (FF)
1483                           U+0020 SPACE
1484                        Switch to the before DOCTYPE name state. */
1485                        $state = 'before DOCTYPE name';
1486
1487                    } elseif ($char === false) {
1488                        /* EOF
1489                        Parse error. Create a new DOCTYPE token. Set its
1490                        force-quirks flag to on. Emit the token. Reconsume the
1491                        EOF character in the data state. */
1492                        $this->emitToken(array(
1493                            'type' => self::PARSEERROR,
1494                            'data' => 'need-space-after-doctype-but-got-eof'
1495                        ));
1496                        $this->emitToken(array(
1497                            'name' => '',
1498                            'type' => self::DOCTYPE,
1499                            'force-quirks' => true,
1500                            'error' => true
1501                        ));
1502                        $this->stream->unget();
1503                        $state = 'data';
1504
1505                    } else {
1506                        /* Anything else
1507                        Parse error. Reconsume the current character in the
1508                        before DOCTYPE name state. */
1509                        $this->emitToken(array(
1510                            'type' => self::PARSEERROR,
1511                            'data' => 'need-space-after-doctype'
1512                        ));
1513                        $this->stream->unget();
1514                        $state = 'before DOCTYPE name';
1515                    }
1516                break;
1517
1518                case 'before DOCTYPE name':
1519                    /* Consume the next input character: */
1520                    $char = $this->stream->char();
1521
1522                    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1523                        /* U+0009 CHARACTER TABULATION
1524                           U+000A LINE FEED (LF)
1525                           U+000C FORM FEED (FF)
1526                           U+0020 SPACE
1527                        Stay in the before DOCTYPE name state. */
1528
1529                    } elseif ($char === '>') {
1530                        /* U+003E GREATER-THAN SIGN (>)
1531                        Parse error. Create a new DOCTYPE token. Set its
1532                        force-quirks flag to on. Emit the token. Switch to the
1533                        data state. */
1534                        $this->emitToken(array(
1535                            'type' => self::PARSEERROR,
1536                            'data' => 'expected-doctype-name-but-got-right-bracket'
1537                        ));
1538                        $this->emitToken(array(
1539                            'name' => '',
1540                            'type' => self::DOCTYPE,
1541                            'force-quirks' => true,
1542                            'error' => true
1543                        ));
1544
1545                        $state = 'data';
1546
1547                    } elseif ('A' <= $char && $char <= 'Z') {
1548                        /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
1549                        Create a new DOCTYPE token. Set the token's name to the
1550                        lowercase version of the input character (add 0x0020 to
1551                        the character's code point). Switch to the DOCTYPE name
1552                        state. */
1553                        $this->token = array(
1554                            'name' => strtolower($char),
1555                            'type' => self::DOCTYPE,
1556                            'error' => true
1557                        );
1558
1559                        $state = 'DOCTYPE name';
1560
1561                    } elseif ($char === false) {
1562                        /* EOF
1563                        Parse error. Create a new DOCTYPE token. Set its
1564                        force-quirks flag to on. Emit the token. Reconsume the
1565                        EOF character in the data state. */
1566                        $this->emitToken(array(
1567                            'type' => self::PARSEERROR,
1568                            'data' => 'expected-doctype-name-but-got-eof'
1569                        ));
1570                        $this->emitToken(array(
1571                            'name' => '',
1572                            'type' => self::DOCTYPE,
1573                            'force-quirks' => true,
1574                            'error' => true
1575                        ));
1576
1577                        $this->stream->unget();
1578                        $state = 'data';
1579
1580                    } else {
1581                        /* Anything else
1582                        Create a new DOCTYPE token. Set the token's name to the
1583                        current input character. Switch to the DOCTYPE name state. */
1584                        $this->token = array(
1585                            'name' => $char,
1586                            'type' => self::DOCTYPE,
1587                            'error' => true
1588                        );
1589
1590                        $state = 'DOCTYPE name';
1591                    }
1592                break;
1593
1594                case 'DOCTYPE name':
1595                    /* Consume the next input character: */
1596                    $char = $this->stream->char();
1597
1598                    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1599                        /* U+0009 CHARACTER TABULATION
1600                           U+000A LINE FEED (LF)
1601                           U+000C FORM FEED (FF)
1602                           U+0020 SPACE
1603                        Switch to the after DOCTYPE name state. */
1604                        $state = 'after DOCTYPE name';
1605
1606                    } elseif ($char === '>') {
1607                        /* U+003E GREATER-THAN SIGN (>)
1608                        Emit the current DOCTYPE token. Switch to the data state. */
1609                        $this->emitToken($this->token);
1610                        $state = 'data';
1611
1612                    } elseif ('A' <= $char && $char <= 'Z') {
1613                        /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
1614                        Append the lowercase version of the input character
1615                        (add 0x0020 to the character's code point) to the current
1616                        DOCTYPE token's name. Stay in the DOCTYPE name state. */
1617                        $this->token['name'] .= strtolower($char);
1618
1619                    } elseif ($char === false) {
1620                        /* EOF
1621                        Parse error. Set the DOCTYPE token's force-quirks flag
1622                        to on. Emit that DOCTYPE token. Reconsume the EOF
1623                        character in the data state. */
1624                        $this->emitToken(array(
1625                            'type' => self::PARSEERROR,
1626                            'data' => 'eof-in-doctype-name'
1627                        ));
1628                        $this->token['force-quirks'] = true;
1629                        $this->emitToken($this->token);
1630                        $this->stream->unget();
1631                        $state = 'data';
1632
1633                    } else {
1634                        /* Anything else
1635                        Append the current input character to the current
1636                        DOCTYPE token's name. Stay in the DOCTYPE name state. */
1637                        $this->token['name'] .= $char;
1638                    }
1639
1640                    // XXX this is probably some sort of quirks mode designation,
1641                    // check tree-builder to be sure. In general 'error' needs
1642                    // to be specc'ified, this probably means removing it at the end
1643                    $this->token['error'] = ($this->token['name'] === 'HTML')
1644                        ? false
1645                        : true;
1646                break;
1647
1648                case 'after DOCTYPE name':
1649                    /* Consume the next input character: */
1650                    $char = $this->stream->char();
1651
1652                    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1653                        /* U+0009 CHARACTER TABULATION
1654                           U+000A LINE FEED (LF)
1655                           U+000C FORM FEED (FF)
1656                           U+0020 SPACE
1657                        Stay in the after DOCTYPE name state. */
1658
1659                    } elseif ($char === '>') {
1660                        /* U+003E GREATER-THAN SIGN (>)
1661                        Emit the current DOCTYPE token. Switch to the data state. */
1662                        $this->emitToken($this->token);
1663                        $state = 'data';
1664
1665                    } elseif ($char === false) {
1666                        /* EOF
1667                        Parse error. Set the DOCTYPE token's force-quirks flag
1668                        to on. Emit that DOCTYPE token. Reconsume the EOF
1669                        character in the data state. */
1670                        $this->emitToken(array(
1671                            'type' => self::PARSEERROR,
1672                            'data' => 'eof-in-doctype'
1673                        ));
1674                        $this->token['force-quirks'] = true;
1675                        $this->emitToken($this->token);
1676                        $this->stream->unget();
1677                        $state = 'data';
1678
1679                    } else {
1680                        /* Anything else */
1681
1682                        $nextSix = strtoupper($char . $this->stream->charsWhile(self::ALPHA, 5));
1683                        if ($nextSix === 'PUBLIC') {
1684                            /* If the next six characters are an ASCII
1685                            case-insensitive match for the word "PUBLIC", then
1686                            consume those characters and switch to the before
1687                            DOCTYPE public identifier state. */
1688                            $state = 'before DOCTYPE public identifier';
1689
1690                        } elseif ($nextSix === 'SYSTEM') {
1691                            /* Otherwise, if the next six characters are an ASCII
1692                            case-insensitive match for the word "SYSTEM", then
1693                            consume those characters and switch to the before
1694                            DOCTYPE system identifier state. */
1695                            $state = 'before DOCTYPE system identifier';
1696
1697                        } else {
1698                            /* Otherwise, this is the parse error. Set the DOCTYPE
1699                            token's force-quirks flag to on. Switch to the bogus
1700                            DOCTYPE state. */
1701                            $this->emitToken(array(
1702                                'type' => self::PARSEERROR,
1703                                'data' => 'expected-space-or-right-bracket-in-doctype'
1704                            ));
1705                            $this->token['force-quirks'] = true;
1706                            $this->token['error'] = true;
1707                            $state = 'bogus DOCTYPE';
1708                        }
1709                    }
1710                break;
1711
1712                case 'before DOCTYPE public identifier':
1713                    /* Consume the next input character: */
1714                    $char = $this->stream->char();
1715
1716                    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1717                        /* U+0009 CHARACTER TABULATION
1718                           U+000A LINE FEED (LF)
1719                           U+000C FORM FEED (FF)
1720                           U+0020 SPACE
1721                        Stay in the before DOCTYPE public identifier state. */
1722                    } elseif ($char === '"') {
1723                        /* U+0022 QUOTATION MARK (")
1724                        Set the DOCTYPE token's public identifier to the empty
1725                        string (not missing), then switch to the DOCTYPE public
1726                        identifier (double-quoted) state. */
1727                        $this->token['public'] = '';
1728                        $state = 'DOCTYPE public identifier (double-quoted)';
1729                    } elseif ($char === "'") {
1730                        /* U+0027 APOSTROPHE (')
1731                        Set the DOCTYPE token's public identifier to the empty
1732                        string (not missing), then switch to the DOCTYPE public
1733                        identifier (single-quoted) state. */
1734                        $this->token['public'] = '';
1735                        $state = 'DOCTYPE public identifier (single-quoted)';
1736                    } elseif ($char === '>') {
1737                        /* Parse error. Set the DOCTYPE token's force-quirks flag
1738                        to on. Emit that DOCTYPE token. Switch to the data state. */
1739                        $this->emitToken(array(
1740                            'type' => self::PARSEERROR,
1741                            'data' => 'unexpected-end-of-doctype'
1742                        ));
1743                        $this->token['force-quirks'] = true;
1744                        $this->emitToken($this->token);
1745                        $state = 'data';
1746                    } elseif ($char === false) {
1747                        /* Parse error. Set the DOCTYPE token's force-quirks
1748                        flag to on. Emit that DOCTYPE token. Reconsume the EOF
1749                        character in the data state. */
1750                        $this->emitToken(array(
1751                            'type' => self::PARSEERROR,
1752                            'data' => 'eof-in-doctype'
1753                        ));
1754                        $this->token['force-quirks'] = true;
1755                        $this->emitToken($this->token);
1756                        $this->stream->unget();
1757                        $state = 'data';
1758                    } else {
1759                        /* Parse error. Set the DOCTYPE token's force-quirks flag
1760                        to on. Switch to the bogus DOCTYPE state. */
1761                        $this->emitToken(array(
1762                            'type' => self::PARSEERROR,
1763                            'data' => 'unexpected-char-in-doctype'
1764                        ));
1765                        $this->token['force-quirks'] = true;
1766                        $state = 'bogus DOCTYPE';
1767                    }
1768                break;
1769
1770                case 'DOCTYPE public identifier (double-quoted)':
1771                    /* Consume the next input character: */
1772                    $char = $this->stream->char();
1773
1774                    if ($char === '"') {
1775                        /* U+0022 QUOTATION MARK (")
1776                        Switch to the after DOCTYPE public identifier state. */
1777                        $state = 'after DOCTYPE public identifier';
1778                    } elseif ($char === '>') {
1779                        /* U+003E GREATER-THAN SIGN (>)
1780                        Parse error. Set the DOCTYPE token's force-quirks flag
1781                        to on. Emit that DOCTYPE token. Switch to the data state. */
1782                        $this->emitToken(array(
1783                            'type' => self::PARSEERROR,
1784                            'data' => 'unexpected-end-of-doctype'
1785                        ));
1786                        $this->token['force-quirks'] = true;
1787                        $this->emitToken($this->token);
1788                        $state = 'data';
1789                    } elseif ($char === false) {
1790                        /* EOF
1791                        Parse error. Set the DOCTYPE token's force-quirks flag
1792                        to on. Emit that DOCTYPE token. Reconsume the EOF
1793                        character in the data state. */
1794                        $this->emitToken(array(
1795                            'type' => self::PARSEERROR,
1796                            'data' => 'eof-in-doctype'
1797                        ));
1798                        $this->token['force-quirks'] = true;
1799                        $this->emitToken($this->token);
1800                        $this->stream->unget();
1801                        $state = 'data';
1802                    } else {
1803                        /* Anything else
1804                        Append the current input character to the current
1805                        DOCTYPE token's public identifier. Stay in the DOCTYPE
1806                        public identifier (double-quoted) state. */
1807                        $this->token['public'] .= $char;
1808                    }
1809                break;
1810
1811                case 'DOCTYPE public identifier (single-quoted)':
1812                    /* Consume the next input character: */
1813                    $char = $this->stream->char();
1814
1815                    if ($char === "'") {
1816                        /* U+0027 APOSTROPHE (')
1817                        Switch to the after DOCTYPE public identifier state. */
1818                        $state = 'after DOCTYPE public identifier';
1819                    } elseif ($char === '>') {
1820                        /* U+003E GREATER-THAN SIGN (>)
1821                        Parse error. Set the DOCTYPE token's force-quirks flag
1822                        to on. Emit that DOCTYPE token. Switch to the data state. */
1823                        $this->emitToken(array(
1824                            'type' => self::PARSEERROR,
1825                            'data' => 'unexpected-end-of-doctype'
1826                        ));
1827                        $this->token['force-quirks'] = true;
1828                        $this->emitToken($this->token);
1829                        $state = 'data';
1830                    } elseif ($char === false) {
1831                        /* EOF
1832                        Parse error. Set the DOCTYPE token's force-quirks flag
1833                        to on. Emit that DOCTYPE token. Reconsume the EOF
1834                        character in the data state. */
1835                        $this->emitToken(array(
1836                            'type' => self::PARSEERROR,
1837                            'data' => 'eof-in-doctype'
1838                        ));
1839                        $this->token['force-quirks'] = true;
1840                        $this->emitToken($this->token);
1841                        $this->stream->unget();
1842                        $state = 'data';
1843                    } else {
1844                        /* Anything else
1845                        Append the current input character to the current
1846                        DOCTYPE token's public identifier. Stay in the DOCTYPE
1847                        public identifier (single-quoted) state. */
1848                        $this->token['public'] .= $char;
1849                    }
1850                break;
1851
1852                case 'after DOCTYPE public identifier':
1853                    /* Consume the next input character: */
1854                    $char = $this->stream->char();
1855
1856                    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1857                        /* U+0009 CHARACTER TABULATION
1858                           U+000A LINE FEED (LF)
1859                           U+000C FORM FEED (FF)
1860                           U+0020 SPACE
1861                        Stay in the after DOCTYPE public identifier state. */
1862                    } elseif ($char === '"') {
1863                        /* U+0022 QUOTATION MARK (")
1864                        Set the DOCTYPE token's system identifier to the
1865                        empty string (not missing), then switch to the DOCTYPE
1866                        system identifier (double-quoted) state. */
1867                        $this->token['system'] = '';
1868                        $state = 'DOCTYPE system identifier (double-quoted)';
1869                    } elseif ($char === "'") {
1870                        /* U+0027 APOSTROPHE (')
1871                        Set the DOCTYPE token's system identifier to the
1872                        empty string (not missing), then switch to the DOCTYPE
1873                        system identifier (single-quoted) state. */
1874                        $this->token['system'] = '';
1875                        $state = 'DOCTYPE system identifier (single-quoted)';
1876                    } elseif ($char === '>') {
1877                        /* U+003E GREATER-THAN SIGN (>)
1878                        Emit the current DOCTYPE token. Switch to the data state. */
1879                        $this->emitToken($this->token);
1880                        $state = 'data';
1881                    } elseif ($char === false) {
1882                        /* Parse error. Set the DOCTYPE token's force-quirks
1883                        flag to on. Emit that DOCTYPE token. Reconsume the EOF
1884                        character in the data state. */
1885                        $this->emitToken(array(
1886                            'type' => self::PARSEERROR,
1887                            'data' => 'eof-in-doctype'
1888                        ));
1889                        $this->token['force-quirks'] = true;
1890                        $this->emitToken($this->token);
1891                        $this->stream->unget();
1892                        $state = 'data';
1893                    } else {
1894                        /* Anything else
1895                        Parse error. Set the DOCTYPE token's force-quirks flag
1896                        to on. Switch to the bogus DOCTYPE state. */
1897                        $this->emitToken(array(
1898                            'type' => self::PARSEERROR,
1899                            'data' => 'unexpected-char-in-doctype'
1900                        ));
1901                        $this->token['force-quirks'] = true;
1902                        $state = 'bogus DOCTYPE';
1903                    }
1904                break;
1905
1906                case 'before DOCTYPE system identifier':
1907                    /* Consume the next input character: */
1908                    $char = $this->stream->char();
1909
1910                    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1911                        /* U+0009 CHARACTER TABULATION
1912                           U+000A LINE FEED (LF)
1913                           U+000C FORM FEED (FF)
1914                           U+0020 SPACE
1915                        Stay in the before DOCTYPE system identifier state. */
1916                    } elseif ($char === '"') {
1917                        /* U+0022 QUOTATION MARK (")
1918                        Set the DOCTYPE token's system identifier to the empty
1919                        string (not missing), then switch to the DOCTYPE system
1920                        identifier (double-quoted) state. */
1921                        $this->token['system'] = '';
1922                        $state = 'DOCTYPE system identifier (double-quoted)';
1923                    } elseif ($char === "'") {
1924                        /* U+0027 APOSTROPHE (')
1925                        Set the DOCTYPE token's system identifier to the empty
1926                        string (not missing), then switch to the DOCTYPE system
1927                        identifier (single-quoted) state. */
1928                        $this->token['system'] = '';
1929                        $state = 'DOCTYPE system identifier (single-quoted)';
1930                    } elseif ($char === '>') {
1931                        /* Parse error. Set the DOCTYPE token's force-quirks flag
1932                        to on. Emit that DOCTYPE token. Switch to the data state. */
1933                        $this->emitToken(array(
1934                            'type' => self::PARSEERROR,
1935                            'data' => 'unexpected-char-in-doctype'
1936                        ));
1937                        $this->token['force-quirks'] = true;
1938                        $this->emitToken($this->token);
1939                        $state = 'data';
1940                    } elseif ($char === false) {
1941                        /* Parse error. Set the DOCTYPE token's force-quirks
1942                        flag to on. Emit that DOCTYPE token. Reconsume the EOF
1943                        character in the data state. */
1944                        $this->emitToken(array(
1945                            'type' => self::PARSEERROR,
1946                            'data' => 'eof-in-doctype'
1947                        ));
1948                        $this->token['force-quirks'] = true;
1949                        $this->emitToken($this->token);
1950                        $this->stream->unget();
1951                        $state = 'data';
1952                    } else {
1953                        /* Parse error. Set the DOCTYPE token's force-quirks flag
1954                        to on. Switch to the bogus DOCTYPE state. */
1955                        $this->emitToken(array(
1956                            'type' => self::PARSEERROR,
1957                            'data' => 'unexpected-char-in-doctype'
1958                        ));
1959                        $this->token['force-quirks'] = true;
1960                        $state = 'bogus DOCTYPE';
1961                    }
1962                break;
1963
1964                case 'DOCTYPE system identifier (double-quoted)':
1965                    /* Consume the next input character: */
1966                    $char = $this->stream->char();
1967
1968                    if ($char === '"') {
1969                        /* U+0022 QUOTATION MARK (")
1970                        Switch to the after DOCTYPE system identifier state. */
1971                        $state = 'after DOCTYPE system identifier';
1972                    } elseif ($char === '>') {
1973                        /* U+003E GREATER-THAN SIGN (>)
1974                        Parse error. Set the DOCTYPE token's force-quirks flag
1975                        to on. Emit that DOCTYPE token. Switch to the data state. */
1976                        $this->emitToken(array(
1977                            'type' => self::PARSEERROR,
1978                            'data' => 'unexpected-end-of-doctype'
1979                        ));
1980                        $this->token['force-quirks'] = true;
1981                        $this->emitToken($this->token);
1982                        $state = 'data';
1983                    } elseif ($char === false) {
1984                        /* EOF
1985                        Parse error. Set the DOCTYPE token's force-quirks flag
1986                        to on. Emit that DOCTYPE token. Reconsume the EOF
1987                        character in the data state. */
1988                        $this->emitToken(array(
1989                            'type' => self::PARSEERROR,
1990                            'data' => 'eof-in-doctype'
1991                        ));
1992                        $this->token['force-quirks'] = true;
1993                        $this->emitToken($this->token);
1994                        $this->stream->unget();
1995                        $state = 'data';
1996                    } else {
1997                        /* Anything else
1998                        Append the current input character to the current
1999                        DOCTYPE token's system identifier. Stay in the DOCTYPE
2000                        system identifier (double-quoted) state. */
2001                        $this->token['system'] .= $char;
2002                    }
2003                break;
2004
2005                case 'DOCTYPE system identifier (single-quoted)':
2006                    /* Consume the next input character: */
2007                    $char = $this->stream->char();
2008
2009                    if ($char === "'") {
2010                        /* U+0027 APOSTROPHE (')
2011                        Switch to the after DOCTYPE system identifier state. */
2012                        $state = 'after DOCTYPE system identifier';
2013                    } elseif ($char === '>') {
2014                        /* U+003E GREATER-THAN SIGN (>)
2015                        Parse error. Set the DOCTYPE token's force-quirks flag
2016                        to on. Emit that DOCTYPE token. Switch to the data state. */
2017                        $this->emitToken(array(
2018                            'type' => self::PARSEERROR,
2019                            'data' => 'unexpected-end-of-doctype'
2020                        ));
2021                        $this->token['force-quirks'] = true;
2022                        $this->emitToken($this->token);
2023                        $state = 'data';
2024                    } elseif ($char === false) {
2025                        /* EOF
2026                        Parse error. Set the DOCTYPE token's force-quirks flag
2027                        to on. Emit that DOCTYPE token. Reconsume the EOF
2028                        character in the data state. */
2029                        $this->emitToken(array(
2030                            'type' => self::PARSEERROR,
2031                            'data' => 'eof-in-doctype'
2032                        ));
2033                        $this->token['force-quirks'] = true;
2034                        $this->emitToken($this->token);
2035                        $this->stream->unget();
2036                        $state = 'data';
2037                    } else {
                        /* Anything else
                        Append the current input character to the current
                        DOCTYPE token's system identifier. Stay in the DOCTYPE
                        system identifier (single-quoted) state. */
2042                        $this->token['system'] .= $char;
2043                    }
2044                break;
2045
2046                case 'after DOCTYPE system identifier':
2047                    /* Consume the next input character: */
2048                    $char = $this->stream->char();
2049
2050                    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
2051                        /* U+0009 CHARACTER TABULATION
2052                           U+000A LINE FEED (LF)
2053                           U+000C FORM FEED (FF)
2054                           U+0020 SPACE
2055                        Stay in the after DOCTYPE system identifier state. */
2056                    } elseif ($char === '>') {
2057                        /* U+003E GREATER-THAN SIGN (>)
2058                        Emit the current DOCTYPE token. Switch to the data state. */
2059                        $this->emitToken($this->token);
2060                        $state = 'data';
2061                    } elseif ($char === false) {
2062                        /* Parse error. Set the DOCTYPE token's force-quirks
2063                        flag to on. Emit that DOCTYPE token. Reconsume the EOF
2064                        character in the data state. */
2065                        $this->emitToken(array(
2066                            'type' => self::PARSEERROR,
2067                            'data' => 'eof-in-doctype'
2068                        ));
2069                        $this->token['force-quirks'] = true;
2070                        $this->emitToken($this->token);
2071                        $this->stream->unget();
2072                        $state = 'data';
2073                    } else {
2074                        /* Anything else
2075                        Parse error. Switch to the bogus DOCTYPE state.
2076                        (This does not set the DOCTYPE token's force-quirks
2077                        flag to on.) */
2078                        $this->emitToken(array(
2079                            'type' => self::PARSEERROR,
2080                            'data' => 'unexpected-char-in-doctype'
2081                        ));
2082                        $state = 'bogus DOCTYPE';
2083                    }
2084                break;
2085
2086                case 'bogus DOCTYPE':
2087                    /* Consume the next input character: */
2088                    $char = $this->stream->char();
2089
2090                    if ($char === '>') {
2091                        /* U+003E GREATER-THAN SIGN (>)
2092                        Emit the DOCTYPE token. Switch to the data state. */
2093                        $this->emitToken($this->token);
2094                        $state = 'data';
2095
2096                    } elseif ($char === false) {
2097                        /* EOF
2098                        Emit the DOCTYPE token. Reconsume the EOF character in
2099                        the data state. */
2100                        $this->emitToken($this->token);
2101                        $this->stream->unget();
2102                        $state = 'data';
2103
2104                    } else {
2105                        /* Anything else
2106                        Stay in the bogus DOCTYPE state. */
2107                    }
2108                break;
2109
2110                // case 'cdataSection':
2111            }
2112        }
2113    }
2114
2115    /**
2116     * Returns a serialized representation of the tree.
2117     *
2118     * @return DOMDocument|DOMNodeList
2119     */
2120    public function save() {
2121        return $this->tree->save();
2122    }
2123
2124    /**
2125     * @return HTML5_TreeBuilder The tree
2126     */
2127    public function getTree()
2128    {
2129        return $this->tree;
2130    }
2131
2132
2133    /**
2134     * Returns the input stream.
2135     *
2136     * @return HTML5_InputStream
2137     */
2138    public function stream() {
2139        return $this->stream;
2140    }
2141
2142    /**
2143     * @param bool $allowed
2144     * @param bool $inattr
2145     * @return string
2146     */
2147    private function consumeCharacterReference($allowed = false, $inattr = false) {
2148        // This goes quite far against spec, and is far closer to the Python
2149        // impl., mainly because we don't do the large unconsuming the spec
2150        // requires.
2151
2152        // All consumed characters.
2153        $chars = $this->stream->char();
2154
2155        /* This section defines how to consume a character
2156        reference. This definition is used when parsing character
2157        references in text and in attributes.
2158
2159        The behavior depends on the identity of the next character
2160        (the one immediately after the U+0026 AMPERSAND character): */
2161
2162        if (
2163            $chars[0] === "\x09" ||
2164            $chars[0] === "\x0A" ||
2165            $chars[0] === "\x0C" ||
2166            $chars[0] === "\x20" ||
2167            $chars[0] === '<' ||
2168            $chars[0] === '&' ||
2169            $chars === false ||
2170            $chars[0] === $allowed
2171        ) {
2172            /* U+0009 CHARACTER TABULATION
2173               U+000A LINE FEED (LF)
2174               U+000C FORM FEED (FF)
2175               U+0020 SPACE
2176               U+003C LESS-THAN SIGN
2177               U+0026 AMPERSAND
2178               EOF
2179               The additional allowed character, if there is one
2180            Not a character reference. No characters are consumed,
2181            and nothing is returned. (This is not an error, either.) */
2182            // We already consumed, so unconsume.
2183            $this->stream->unget();
2184            return '&';
2185        } elseif ($chars[0] === '#') {
2186            /* Consume the U+0023 NUMBER SIGN. */
2187            // Um, yeah, we already did that.
2188            /* The behavior further depends on the character after
2189            the U+0023 NUMBER SIGN: */
2190            $chars .= $this->stream->char();
2191            if (isset($chars[1]) && ($chars[1] === 'x' || $chars[1] === 'X')) {
2192                /* U+0078 LATIN SMALL LETTER X
2193                   U+0058 LATIN CAPITAL LETTER X */
2194                /* Consume the X. */
2195                // Um, yeah, we already did that.
2196                /* Follow the steps below, but using the range of
2197                characters U+0030 DIGIT ZERO through to U+0039 DIGIT
2198                NINE, U+0061 LATIN SMALL LETTER A through to U+0066
2199                LATIN SMALL LETTER F, and U+0041 LATIN CAPITAL LETTER
2200                A, through to U+0046 LATIN CAPITAL LETTER F (in other
2201                words, 0123456789, ABCDEF, abcdef). */
2202                $char_class = self::HEX;
2203                /* When it comes to interpreting the
2204                number, interpret it as a hexadecimal number. */
2205                $hex = true;
2206            } else {
2207                /* Anything else */
2208                // Unconsume because we shouldn't have consumed this.
2209                $chars = $chars[0];
2210                $this->stream->unget();
2211                /* Follow the steps below, but using the range of
2212                characters U+0030 DIGIT ZERO through to U+0039 DIGIT
2213                NINE (i.e. just 0123456789). */
2214                $char_class = self::DIGIT;
2215                /* When it comes to interpreting the number,
2216                interpret it as a decimal number. */
2217                $hex = false;
2218            }
2219
2220            /* Consume as many characters as match the range of characters given above. */
2221            $consumed = $this->stream->charsWhile($char_class);
2222            if ($consumed === '' || $consumed === false) {
2223                /* If no characters match the range, then don't consume
2224                any characters (and unconsume the U+0023 NUMBER SIGN
2225                character and, if appropriate, the X character). This
2226                is a parse error; nothing is returned. */
2227                $this->emitToken(array(
2228                    'type' => self::PARSEERROR,
2229                    'data' => 'expected-numeric-entity'
2230                ));
2231                return '&' . $chars;
2232            } else {
2233                /* Otherwise, if the next character is a U+003B SEMICOLON,
2234                consume that too. If it isn't, there is a parse error. */
2235                if ($this->stream->char() !== ';') {
2236                    $this->stream->unget();
2237                    $this->emitToken(array(
2238                        'type' => self::PARSEERROR,
2239                        'data' => 'numeric-entity-without-semicolon'
2240                    ));
2241                }
2242
2243                /* If one or more characters match the range, then take
2244                them all and interpret the string of characters as a number
2245                (either hexadecimal or decimal as appropriate). */
2246                $codepoint = $hex ? hexdec($consumed) : (int) $consumed;
2247
2248                /* If that number is one of the numbers in the first column
2249                of the following table, then this is a parse error. Find the
2250                row with that number in the first column, and return a
2251                character token for the Unicode character given in the
2252                second column of that row. */
2253                $new_codepoint = HTML5_Data::getRealCodepoint($codepoint);
2254                if ($new_codepoint) {
2255                    $this->emitToken(array(
2256                        'type' => self::PARSEERROR,
2257                        'data' => 'illegal-windows-1252-entity'
2258                    ));
2259                    return HTML5_Data::utf8chr($new_codepoint);
2260                } else {
2261                    /* Otherwise, if the number is greater than 0x10FFFF, then
2262                     * this is a parse error. Return a U+FFFD REPLACEMENT
2263                     * CHARACTER. */
2264                    if ($codepoint > 0x10FFFF) {
2265                        $this->emitToken(array(
2266                            'type' => self::PARSEERROR,
2267                            'data' => 'overlong-character-entity' // XXX probably not correct
2268                        ));
2269                        return "\xEF\xBF\xBD";
2270                    }
2271                    /* Otherwise, return a character token for the Unicode
2272                     * character whose code point is that number.  If the
2273                     * number is in the range 0x0001 to 0x0008,    0x000E to
2274                     * 0x001F,  0x007F  to 0x009F, 0xD800 to 0xDFFF, 0xFDD0 to
2275                     * 0xFDEF, or is one of 0x000B, 0xFFFE, 0xFFFF, 0x1FFFE,
2276                     * 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, 0x4FFFE,
2277                     * 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
2278                     * 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE,
2279                     * 0xAFFFF, 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
2280                     * 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE,
2281                     * or 0x10FFFF, then this is a parse error. */
2282                    // && has higher precedence than ||
2283                    if (
2284                        $codepoint >= 0x0000 && $codepoint <= 0x0008 ||
2285                        $codepoint === 0x000B ||
2286                        $codepoint >= 0x000E && $codepoint <= 0x001F ||
2287                        $codepoint >= 0x007F && $codepoint <= 0x009F ||
2288                        $codepoint >= 0xD800 && $codepoint <= 0xDFFF ||
2289                        $codepoint >= 0xFDD0 && $codepoint <= 0xFDEF ||
2290                        ($codepoint & 0xFFFE) === 0xFFFE ||
2291                        $codepoint == 0x10FFFF || $codepoint == 0x10FFFE
2292                    ) {
2293                        $this->emitToken(array(
2294                            'type' => self::PARSEERROR,
2295                            'data' => 'illegal-codepoint-for-numeric-entity'
2296                        ));
2297                    }
2298                    return HTML5_Data::utf8chr($codepoint);
2299                }
2300            }
2301        } else {
2302            /* Anything else */
2303
2304            /* Consume the maximum number of characters possible,
2305            with the consumed characters matching one of the
2306            identifiers in the first column of the named character
2307            references table (in a case-sensitive manner). */
2308            // What we actually do here is consume as much as we can while it
2309            // matches the start of one of the identifiers in the first column.
2310
2311            $refs = HTML5_Data::getNamedCharacterReferences();
2312
2313            // Get the longest string which is the start of an identifier
2314            // ($chars) as well as the longest identifier which matches ($id)
2315            // and its codepoint ($codepoint).
2316            $codepoint = false;
2317            $char = $chars;
2318            while ($char !== false && isset($refs[$char])) {
2319                $refs = $refs[$char];
2320                if (isset($refs['codepoint'])) {
2321                    $id = $chars;
2322                    $codepoint = $refs['codepoint'];
2323                }
2324                $chars .= $char = $this->stream->char();
2325            }
2326
2327            // Unconsume the one character we just took which caused the while
2328            // statement to fail. This could be anything and could cause state
2329            // changes (as if it matches the while loop it must be
2330            // alphanumeric so we can just concat it to whatever we get later).
2331            $this->stream->unget();
2332            if ($char !== false) {
2333                $chars = substr($chars, 0, -1);
2334            }
2335
2336            /* If no match can be made, then this is a parse error.
2337            No characters are consumed, and nothing is returned. */
2338            if (!$codepoint) {
2339                $this->emitToken(array(
2340                    'type' => self::PARSEERROR,
2341                    'data' => 'expected-named-entity'
2342                ));
2343                return '&' . $chars;
2344            }
2345
2346            /* If the last character matched is not a U+003B SEMICOLON
2347            (;), there is a parse error. */
2348            $semicolon = true;
2349            if (substr($id, -1) !== ';') {
2350                $this->emitToken(array(
2351                    'type' => self::PARSEERROR,
2352                    'data' => 'named-entity-without-semicolon'
2353                ));
2354                $semicolon = false;
2355            }
2356
2357            /* If the character reference is being consumed as part of
2358            an attribute, and the last character matched is not a
2359            U+003B SEMICOLON (;), and the next character is in the
2360            range U+0030 DIGIT ZERO to U+0039 DIGIT NINE, U+0041
2361            LATIN CAPITAL LETTER A to U+005A LATIN CAPITAL LETTER Z,
2362            or U+0061 LATIN SMALL LETTER A to U+007A LATIN SMALL LETTER Z,
2363            then, for historical reasons, all the characters that were
2364            matched after the U+0026 AMPERSAND (&) must be unconsumed,
2365            and nothing is returned. */
2366            if ($inattr && !$semicolon) {
2367                // The next character is either the next character in $chars or in the stream.
2368                if (strlen($chars) > strlen($id)) {
2369                    $next = substr($chars, strlen($id), 1);
2370                } else {
2371                    $next = $this->stream->char();
2372                    $this->stream->unget();
2373                }
2374                if (
2375                    '0' <= $next && $next <= '9' ||
2376                    'A' <= $next && $next <= 'Z' ||
2377                    'a' <= $next && $next <= 'z'
2378                ) {
2379                    return '&' . $chars;
2380                }
2381            }
2382
2383            /* Otherwise, return a character token for the character
2384            corresponding to the character reference name (as given
2385            by the second column of the named character references table). */
2386            return HTML5_Data::utf8chr($codepoint) . substr($chars, strlen($id));
2387        }
2388    }
2389
2390    /**
2391     * @param bool $allowed
2392     */
2393    private function characterReferenceInAttributeValue($allowed = false) {
2394        /* Attempt to consume a character reference. */
2395        $entity = $this->consumeCharacterReference($allowed, true);
2396
2397        /* If nothing is returned, append a U+0026 AMPERSAND
2398        character to the current attribute's value.
2399
2400        Otherwise, append the returned character token to the
2401        current attribute's value. */
2402        $char = (!$entity)
2403            ? '&'
2404            : $entity;
2405
2406        $last = count($this->token['attr']) - 1;
2407        $this->token['attr'][$last]['value'] .= $char;
2408
2409        /* Finally, switch back to the attribute value state that you
2410        were in when were switched into this state. */
2411    }
2412
2413    /**
2414     * Emits a token, passing it on to the tree builder.
2415     *
2416     * @param $token
2417     * @param bool $checkStream
2418     * @param bool $dry
2419     */
2420    protected function emitToken($token, $checkStream = true, $dry = false) {
2421        if ($checkStream === true) {
2422            // Emit errors from input stream.
2423            while ($this->stream->errors) {
2424                $this->emitToken(array_shift($this->stream->errors), false);
2425            }
2426        }
2427        if ($token['type'] === self::ENDTAG && !empty($token['attr'])) {
2428            for ($i = 0; $i < count($token['attr']); $i++) {
2429                $this->emitToken(array(
2430                    'type' => self::PARSEERROR,
2431                    'data' => 'attributes-in-end-tag'
2432                ));
2433            }
2434        }
2435        if ($token['type'] === self::ENDTAG && !empty($token['self-closing'])) {
2436            $this->emitToken(array(
2437                'type' => self::PARSEERROR,
2438                'data' => 'self-closing-flag-on-end-tag',
2439            ));
2440        }
2441        if ($token['type'] === self::STARTTAG) {
2442            // This could be changed to actually pass the tree-builder a hash
2443            $hash = array();
2444            foreach ($token['attr'] as $keypair) {
2445                if (isset($hash[$keypair['name']])) {
2446                    $this->emitToken(array(
2447                        'type' => self::PARSEERROR,
2448                        'data' => 'duplicate-attribute',
2449                    ));
2450                } else {
2451                    $hash[$keypair['name']] = $keypair['value'];
2452                }
2453            }
2454        }
2455
2456        if ($dry === false) {
2457            // the current structure of attributes is not a terribly good one
2458            $this->tree->emitToken($token);
2459        }
2460
2461        if ($dry === false && is_int($this->tree->content_model)) {
2462            $this->content_model = $this->tree->content_model;
2463            $this->tree->content_model = null;
2464
2465        } elseif ($token['type'] === self::ENDTAG) {
2466            $this->content_model = self::PCDATA;
2467        }
2468    }
2469}
2470
2471