1<?php
2
3namespace Masterminds\HTML5\Parser;
4
5use Masterminds\HTML5\Elements;
6
7/**
8 * The HTML5 tokenizer.
9 *
10 * The tokenizer's role is reading data from the scanner and gathering it into
11 * semantic units. From the tokenizer, data is emitted to an event handler,
12 * which may (for example) create a DOM tree.
13 *
14 * The HTML5 specification has a detailed explanation of tokenizing HTML5. We
15 * follow that specification to the maximum extent that we can. If you find
16 * a discrepancy that is not documented, please file a bug and/or submit a
17 * patch.
18 *
19 * This tokenizer is implemented as a recursive descent parser.
20 *
21 * Within the API documentation, you may see references to the specific section
22 * of the HTML5 spec that the code attempts to reproduce. Example: 8.2.4.1.
23 * This refers to section 8.2.4.1 of the HTML5 CR specification.
24 *
25 * @see http://www.w3.org/TR/2012/CR-html5-20121217/
26 */
27class Tokenizer
28{
    /**
     * The scanner handed to the constructor; every character is read through it.
     */
    protected $scanner;

    /**
     * The event handler handed to the constructor; tokens and text are emitted to it.
     */
    protected $events;

    /**
     * Not referenced in the visible portion of this class.
     */
    protected $tok;

    /**
     * Buffer for text.
     */
    protected $text = '';

    // When this goes to false, the parser stops.
    // It is checked by parse() after every consumeData() call and cleared by eof().
    protected $carryOn = true;

    // Current text mode; setTextMode() stores only the TEXT_RAW / TEXT_RCDATA bits here.
    protected $textMode = 0; // TEXTMODE_NORMAL;
    // Tag name whose end tag terminates RAW or RCDATA reading; null when unset.
    protected $untilTag = null;

    // Conformance modes. In CONFORMANT_XML mode tag names keep their case;
    // otherwise they are lowercased.
    const CONFORMANT_XML = 'xml';
    const CONFORMANT_HTML = 'html';
    protected $mode = self::CONFORMANT_HTML;
49
50    /**
51     * Create a new tokenizer.
52     *
53     * Typically, parsing a document involves creating a new tokenizer, giving
54     * it a scanner (input) and an event handler (output), and then calling
55     * the Tokenizer::parse() method.`
56     *
57     * @param Scanner      $scanner      A scanner initialized with an input stream.
58     * @param EventHandler $eventHandler An event handler, initialized and ready to receive events.
59     * @param string       $mode
60     */
61    public function __construct($scanner, $eventHandler, $mode = self::CONFORMANT_HTML)
62    {
63        $this->scanner = $scanner;
64        $this->events = $eventHandler;
65        $this->mode = $mode;
66    }
67
68    /**
69     * Begin parsing.
70     *
71     * This will begin scanning the document, tokenizing as it goes.
72     * Tokens are emitted into the event handler.
73     *
74     * Tokenizing will continue until the document is completely
75     * read. Errors are emitted into the event handler, but
76     * the parser will attempt to continue parsing until the
77     * entire input stream is read.
78     */
79    public function parse()
80    {
81        do {
82            $this->consumeData();
83            // FIXME: Add infinite loop protection.
84        } while ($this->carryOn);
85    }
86
87    /**
88     * Set the text mode for the character data reader.
89     *
90     * HTML5 defines three different modes for reading text:
91     * - Normal: Read until a tag is encountered.
92     * - RCDATA: Read until a tag is encountered, but skip a few otherwise-
93     * special characters.
94     * - Raw: Read until a special closing tag is encountered (viz. pre, script)
95     *
96     * This allows those modes to be set.
97     *
98     * Normally, setting is done by the event handler via a special return code on
99     * startTag(), but it can also be set manually using this function.
100     *
101     * @param int    $textmode One of Elements::TEXT_*.
102     * @param string $untilTag The tag that should stop RAW or RCDATA mode. Normal mode does not
103     *                         use this indicator.
104     */
105    public function setTextMode($textmode, $untilTag = null)
106    {
107        $this->textMode = $textmode & (Elements::TEXT_RAW | Elements::TEXT_RCDATA);
108        $this->untilTag = $untilTag;
109    }
110
111    /**
112     * Consume a character and make a move.
113     * HTML5 8.2.4.1.
114     */
115    protected function consumeData()
116    {
117        $tok = $this->scanner->current();
118
119        if ('&' === $tok) {
120            // Character reference
121            $ref = $this->decodeCharacterReference();
122            $this->buffer($ref);
123
124            $tok = $this->scanner->current();
125        }
126
127        // Parse tag
128        if ('<' === $tok) {
129            // Any buffered text data can go out now.
130            $this->flushBuffer();
131
132            $tok = $this->scanner->next();
133
134            if ('!' === $tok) {
135                $this->markupDeclaration();
136            } elseif ('/' === $tok) {
137                $this->endTag();
138            } elseif ('?' === $tok) {
139                $this->processingInstruction();
140            } elseif (ctype_alpha($tok)) {
141                $this->tagName();
142            } else {
143                $this->parseError('Illegal tag opening');
144                // TODO is this necessary ?
145                $this->characterData();
146            }
147
148            $tok = $this->scanner->current();
149        }
150
151        if (false === $tok) {
152            // Handle end of document
153            $this->eof();
154        } else {
155            // Parse character
156            switch ($this->textMode) {
157                case Elements::TEXT_RAW:
158                    $this->rawText($tok);
159                    break;
160
161                case Elements::TEXT_RCDATA:
162                    $this->rcdata($tok);
163                    break;
164
165                default:
166                    if ('<' === $tok || '&' === $tok) {
167                        break;
168                    }
169
170                    // NULL character
171                    if ("\00" === $tok) {
172                        $this->parseError('Received null character.');
173
174                        $this->text .= $tok;
175                        $this->scanner->consume();
176
177                        break;
178                    }
179
180                    $this->text .= $this->scanner->charsUntil("<&\0");
181            }
182        }
183
184        return $this->carryOn;
185    }
186
187    /**
188     * Parse anything that looks like character data.
189     *
190     * Different rules apply based on the current text mode.
191     *
192     * @see Elements::TEXT_RAW Elements::TEXT_RCDATA.
193     */
194    protected function characterData()
195    {
196        $tok = $this->scanner->current();
197        if (false === $tok) {
198            return false;
199        }
200        switch ($this->textMode) {
201            case Elements::TEXT_RAW:
202                return $this->rawText($tok);
203            case Elements::TEXT_RCDATA:
204                return $this->rcdata($tok);
205            default:
206                if ('<' === $tok || '&' === $tok) {
207                    return false;
208                }
209
210                return $this->text($tok);
211        }
212    }
213
214    /**
215     * This buffers the current token as character data.
216     *
217     * @param string $tok The current token.
218     *
219     * @return bool
220     */
221    protected function text($tok)
222    {
223        // This should never happen...
224        if (false === $tok) {
225            return false;
226        }
227
228        // NULL character
229        if ("\00" === $tok) {
230            $this->parseError('Received null character.');
231        }
232
233        $this->buffer($tok);
234        $this->scanner->consume();
235
236        return true;
237    }
238
239    /**
240     * Read text in RAW mode.
241     *
242     * @param string $tok The current token.
243     *
244     * @return bool
245     */
246    protected function rawText($tok)
247    {
248        if (is_null($this->untilTag)) {
249            return $this->text($tok);
250        }
251
252        $sequence = '</' . $this->untilTag . '>';
253        $txt = $this->readUntilSequence($sequence);
254        $this->events->text($txt);
255        $this->setTextMode(0);
256
257        return $this->endTag();
258    }
259
260    /**
261     * Read text in RCDATA mode.
262     *
263     * @param string $tok The current token.
264     *
265     * @return bool
266     */
267    protected function rcdata($tok)
268    {
269        if (is_null($this->untilTag)) {
270            return $this->text($tok);
271        }
272
273        $sequence = '</' . $this->untilTag;
274        $txt = '';
275
276        $caseSensitive = !Elements::isHtml5Element($this->untilTag);
277        while (false !== $tok && !('<' == $tok && ($this->scanner->sequenceMatches($sequence, $caseSensitive)))) {
278            if ('&' == $tok) {
279                $txt .= $this->decodeCharacterReference();
280                $tok = $this->scanner->current();
281            } else {
282                $txt .= $tok;
283                $tok = $this->scanner->next();
284            }
285        }
286        $len = strlen($sequence);
287        $this->scanner->consume($len);
288        $len += $this->scanner->whitespace();
289        if ('>' !== $this->scanner->current()) {
290            $this->parseError('Unclosed RCDATA end tag');
291        }
292
293        $this->scanner->unconsume($len);
294        $this->events->text($txt);
295        $this->setTextMode(0);
296
297        return $this->endTag();
298    }
299
300    /**
301     * If the document is read, emit an EOF event.
302     */
303    protected function eof()
304    {
305        // fprintf(STDOUT, "EOF");
306        $this->flushBuffer();
307        $this->events->eof();
308        $this->carryOn = false;
309    }
310
311    /**
312     * Look for markup.
313     */
314    protected function markupDeclaration()
315    {
316        $tok = $this->scanner->next();
317
318        // Comment:
319        if ('-' == $tok && '-' == $this->scanner->peek()) {
320            $this->scanner->consume(2);
321
322            return $this->comment();
323        } elseif ('D' == $tok || 'd' == $tok) { // Doctype
324            return $this->doctype();
325        } elseif ('[' == $tok) { // CDATA section
326            return $this->cdataSection();
327        }
328
329        // FINISH
330        $this->parseError('Expected <!--, <![CDATA[, or <!DOCTYPE. Got <!%s', $tok);
331        $this->bogusComment('<!');
332
333        return true;
334    }
335
336    /**
337     * Consume an end tag. See section 8.2.4.9.
338     */
339    protected function endTag()
340    {
341        if ('/' != $this->scanner->current()) {
342            return false;
343        }
344        $tok = $this->scanner->next();
345
346        // a-zA-Z -> tagname
347        // > -> parse error
348        // EOF -> parse error
349        // -> parse error
350        if (!ctype_alpha($tok)) {
351            $this->parseError("Expected tag name, got '%s'", $tok);
352            if ("\0" == $tok || false === $tok) {
353                return false;
354            }
355
356            return $this->bogusComment('</');
357        }
358
359        $name = $this->scanner->charsUntil("\n\f \t>");
360        $name = self::CONFORMANT_XML === $this->mode ? $name : strtolower($name);
361        // Trash whitespace.
362        $this->scanner->whitespace();
363
364        $tok = $this->scanner->current();
365        if ('>' != $tok) {
366            $this->parseError("Expected >, got '%s'", $tok);
367            // We just trash stuff until we get to the next tag close.
368            $this->scanner->charsUntil('>');
369        }
370
371        $this->events->endTag($name);
372        $this->scanner->consume();
373
374        return true;
375    }
376
377    /**
378     * Consume a tag name and body. See section 8.2.4.10.
379     */
380    protected function tagName()
381    {
382        // We know this is at least one char.
383        $name = $this->scanner->charsWhile(':_-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz');
384        $name = self::CONFORMANT_XML === $this->mode ? $name : strtolower($name);
385        $attributes = array();
386        $selfClose = false;
387
388        // Handle attribute parse exceptions here so that we can
389        // react by trying to build a sensible parse tree.
390        try {
391            do {
392                $this->scanner->whitespace();
393                $this->attribute($attributes);
394            } while (!$this->isTagEnd($selfClose));
395        } catch (ParseError $e) {
396            $selfClose = false;
397        }
398
399        $mode = $this->events->startTag($name, $attributes, $selfClose);
400
401        if (is_int($mode)) {
402            $this->setTextMode($mode, $name);
403        }
404
405        $this->scanner->consume();
406
407        return true;
408    }
409
410    /**
411     * Check if the scanner has reached the end of a tag.
412     */
413    protected function isTagEnd(&$selfClose)
414    {
415        $tok = $this->scanner->current();
416        if ('/' == $tok) {
417            $this->scanner->consume();
418            $this->scanner->whitespace();
419            $tok = $this->scanner->current();
420
421            if ('>' == $tok) {
422                $selfClose = true;
423
424                return true;
425            }
426            if (false === $tok) {
427                $this->parseError('Unexpected EOF inside of tag.');
428
429                return true;
430            }
431            // Basically, we skip the / token and go on.
432            // See 8.2.4.43.
433            $this->parseError("Unexpected '%s' inside of a tag.", $tok);
434
435            return false;
436        }
437
438        if ('>' == $tok) {
439            return true;
440        }
441        if (false === $tok) {
442            $this->parseError('Unexpected EOF inside of tag.');
443
444            return true;
445        }
446
447        return false;
448    }
449
450    /**
451     * Parse attributes from inside of a tag.
452     *
453     * @param string[] $attributes
454     *
455     * @return bool
456     *
457     * @throws ParseError
458     */
459    protected function attribute(&$attributes)
460    {
461        $tok = $this->scanner->current();
462        if ('/' == $tok || '>' == $tok || false === $tok) {
463            return false;
464        }
465
466        if ('<' == $tok) {
467            $this->parseError("Unexpected '<' inside of attributes list.");
468            // Push the < back onto the stack.
469            $this->scanner->unconsume();
470            // Let the caller figure out how to handle this.
471            throw new ParseError('Start tag inside of attribute.');
472        }
473
474        $name = strtolower($this->scanner->charsUntil("/>=\n\f\t "));
475
476        if (0 == strlen($name)) {
477            $tok = $this->scanner->current();
478            $this->parseError('Expected an attribute name, got %s.', $tok);
479            // Really, only '=' can be the char here. Everything else gets absorbed
480            // under one rule or another.
481            $name = $tok;
482            $this->scanner->consume();
483        }
484
485        $isValidAttribute = true;
486        // Attribute names can contain most Unicode characters for HTML5.
487        // But method "DOMElement::setAttribute" is throwing exception
488        // because of it's own internal restriction so these have to be filtered.
489        // see issue #23: https://github.com/Masterminds/html5-php/issues/23
490        // and http://www.w3.org/TR/2011/WD-html5-20110525/syntax.html#syntax-attribute-name
491        if (preg_match("/[\x1-\x2C\\/\x3B-\x40\x5B-\x5E\x60\x7B-\x7F]/u", $name)) {
492            $this->parseError('Unexpected characters in attribute name: %s', $name);
493            $isValidAttribute = false;
494        }         // There is no limitation for 1st character in HTML5.
495        // But method "DOMElement::setAttribute" is throwing exception for the
496        // characters below so they have to be filtered.
497        // see issue #23: https://github.com/Masterminds/html5-php/issues/23
498        // and http://www.w3.org/TR/2011/WD-html5-20110525/syntax.html#syntax-attribute-name
499        elseif (preg_match('/^[0-9.-]/u', $name)) {
500            $this->parseError('Unexpected character at the begining of attribute name: %s', $name);
501            $isValidAttribute = false;
502        }
503        // 8.1.2.3
504        $this->scanner->whitespace();
505
506        $val = $this->attributeValue();
507        if ($isValidAttribute) {
508            $attributes[$name] = $val;
509        }
510
511        return true;
512    }
513
514    /**
515     * Consume an attribute value. See section 8.2.4.37 and after.
516     *
517     * @return string|null
518     */
519    protected function attributeValue()
520    {
521        if ('=' != $this->scanner->current()) {
522            return null;
523        }
524        $this->scanner->consume();
525        // 8.1.2.3
526        $this->scanner->whitespace();
527
528        $tok = $this->scanner->current();
529        switch ($tok) {
530            case "\n":
531            case "\f":
532            case ' ':
533            case "\t":
534                // Whitespace here indicates an empty value.
535                return null;
536            case '"':
537            case "'":
538                $this->scanner->consume();
539
540                return $this->quotedAttributeValue($tok);
541            case '>':
542                // case '/': // 8.2.4.37 seems to allow foo=/ as a valid attr.
543                $this->parseError('Expected attribute value, got tag end.');
544
545                return null;
546            case '=':
547            case '`':
548                $this->parseError('Expecting quotes, got %s.', $tok);
549
550                return $this->unquotedAttributeValue();
551            default:
552                return $this->unquotedAttributeValue();
553        }
554    }
555
556    /**
557     * Get an attribute value string.
558     *
559     * @param string $quote IMPORTANT: This is a series of chars! Any one of which will be considered
560     *                      termination of an attribute's value. E.g. "\"'" will stop at either
561     *                      ' or ".
562     *
563     * @return string The attribute value.
564     */
565    protected function quotedAttributeValue($quote)
566    {
567        $stoplist = "\f" . $quote;
568        $val = '';
569
570        while (true) {
571            $tokens = $this->scanner->charsUntil($stoplist . '&');
572            if (false !== $tokens) {
573                $val .= $tokens;
574            } else {
575                break;
576            }
577
578            $tok = $this->scanner->current();
579            if ('&' == $tok) {
580                $val .= $this->decodeCharacterReference(true);
581                continue;
582            }
583            break;
584        }
585        $this->scanner->consume();
586
587        return $val;
588    }
589
590    protected function unquotedAttributeValue()
591    {
592        $val = '';
593        $tok = $this->scanner->current();
594        while (false !== $tok) {
595            switch ($tok) {
596                case "\n":
597                case "\f":
598                case ' ':
599                case "\t":
600                case '>':
601                    break 2;
602
603                case '&':
604                    $val .= $this->decodeCharacterReference(true);
605                    $tok = $this->scanner->current();
606
607                    break;
608
609                case "'":
610                case '"':
611                case '<':
612                case '=':
613                case '`':
614                    $this->parseError('Unexpected chars in unquoted attribute value %s', $tok);
615                    $val .= $tok;
616                    $tok = $this->scanner->next();
617                    break;
618
619                default:
620                    $val .= $this->scanner->charsUntil("\t\n\f >&\"'<=`");
621
622                    $tok = $this->scanner->current();
623            }
624        }
625
626        return $val;
627    }
628
629    /**
630     * Consume malformed markup as if it were a comment.
631     * 8.2.4.44.
632     *
633     * The spec requires that the ENTIRE tag-like thing be enclosed inside of
634     * the comment. So this will generate comments like:
635     *
636     * &lt;!--&lt/+foo&gt;--&gt;
637     *
638     * @param string $leading Prepend any leading characters. This essentially
639     *                        negates the need to backtrack, but it's sort of a hack.
640     *
641     * @return bool
642     */
643    protected function bogusComment($leading = '')
644    {
645        $comment = $leading;
646        $tokens = $this->scanner->charsUntil('>');
647        if (false !== $tokens) {
648            $comment .= $tokens;
649        }
650        $tok = $this->scanner->current();
651        if (false !== $tok) {
652            $comment .= $tok;
653        }
654
655        $this->flushBuffer();
656        $this->events->comment($comment);
657        $this->scanner->consume();
658
659        return true;
660    }
661
662    /**
663     * Read a comment.
664     * Expects the first tok to be inside of the comment.
665     *
666     * @return bool
667     */
668    protected function comment()
669    {
670        $tok = $this->scanner->current();
671        $comment = '';
672
673        // <!-->. Emit an empty comment because 8.2.4.46 says to.
674        if ('>' == $tok) {
675            // Parse error. Emit the comment token.
676            $this->parseError("Expected comment data, got '>'");
677            $this->events->comment('');
678            $this->scanner->consume();
679
680            return true;
681        }
682
683        // Replace NULL with the replacement char.
684        if ("\0" == $tok) {
685            $tok = UTF8Utils::FFFD;
686        }
687        while (!$this->isCommentEnd()) {
688            $comment .= $tok;
689            $tok = $this->scanner->next();
690        }
691
692        $this->events->comment($comment);
693        $this->scanner->consume();
694
695        return true;
696    }
697
698    /**
699     * Check if the scanner has reached the end of a comment.
700     *
701     * @return bool
702     */
703    protected function isCommentEnd()
704    {
705        $tok = $this->scanner->current();
706
707        // EOF
708        if (false === $tok) {
709            // Hit the end.
710            $this->parseError('Unexpected EOF in a comment.');
711
712            return true;
713        }
714
715        // If it doesn't start with -, not the end.
716        if ('-' != $tok) {
717            return false;
718        }
719
720        // Advance one, and test for '->'
721        if ('-' == $this->scanner->next() && '>' == $this->scanner->peek()) {
722            $this->scanner->consume(); // Consume the last '>'
723            return true;
724        }
725        // Unread '-';
726        $this->scanner->unconsume(1);
727
728        return false;
729    }
730
731    /**
732     * Parse a DOCTYPE.
733     *
734     * Parse a DOCTYPE declaration. This method has strong bearing on whether or
735     * not Quirksmode is enabled on the event handler.
736     *
737     * @todo This method is a little long. Should probably refactor.
738     *
739     * @return bool
740     */
741    protected function doctype()
742    {
743        // Check that string is DOCTYPE.
744        if ($this->scanner->sequenceMatches('DOCTYPE', false)) {
745            $this->scanner->consume(7);
746        } else {
747            $chars = $this->scanner->charsWhile('DOCTYPEdoctype');
748            $this->parseError('Expected DOCTYPE, got %s', $chars);
749
750            return $this->bogusComment('<!' . $chars);
751        }
752
753        $this->scanner->whitespace();
754        $tok = $this->scanner->current();
755
756        // EOF: die.
757        if (false === $tok) {
758            $this->events->doctype('html5', EventHandler::DOCTYPE_NONE, '', true);
759            $this->eof();
760
761            return true;
762        }
763
764        // NULL char: convert.
765        if ("\0" === $tok) {
766            $this->parseError('Unexpected null character in DOCTYPE.');
767        }
768
769        $stop = " \n\f>";
770        $doctypeName = $this->scanner->charsUntil($stop);
771        // Lowercase ASCII, replace \0 with FFFD
772        $doctypeName = strtolower(strtr($doctypeName, "\0", UTF8Utils::FFFD));
773
774        $tok = $this->scanner->current();
775
776        // If false, emit a parse error, DOCTYPE, and return.
777        if (false === $tok) {
778            $this->parseError('Unexpected EOF in DOCTYPE declaration.');
779            $this->events->doctype($doctypeName, EventHandler::DOCTYPE_NONE, null, true);
780
781            return true;
782        }
783
784        // Short DOCTYPE, like <!DOCTYPE html>
785        if ('>' == $tok) {
786            // DOCTYPE without a name.
787            if (0 == strlen($doctypeName)) {
788                $this->parseError('Expected a DOCTYPE name. Got nothing.');
789                $this->events->doctype($doctypeName, 0, null, true);
790                $this->scanner->consume();
791
792                return true;
793            }
794            $this->events->doctype($doctypeName);
795            $this->scanner->consume();
796
797            return true;
798        }
799        $this->scanner->whitespace();
800
801        $pub = strtoupper($this->scanner->getAsciiAlpha());
802        $white = $this->scanner->whitespace();
803
804        // Get ID, and flag it as pub or system.
805        if (('PUBLIC' == $pub || 'SYSTEM' == $pub) && $white > 0) {
806            // Get the sys ID.
807            $type = 'PUBLIC' == $pub ? EventHandler::DOCTYPE_PUBLIC : EventHandler::DOCTYPE_SYSTEM;
808            $id = $this->quotedString("\0>");
809            if (false === $id) {
810                $this->events->doctype($doctypeName, $type, $pub, false);
811
812                return true;
813            }
814
815            // Premature EOF.
816            if (false === $this->scanner->current()) {
817                $this->parseError('Unexpected EOF in DOCTYPE');
818                $this->events->doctype($doctypeName, $type, $id, true);
819
820                return true;
821            }
822
823            // Well-formed complete DOCTYPE.
824            $this->scanner->whitespace();
825            if ('>' == $this->scanner->current()) {
826                $this->events->doctype($doctypeName, $type, $id, false);
827                $this->scanner->consume();
828
829                return true;
830            }
831
832            // If we get here, we have <!DOCTYPE foo PUBLIC "bar" SOME_JUNK
833            // Throw away the junk, parse error, quirks mode, return true.
834            $this->scanner->charsUntil('>');
835            $this->parseError('Malformed DOCTYPE.');
836            $this->events->doctype($doctypeName, $type, $id, true);
837            $this->scanner->consume();
838
839            return true;
840        }
841
842        // Else it's a bogus DOCTYPE.
843        // Consume to > and trash.
844        $this->scanner->charsUntil('>');
845
846        $this->parseError('Expected PUBLIC or SYSTEM. Got %s.', $pub);
847        $this->events->doctype($doctypeName, 0, null, true);
848        $this->scanner->consume();
849
850        return true;
851    }
852
853    /**
854     * Utility for reading a quoted string.
855     *
856     * @param string $stopchars Characters (in addition to a close-quote) that should stop the string.
857     *                          E.g. sometimes '>' is higher precedence than '"' or "'".
858     *
859     * @return mixed String if one is found (quotations omitted).
860     */
861    protected function quotedString($stopchars)
862    {
863        $tok = $this->scanner->current();
864        if ('"' == $tok || "'" == $tok) {
865            $this->scanner->consume();
866            $ret = $this->scanner->charsUntil($tok . $stopchars);
867            if ($this->scanner->current() == $tok) {
868                $this->scanner->consume();
869            } else {
870                // Parse error because no close quote.
871                $this->parseError('Expected %s, got %s', $tok, $this->scanner->current());
872            }
873
874            return $ret;
875        }
876
877        return false;
878    }
879
880    /**
881     * Handle a CDATA section.
882     *
883     * @return bool
884     */
885    protected function cdataSection()
886    {
887        $cdata = '';
888        $this->scanner->consume();
889
890        $chars = $this->scanner->charsWhile('CDAT');
891        if ('CDATA' != $chars || '[' != $this->scanner->current()) {
892            $this->parseError('Expected [CDATA[, got %s', $chars);
893
894            return $this->bogusComment('<![' . $chars);
895        }
896
897        $tok = $this->scanner->next();
898        do {
899            if (false === $tok) {
900                $this->parseError('Unexpected EOF inside CDATA.');
901                $this->bogusComment('<![CDATA[' . $cdata);
902
903                return true;
904            }
905            $cdata .= $tok;
906            $tok = $this->scanner->next();
907        } while (!$this->scanner->sequenceMatches(']]>'));
908
909        // Consume ]]>
910        $this->scanner->consume(3);
911
912        $this->events->cdata($cdata);
913
914        return true;
915    }
916
917    // ================================================================
918    // Non-HTML5
919    // ================================================================
920
921    /**
922     * Handle a processing instruction.
923     *
924     * XML processing instructions are supposed to be ignored in HTML5,
925     * treated as "bogus comments". However, since we're not a user
926     * agent, we allow them. We consume until ?> and then issue a
927     * EventListener::processingInstruction() event.
928     *
929     * @return bool
930     */
931    protected function processingInstruction()
932    {
933        if ('?' != $this->scanner->current()) {
934            return false;
935        }
936
937        $tok = $this->scanner->next();
938        $procName = $this->scanner->getAsciiAlpha();
939        $white = $this->scanner->whitespace();
940
941        // If not a PI, send to bogusComment.
942        if (0 == strlen($procName) || 0 == $white || false == $this->scanner->current()) {
943            $this->parseError("Expected processing instruction name, got $tok");
944            $this->bogusComment('<?' . $tok . $procName);
945
946            return true;
947        }
948
949        $data = '';
950        // As long as it's not the case that the next two chars are ? and >.
951        while (!('?' == $this->scanner->current() && '>' == $this->scanner->peek())) {
952            $data .= $this->scanner->current();
953
954            $tok = $this->scanner->next();
955            if (false === $tok) {
956                $this->parseError('Unexpected EOF in processing instruction.');
957                $this->events->processingInstruction($procName, $data);
958
959                return true;
960            }
961        }
962
963        $this->scanner->consume(2); // Consume the closing tag
964        $this->events->processingInstruction($procName, $data);
965
966        return true;
967    }
968
969    // ================================================================
970    // UTILITY FUNCTIONS
971    // ================================================================
972
973    /**
974     * Read from the input stream until we get to the desired sequene
975     * or hit the end of the input stream.
976     *
977     * @param string $sequence
978     *
979     * @return string
980     */
981    protected function readUntilSequence($sequence)
982    {
983        $buffer = '';
984
985        // Optimization for reading larger blocks faster.
986        $first = substr($sequence, 0, 1);
987        while (false !== $this->scanner->current()) {
988            $buffer .= $this->scanner->charsUntil($first);
989
990            // Stop as soon as we hit the stopping condition.
991            if ($this->scanner->sequenceMatches($sequence, false)) {
992                return $buffer;
993            }
994            $buffer .= $this->scanner->current();
995            $this->scanner->consume();
996        }
997
998        // If we get here, we hit the EOF.
999        $this->parseError('Unexpected EOF during text read.');
1000
1001        return $buffer;
1002    }
1003
1004    /**
1005     * Check if upcomming chars match the given sequence.
1006     *
1007     * This will read the stream for the $sequence. If it's
1008     * found, this will return true. If not, return false.
1009     * Since this unconsumes any chars it reads, the caller
1010     * will still need to read the next sequence, even if
1011     * this returns true.
1012     *
1013     * Example: $this->scanner->sequenceMatches('</script>') will
1014     * see if the input stream is at the start of a
1015     * '</script>' string.
1016     *
1017     * @param string $sequence
1018     * @param bool   $caseSensitive
1019     *
1020     * @return bool
1021     */
1022    protected function sequenceMatches($sequence, $caseSensitive = true)
1023    {
1024        @trigger_error(__METHOD__ . ' method is deprecated since version 2.4 and will be removed in 3.0. Use Scanner::sequenceMatches() instead.', E_USER_DEPRECATED);
1025
1026        return $this->scanner->sequenceMatches($sequence, $caseSensitive);
1027    }
1028
1029    /**
1030     * Send a TEXT event with the contents of the text buffer.
1031     *
1032     * This emits an EventHandler::text() event with the current contents of the
1033     * temporary text buffer. (The buffer is used to group as much PCDATA
1034     * as we can instead of emitting lots and lots of TEXT events.)
1035     */
1036    protected function flushBuffer()
1037    {
1038        if ('' === $this->text) {
1039            return;
1040        }
1041        $this->events->text($this->text);
1042        $this->text = '';
1043    }
1044
1045    /**
1046     * Add text to the temporary buffer.
1047     *
1048     * @see flushBuffer()
1049     *
1050     * @param string $str
1051     */
1052    protected function buffer($str)
1053    {
1054        $this->text .= $str;
1055    }
1056
1057    /**
1058     * Emit a parse error.
1059     *
1060     * A parse error always returns false because it never consumes any
1061     * characters.
1062     *
1063     * @param string $msg
1064     *
1065     * @return string
1066     */
1067    protected function parseError($msg)
1068    {
1069        $args = func_get_args();
1070
1071        if (count($args) > 1) {
1072            array_shift($args);
1073            $msg = vsprintf($msg, $args);
1074        }
1075
1076        $line = $this->scanner->currentLine();
1077        $col = $this->scanner->columnOffset();
1078        $this->events->parseError($msg, $line, $col);
1079
1080        return false;
1081    }
1082
1083    /**
1084     * Decode a character reference and return the string.
1085     *
1086     * If $inAttribute is set to true, a bare & will be returned as-is.
1087     *
1088     * @param bool $inAttribute Set to true if the text is inside of an attribute value.
1089     *                          false otherwise.
1090     *
1091     * @return string
1092     */
1093    protected function decodeCharacterReference($inAttribute = false)
1094    {
1095        // Next char after &.
1096        $tok = $this->scanner->next();
1097        $start = $this->scanner->position();
1098
1099        if (false === $tok) {
1100            return '&';
1101        }
1102
1103        // These indicate not an entity. We return just
1104        // the &.
1105        if ("\t" === $tok || "\n" === $tok || "\f" === $tok || ' ' === $tok || '&' === $tok || '<' === $tok) {
1106            // $this->scanner->next();
1107            return '&';
1108        }
1109
1110        // Numeric entity
1111        if ('#' === $tok) {
1112            $tok = $this->scanner->next();
1113
1114            if (false === $tok) {
1115                $this->parseError('Expected &#DEC; &#HEX;, got EOF');
1116                $this->scanner->unconsume(1);
1117
1118                return '&';
1119            }
1120
1121            // Hexidecimal encoding.
1122            // X[0-9a-fA-F]+;
1123            // x[0-9a-fA-F]+;
1124            if ('x' === $tok || 'X' === $tok) {
1125                $tok = $this->scanner->next(); // Consume x
1126
1127                // Convert from hex code to char.
1128                $hex = $this->scanner->getHex();
1129                if (empty($hex)) {
1130                    $this->parseError('Expected &#xHEX;, got &#x%s', $tok);
1131                    // We unconsume because we don't know what parser rules might
1132                    // be in effect for the remaining chars. For example. '&#>'
1133                    // might result in a specific parsing rule inside of tag
1134                    // contexts, while not inside of pcdata context.
1135                    $this->scanner->unconsume(2);
1136
1137                    return '&';
1138                }
1139                $entity = CharacterReference::lookupHex($hex);
1140            }             // Decimal encoding.
1141            // [0-9]+;
1142            else {
1143                // Convert from decimal to char.
1144                $numeric = $this->scanner->getNumeric();
1145                if (false === $numeric) {
1146                    $this->parseError('Expected &#DIGITS;, got &#%s', $tok);
1147                    $this->scanner->unconsume(2);
1148
1149                    return '&';
1150                }
1151                $entity = CharacterReference::lookupDecimal($numeric);
1152            }
1153        } elseif ('=' === $tok && $inAttribute) {
1154            return '&';
1155        } else { // String entity.
1156            // Attempt to consume a string up to a ';'.
1157            // [a-zA-Z0-9]+;
1158            $cname = $this->scanner->getAsciiAlphaNum();
1159            $entity = CharacterReference::lookupName($cname);
1160
1161            // When no entity is found provide the name of the unmatched string
1162            // and continue on as the & is not part of an entity. The & will
1163            // be converted to &amp; elsewhere.
1164            if (null === $entity) {
1165                if (!$inAttribute || '' === $cname) {
1166                    $this->parseError("No match in entity table for '%s'", $cname);
1167                }
1168                $this->scanner->unconsume($this->scanner->position() - $start);
1169
1170                return '&';
1171            }
1172        }
1173
1174        // The scanner has advanced the cursor for us.
1175        $tok = $this->scanner->current();
1176
1177        // We have an entity. We're done here.
1178        if (';' === $tok) {
1179            $this->scanner->consume();
1180
1181            return $entity;
1182        }
1183
1184        // Failing to match ; means unconsume the entire string.
1185        $this->scanner->unconsume($this->scanner->position() - $start);
1186
1187        $this->parseError('Expected &ENTITY;, got &ENTITY%s (no trailing ;) ', $tok);
1188
1189        return '&';
1190    }
1191}
1192