1<?php
2
3/* This file is part of the Markdownify project, which is under LGPL license */
4
5namespace Markdownify;
6
/**
 * Incremental HTML tokenizer.
 *
 * Assign the markup to $html, then call nextNode() repeatedly. Each successful
 * call consumes one node from the front of $html and describes it through the
 * public properties ($nodeType, $node, $tagName, $tagAttributes, ...).
 */
class Parser
{
    /**
     * whether a text node consisting of a single space should be skipped;
     * NOTE: static, i.e. shared between all parser instances
     *
     * @var bool
     */
    public static $skipWhitespace = true;

    /**
     * ord('a'), initialised lazily by parseTag()
     *
     * @var int | null
     */
    public static $a_ord;

    /**
     * ord('z'), initialised lazily by parseTag()
     *
     * @var int | null
     */
    public static $z_ord;

    /**
     * ordinals of the non-letter characters accepted inside attribute names,
     * initialised lazily by parseTag()
     *
     * @var array<int> | null
     */
    public static $special_ords;

    /**
     * tags which are always empty (<br /> etc.)
     *
     * @var array<string>
     */
    public $emptyTags = array(
        'br',
        'hr',
        'input',
        'img',
        'area',
        'link',
        'meta',
        'param',
    );

    /**
     * tags with preformatted text
     * whitespace won't be touched inside them
     *
     * @var array<string>
     */
    public $preformattedTags = array(
        'script',
        'style',
        'pre',
        'code',
    );

    /**
     * suppress HTML tags inside <code> tags
     *
     * @var bool
     */
    public $noTagsInCode = false;

    /**
     * html to be parsed
     *
     * @var string
     */
    public $html = '';

    /**
     * node type:
     *
     * - tag (see isStartTag)
     * - text (includes cdata)
     * - comment
     * - doctype
     * - pi (processing instruction)
     *
     * @var string
     */
    public $nodeType = '';

    /**
     * current node content, i.e. either a
     * simple string (text node), or something like
     * <tag attrib="value"...>
     *
     * @var string
     */
    public $node = '';

    /**
     * whether current node is an opening tag (<a>) or not (</a>)
     * set to NULL if current node is not a tag
     * NOTE: empty tags (<br />) set this to true as well!
     *
     * @var bool | null
     */
    public $isStartTag = null;

    /**
     * whether current node is an empty tag (<br />) or not (<a></a>)
     *
     * @var bool | null
     */
    public $isEmptyTag = null;

    /**
     * tag name
     *
     * @var string | null
     */
    public $tagName = '';

    /**
     * attributes of current tag
     *
     * @var array (attribName=>value) | null
     */
    public $tagAttributes = null;

    /**
     * whether or not the current context is an inline context
     * (true after a text node or an inline tag, false after a block tag)
     *
     * @var bool | null
     */
    public $isInlineContext = null;

    /**
     * whether the current tag is a block element
     *
     * @var bool | null
     */
    public $isBlockElement = null;

    /**
     * whether the current tag is a start tag that was reached while the
     * parser was in an inline context
     *
     * @var bool | null
     */
    public $isNextToInlineContext = null;

    /**
     * keep whitespace: number of currently open preformatted tags,
     * whitespace is left untouched while this is non-zero
     *
     * @var int
     */
    public $keepWhitespace = 0;

    /**
     * list of open tags
     * count this to get current depth
     *
     * @var array
     */
    public $openTags = array();

    /**
     * list of known elements; tags that are not listed here are treated as invalid
     *
     * @var array
     * TODO: what shall we do with <del> and <ins> ?!
     *       (they used to be listed twice; the effective value has always been "inline")
     */
    public $blockElements = array(
        // tag name => <bool> is block
        // block elements
        'address' => true,
        'blockquote' => true,
        'center' => true,
        'dir' => true,
        'div' => true,
        'dl' => true,
        'fieldset' => true,
        'form' => true,
        'h1' => true,
        'h2' => true,
        'h3' => true,
        'h4' => true,
        'h5' => true,
        'h6' => true,
        'hr' => true,
        'isindex' => true,
        'menu' => true,
        'noframes' => true,
        'noscript' => true,
        'ol' => true,
        'p' => true,
        'pre' => true,
        'table' => true,
        'ul' => true,
        // set table elements and list items to block as well
        'thead' => true,
        'tbody' => true,
        'tfoot' => true,
        'td' => true,
        'tr' => true,
        'th' => true,
        'li' => true,
        'dd' => true,
        'dt' => true,
        // header items and html / body as well
        'html' => true,
        'body' => true,
        'head' => true,
        'meta' => true,
        'link' => true,
        'style' => true,
        'title' => true,
        // unfancy media tags, when indented should be rendered as block
        'map' => true,
        'object' => true,
        'param' => true,
        'embed' => true,
        'area' => true,
        // inline elements
        'a' => false,
        'abbr' => false,
        'acronym' => false,
        'applet' => false,
        'b' => false,
        'basefont' => false,
        'bdo' => false,
        'big' => false,
        'br' => false,
        'button' => false,
        'cite' => false,
        'code' => false,
        'del' => false,
        'dfn' => false,
        'em' => false,
        'font' => false,
        'i' => false,
        'img' => false,
        'ins' => false,
        'input' => false,
        'iframe' => false,
        'kbd' => false,
        'label' => false,
        'q' => false,
        'samp' => false,
        'script' => false,
        'select' => false,
        'small' => false,
        'span' => false,
        'strong' => false,
        'sub' => false,
        'sup' => false,
        'textarea' => false,
        'tt' => false,
        'var' => false,
    );

    /**
     * get next node, set $this->html prior!
     *
     * Consumes one node from the beginning of $this->html and updates the
     * node describing properties accordingly.
     *
     * @return bool false once $this->html is exhausted, true otherwise
     */
    public function nextNode()
    {
        // do not use empty() here: the string "0" is valid remaining content
        if (!is_string($this->html) || $this->html === '') {
            // we are done with parsing the html string

            return false;
        }

        // the tag returned by the previous call is registered as "open" only now,
        // so that callers still see the old stack while handling that tag
        if ($this->isStartTag && !$this->isEmptyTag) {
            array_push($this->openTags, $this->tagName);
            if (in_array($this->tagName, $this->preformattedTags, true)) {
                // don't truncate whitespace for <code> or <pre> contents
                $this->keepWhitespace++;
            }
        }

        if ($this->html[0] === '<') {
            $token = substr($this->html, 0, 9);
            if (strncmp($token, '<?', 2) === 0) {
                // xml prolog or other pi's
                /** TODO **/
                // trigger_error('this might need some work', E_USER_NOTICE);
                $this->setNode('pi', $this->offsetAfter('>'));

                return true;
            }
            if (strncmp($token, '<!--', 4) === 0) {
                // comment
                $pos = strpos($this->html, '-->');
                if ($pos === false) {
                    // could not find a closing -->, use next gt instead
                    // this is firefox' behaviour
                    $pos = $this->offsetAfter('>');
                } else {
                    $pos += 3;
                }
                $this->setNode('comment', $pos);

                static::$skipWhitespace = true;

                return true;
            }
            if (strcasecmp($token, '<!DOCTYPE') === 0) {
                // doctype, the keyword is case-insensitive (<!doctype html>)
                $this->setNode('doctype', $this->offsetAfter('>'));

                static::$skipWhitespace = true;

                return true;
            }
            if ($token === '<![CDATA[') {
                // cdata, use text node

                // remove leading <![CDATA[
                $this->html = (string) substr($this->html, 9);

                $end = strpos($this->html, ']]>');
                if ($end === false) {
                    // unterminated section: everything that is left is character data
                    $this->setNode('text', strlen($this->html));
                } else {
                    $this->setNode('text', $end + 3);
                    // remove trailing ]]>
                    $this->node = (string) substr($this->node, 0, -3);
                }
                $this->handleWhitespaces();

                static::$skipWhitespace = true;

                return true;
            }
            if ($this->parseTag()) {
                // seems to be a tag
                // whitespace after block elements is insignificant
                static::$skipWhitespace = (bool) $this->isBlockElement;

                return true;
            }
            // parseTag() failed and replaced the leading "<" by "&lt;",
            // so the markup is handled as a text node below
        }
        if ($this->keepWhitespace) {
            static::$skipWhitespace = false;
        }
        // when we get here it seems to be a text node
        $pos = strpos($this->html, '<');
        if ($pos === false) {
            $pos = strlen($this->html);
        }
        $this->setNode('text', $pos);
        $this->handleWhitespaces();
        if (static::$skipWhitespace && $this->node === ' ') {
            // insignificant whitespace, silently move on to the next node
            return $this->nextNode();
        }
        $this->isInlineContext = true;
        static::$skipWhitespace = false;

        return true;
    }

    /**
     * parse tag, set tag name and attributes, see if it's a closing tag and so forth...
     *
     * On failure the leading "<" of $this->html is replaced by "&lt;"
     * (see invalidTag()) and nothing else is changed.
     *
     * @return bool true if a valid tag has been consumed, false otherwise
     */
    protected function parseTag()
    {
        if (!isset(static::$a_ord)) {
            static::$a_ord = ord('a');
            static::$z_ord = ord('z');
            static::$special_ords = array(
                ord(':'), // for xml:lang
                ord('-'), // for http-equiv
            );
        }

        $tagName = '';

        $pos = 1;
        $isStartTag = !isset($this->html[$pos]) || $this->html[$pos] !== '/';
        if (!$isStartTag) {
            $pos++;
        }
        // get tagName: letters, and digits after the first letter (h1 ... h6)
        while (isset($this->html[$pos])) {
            $char = $this->html[$pos];
            $pos_ord = ord(strtolower($char));
            $isLetter = $pos_ord >= static::$a_ord && $pos_ord <= static::$z_ord;
            if (!$isLetter && !($tagName !== '' && is_numeric($char))) {
                break;
            }
            $tagName .= $char;
            $pos++;
        }
        // step back onto the last consumed character,
        // the attribute loop below advances before it reads
        $pos--;

        $tagName = strtolower($tagName);
        if ($tagName === '' || !isset($this->blockElements[$tagName])) {
            // something went wrong => invalid tag
            $this->invalidTag();

            return false;
        }
        if ($this->noTagsInCode && end($this->openTags) === 'code' && !($tagName === 'code' && !$isStartTag)) {
            // we suppress all HTML tags inside code tags
            $this->invalidTag();

            return false;
        }

        // get tag attributes
        /** TODO: in html 4 attributes do not need to be quoted **/
        $isEmptyTag = false;
        $attributes = array();
        $currAttrib = '';
        // true if whitespace has been seen since the last attribute name character
        $sawWhitespace = false;
        while (isset($this->html[$pos + 1])) {
            $pos++;
            $char = $this->html[$pos];
            $next = isset($this->html[$pos + 1]) ? $this->html[$pos + 1] : '';
            // close tag
            if ($char === '>' || $char . $next === '/>') {
                if ($char === '/') {
                    $isEmptyTag = true;
                    $pos++;
                }
                break;
            }

            $pos_ord = ord(strtolower($char));
            if (($pos_ord >= static::$a_ord && $pos_ord <= static::$z_ord) || in_array($pos_ord, static::$special_ords, true)) {
                // attribute name
                if ($currAttrib !== '' && $sawWhitespace) {
                    // a new name starts after whitespace, so the previous name
                    // was a value-less attribute (<input disabled readonly>)
                    $attributes[$currAttrib] = $currAttrib;
                    $currAttrib = '';
                }
                $sawWhitespace = false;
                $currAttrib .= $char;
            } elseif (in_array($char, array(' ', "\t", "\n", "\r"), true)) {
                // drop whitespace
                $sawWhitespace = true;
            } elseif ($char === '=' && ($next === '"' || $next === "'")) {
                // get attribute value, $next is the quote character to wait for
                $valueStart = $pos + 2;
                $valueEnd = strpos($this->html, $next, $valueStart);
                if ($valueEnd === false) {
                    // unterminated attribute value
                    $this->invalidTag();

                    return false;
                }
                $attributes[$currAttrib] = (string) substr($this->html, $valueStart, $valueEnd - $valueStart);
                $currAttrib = '';
                $sawWhitespace = false;
                // continue behind the closing quote
                $pos = $valueEnd;
            } else {
                $this->invalidTag();

                return false;
            }
        }
        if (!isset($this->html[$pos]) || $this->html[$pos] !== '>') {
            $this->invalidTag();

            return false;
        }

        if ($currAttrib !== '') {
            // html 4 allows something like <option selected> instead of <option selected="selected">
            $attributes[$currAttrib] = $currAttrib;
        }
        if (!$isStartTag) {
            if (!empty($attributes) || $tagName !== end($this->openTags)) {
                // end tags must not contain any attributes
                // or maybe we did not expect a different tag to be closed
                $this->invalidTag();

                return false;
            }
            array_pop($this->openTags);
            if (in_array($tagName, $this->preformattedTags, true)) {
                $this->keepWhitespace--;
            }
        }
        $pos++;
        $this->node = substr($this->html, 0, $pos);
        $this->html = (string) substr($this->html, $pos);
        $this->tagName = $tagName;
        $this->tagAttributes = $attributes;
        $this->isStartTag = $isStartTag;
        $this->isEmptyTag = $isEmptyTag || in_array($tagName, $this->emptyTags, true);
        if ($this->isEmptyTag) {
            // might be not well formed
            $this->node = preg_replace('# */? *>$#', ' />', $this->node);
        }
        $this->nodeType = 'tag';
        $this->isBlockElement = $this->blockElements[$tagName];
        $this->isNextToInlineContext = $isStartTag && $this->isInlineContext;
        $this->isInlineContext = !$this->isBlockElement;

        return true;
    }

    /**
     * handle invalid tags: escape the leading "<" so that the markup
     * is treated as plain text by nextNode()
     *
     * @return void
     */
    protected function invalidTag()
    {
        $this->html = substr_replace($this->html, '&lt;', 0, 1);
    }

    /**
     * update all vars and make $this->html shorter
     *
     * @param string $type see description for $this->nodeType
     * @param int $pos to which position shall we cut?
     * @return void
     */
    protected function setNode($type, $pos)
    {
        if ($this->nodeType === 'tag') {
            // set tag specific vars to null
            // $type == tag should not be called here
            // see this::parseTag() for more
            $this->tagName = null;
            $this->tagAttributes = null;
            $this->isStartTag = null;
            $this->isEmptyTag = null;
            $this->isBlockElement = null;
        }
        $this->nodeType = $type;
        // substr() may return false on older PHP versions, always keep strings
        $this->node = (string) substr($this->html, 0, $pos);
        $this->html = (string) substr($this->html, $pos);
    }

    /**
     * get the offset right behind the first occurrence of $needle in $this->html
     *
     * @param string $needle
     * @return int offset behind $needle, or the length of $this->html if $needle
     *             does not occur, i.e. the rest of the document is consumed
     */
    protected function offsetAfter($needle)
    {
        $pos = strpos($this->html, $needle);
        if ($pos === false) {
            return strlen($this->html);
        }

        return $pos + strlen($needle);
    }

    /**
     * check if $this->html begins with $str
     *
     * @param string $str
     * @return bool
     */
    protected function match($str)
    {
        // byte-wise comparison, a loose == would treat numeric strings
        // like "010" and "1e1" as equal
        return strncmp((string) $this->html, $str, strlen($str)) === 0;
    }

    /**
     * truncate whitespaces
     *
     * @return void
     */
    protected function handleWhitespaces()
    {
        if ($this->keepWhitespace) {
            // <pre> or <code> before...

            return;
        }
        // truncate multiple whitespaces to a single one
        $this->node = preg_replace('#\s+#s', ' ', $this->node);
    }

    /**
     * normalize self::node, i.e. rebuild it from tag name and attributes
     *
     * @return void
     */
    protected function normalizeNode()
    {
        $this->node = '<';
        if (!$this->isStartTag) {
            $this->node .= '/' . $this->tagName . '>';

            return;
        }
        $this->node .= $this->tagName;
        // tagAttributes is null unless the current node is a tag
        if (is_array($this->tagAttributes)) {
            foreach ($this->tagAttributes as $name => $value) {
                $this->node .= ' ' . $name . '="' . str_replace('"', '&quot;', $value) . '"';
            }
        }
        if ($this->isEmptyTag) {
            $this->node .= ' /';
        }
        $this->node .= '>';
    }
}
565