1<?php
2namespace TYPO3\CMS\Core\Html;
3
4/*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17use TYPO3\CMS\Core\Utility\GeneralUtility;
18use TYPO3\CMS\Core\Utility\MathUtility;
19use TYPO3\CMS\Frontend\ContentObject\ContentObjectRenderer;
20
21/**
22 * Functions for parsing HTML.
23 * You are encouraged to use this class in your own applications
24 */
25class HtmlParser
26{
    /**
     * NOTE(review): presumably a lookup cache filled by caseShift(); that method lies
     * outside the part of the file reviewed here, so the array layout is unconfirmed.
     *
     * @var array
     */
    protected $caseShift_cache = [];

    // Void elements that do not have closing tags, as defined by HTML5, except link element.
    // The names are pipe-separated so the list can be embedded directly into a regular
    // expression alternation, which is how HTMLcleaner() consumes it.
    const VOID_ELEMENTS = 'area|base|br|col|command|embed|hr|img|input|keygen|meta|param|source|track|wbr';
34
35    /************************************
36     *
37     * Parsing HTML code
38     *
39     ************************************/
40    /**
41     * Returns an array with the $content divided by tag-blocks specified with the list of tags, $tag
42     * Even numbers in the array are outside the blocks, Odd numbers are block-content.
43     * Use ->removeFirstAndLastTag() to process the content if needed.
44     *
45     * @param string $tag List of tags, comma separated.
46     * @param string $content HTML-content
47     * @param bool $eliminateExtraEndTags If set, excessive end tags are ignored - you should probably set this in most cases.
48     * @return array Even numbers in the array are outside the blocks, Odd numbers are block-content.
49     * @see splitTags(), removeFirstAndLastTag()
50     */
51    public function splitIntoBlock($tag, $content, $eliminateExtraEndTags = false)
52    {
53        $tags = array_unique(GeneralUtility::trimExplode(',', $tag, true));
54        array_walk($tags, function (&$tag) {
55            $tag = preg_quote($tag, '/');
56        });
57        $regexStr = '/\\<\\/?(' . implode('|', $tags) . ')(\\s*\\>|\\s[^\\>]*\\>)/si';
58        $parts = preg_split($regexStr, $content);
59        $newParts = [];
60        $pointer = strlen($parts[0]);
61        $buffer = $parts[0];
62        $nested = 0;
63        reset($parts);
64        // We skip the first element in foreach loop
65        $partsSliced = array_slice($parts, 1, null, true);
66        foreach ($partsSliced as $v) {
67            $isEndTag = substr($content, $pointer, 2) === '</';
68            $tagLen = strcspn(substr($content, $pointer), '>') + 1;
69            // We meet a start-tag:
70            if (!$isEndTag) {
71                // Ground level:
72                if (!$nested) {
73                    // Previous buffer stored
74                    $newParts[] = $buffer;
75                    $buffer = '';
76                }
77                // We are inside now!
78                $nested++;
79                // New buffer set and pointer increased
80                $mbuffer = substr($content, $pointer, strlen($v) + $tagLen);
81                $pointer += strlen($mbuffer);
82                $buffer .= $mbuffer;
83            } else {
84                // If we meet an endtag:
85                // Decrease nested-level
86                $nested--;
87                $eliminated = 0;
88                if ($eliminateExtraEndTags && $nested < 0) {
89                    $nested = 0;
90                    $eliminated = 1;
91                } else {
92                    // In any case, add the endtag to current buffer and increase pointer
93                    $buffer .= substr($content, $pointer, $tagLen);
94                }
95                $pointer += $tagLen;
96                // if we're back on ground level, (and not by eliminating tags...
97                if (!$nested && !$eliminated) {
98                    $newParts[] = $buffer;
99                    $buffer = '';
100                }
101                // New buffer set and pointer increased
102                $mbuffer = substr($content, $pointer, strlen($v));
103                $pointer += strlen($mbuffer);
104                $buffer .= $mbuffer;
105            }
106        }
107        $newParts[] = $buffer;
108        return $newParts;
109    }
110
111    /**
112     * Splitting content into blocks *recursively* and processing tags/content with call back functions.
113     *
114     * @param string $tag Tag list, see splitIntoBlock()
115     * @param string $content Content, see splitIntoBlock()
116     * @param object $procObj Object where call back methods are.
117     * @param string $callBackContent Name of call back method for content; "function callBackContent($str,$level)
118     * @param string $callBackTags Name of call back method for tags; "function callBackTags($tags,$level)
119     * @param int $level Indent level
120     * @return string Processed content
121     * @see splitIntoBlock()
122     */
123    public function splitIntoBlockRecursiveProc($tag, $content, &$procObj, $callBackContent, $callBackTags, $level = 0)
124    {
125        $parts = $this->splitIntoBlock($tag, $content, true);
126        foreach ($parts as $k => $v) {
127            if ($k % 2) {
128                $firstTagName = $this->getFirstTagName($v, true);
129                $tagsArray = [];
130                $tagsArray['tag_start'] = $this->getFirstTag($v);
131                $tagsArray['tag_end'] = '</' . $firstTagName . '>';
132                $tagsArray['tag_name'] = strtolower($firstTagName);
133                $tagsArray['content'] = $this->splitIntoBlockRecursiveProc($tag, $this->removeFirstAndLastTag($v), $procObj, $callBackContent, $callBackTags, $level + 1);
134                if ($callBackTags) {
135                    $tagsArray = $procObj->{$callBackTags}($tagsArray, $level);
136                }
137                $parts[$k] = $tagsArray['tag_start'] . $tagsArray['content'] . $tagsArray['tag_end'];
138            } else {
139                if ($callBackContent) {
140                    $parts[$k] = $procObj->{$callBackContent}($parts[$k], $level);
141                }
142            }
143        }
144        return implode('', $parts);
145    }
146
147    /**
148     * Returns an array with the $content divided by tag-blocks specified with the list of tags, $tag
149     * Even numbers in the array are outside the blocks, Odd numbers are block-content.
150     * Use ->removeFirstAndLastTag() to process the content if needed.
151     *
152     * @param string $tag List of tags
153     * @param string $content HTML-content
154     * @return array Even numbers in the array are outside the blocks, Odd numbers are block-content.
155     * @see splitIntoBlock(), removeFirstAndLastTag()
156     */
157    public function splitTags($tag, $content)
158    {
159        $tags = GeneralUtility::trimExplode(',', $tag, true);
160        array_walk($tags, function (&$tag) {
161            $tag = preg_quote($tag, '/');
162        });
163        $regexStr = '/\\<(' . implode('|', $tags) . ')(\\s[^>]*)?\\/?>/si';
164        $parts = preg_split($regexStr, $content);
165        $pointer = strlen($parts[0]);
166        $newParts = [];
167        $newParts[] = $parts[0];
168        reset($parts);
169        // We skip the first element in foreach loop
170        $partsSliced = array_slice($parts, 1, null, true);
171        foreach ($partsSliced as $v) {
172            $tagLen = strcspn(substr($content, $pointer), '>') + 1;
173            // Set tag:
174            // New buffer set and pointer increased
175            $tag = substr($content, $pointer, $tagLen);
176            $newParts[] = $tag;
177            $pointer += strlen($tag);
178            // Set content:
179            $newParts[] = $v;
180            $pointer += strlen($v);
181        }
182        return $newParts;
183    }
184
185    /**
186     * Removes the first and last tag in the string
187     * Anything before the first and after the last tags respectively is also removed
188     *
189     * @param string $str String to process
190     * @return string
191     */
192    public function removeFirstAndLastTag($str)
193    {
194        $parser = SimpleParser::fromString($str);
195        $first = $parser->getFirstNode(SimpleNode::TYPE_ELEMENT);
196        $last = $parser->getLastNode(SimpleNode::TYPE_ELEMENT);
197        if ($first === null || $first === $last) {
198            return '';
199        }
200        $sequence = array_slice(
201            $parser->getNodes(),
202            $first->getIndex() + 1,
203            $last->getIndex() - $first->getIndex() - 1
204        );
205        return implode('', array_map('strval', $sequence));
206    }
207
208    /**
209     * Returns the first tag in $str
210     * Actually everything from the beginning of the $str is returned, so you better make sure the tag is the first thing...
211     *
212     * @param string $str HTML string with tags
213     * @return string
214     */
215    public function getFirstTag($str)
216    {
217        $parser = SimpleParser::fromString($str);
218        $first = $parser->getFirstNode(SimpleNode::TYPE_ELEMENT);
219        if ($first === null) {
220            return '';
221        }
222        $sequence = array_slice(
223            $parser->getNodes(),
224            0,
225            $first->getIndex() + 1
226        );
227        return implode('', array_map('strval', $sequence));
228    }
229
230    /**
231     * Returns the NAME of the first tag in $str
232     *
233     * @param string $str HTML tag (The element name MUST be separated from the attributes by a space character! Just *whitespace* will not do)
234     * @param bool $preserveCase If set, then the tag is NOT converted to uppercase by case is preserved.
235     * @return string Tag name in upper case
236     * @see getFirstTag()
237     */
238    public function getFirstTagName($str, $preserveCase = false)
239    {
240        $parser = SimpleParser::fromString($str);
241        $elements = $parser->getNodes(SimpleNode::TYPE_ELEMENT);
242        foreach ($elements as $element) {
243            $name = $element->getElementName();
244            if ($name === null) {
245                continue;
246            }
247            return $preserveCase ? $name : strtoupper($name);
248        }
249        return '';
250    }
251
252    /**
253     * Returns an array with all attributes as keys. Attributes are only lowercase a-z
254     * If an attribute is empty (shorthand), then the value for the key is empty. You can check if it existed with isset()
255     *
256     * Compared to the method in GeneralUtility::get_tag_attributes this method also returns meta data about each
257     * attribute, e.g. if it is a shorthand attribute, and what the quotation is. Also, since all attribute keys
258     * are lower-cased, the meta information contains the original attribute name.
259     *
260     * @param string $tag Tag: $tag is either a whole tag (eg '<TAG OPTION ATTRIB=VALUE>') or the parameterlist (ex ' OPTION ATTRIB=VALUE>')
261     * @param bool $deHSC If set, the attribute values are de-htmlspecialchar'ed. Should actually always be set!
262     * @return array array(Tag attributes,Attribute meta-data)
263     */
264    public function get_tag_attributes($tag, $deHSC = false)
265    {
266        list($components, $metaC) = $this->split_tag_attributes($tag);
267        // Attribute name is stored here
268        $name = '';
269        $valuemode = false;
270        $attributes = [];
271        $attributesMeta = [];
272        if (is_array($components)) {
273            foreach ($components as $key => $val) {
274                // Only if $name is set (if there is an attribute, that waits for a value), that valuemode is enabled. This ensures that the attribute is assigned it's value
275                if ($val !== '=') {
276                    if ($valuemode) {
277                        if ($name) {
278                            $attributes[$name] = $deHSC ? htmlspecialchars_decode($val) : $val;
279                            $attributesMeta[$name]['dashType'] = $metaC[$key];
280                            $name = '';
281                        }
282                    } else {
283                        if ($namekey = preg_replace('/[^[:alnum:]_\\:\\-]/', '', $val)) {
284                            $name = strtolower($namekey);
285                            $attributesMeta[$name] = [];
286                            $attributesMeta[$name]['origTag'] = $namekey;
287                            $attributes[$name] = '';
288                        }
289                    }
290                    $valuemode = false;
291                } else {
292                    $valuemode = true;
293                }
294            }
295            return [$attributes, $attributesMeta];
296        }
297    }
298
299    /**
300     * Returns an array with the 'components' from an attribute list.
301     * The result is normally analyzed by get_tag_attributes
302     * Removes tag-name if found.
303     *
304     * The difference between this method and the one in GeneralUtility is that this method actually determines
305     * more information on the attribute, e.g. if the value is enclosed by a " or ' character.
306     * That's why this method returns two arrays, the "components" and the "meta-information" of the "components".
307     *
308     * @param string $tag The tag or attributes
309     * @return array
310     * @internal
311     * @see \TYPO3\CMS\Core\Utility\GeneralUtility::split_tag_attributes()
312     */
313    public function split_tag_attributes($tag)
314    {
315        $matches = [];
316        if (preg_match('/(\\<[^\\s]+\\s+)?(.*?)\\s*(\\>)?$/s', $tag, $matches) !== 1) {
317            return [[], []];
318        }
319        $tag_tmp = $matches[2];
320        $metaValue = [];
321        $value = [];
322        $matches = [];
323        if (preg_match_all('/("[^"]*"|\'[^\']*\'|[^\\s"\'\\=]+|\\=)/s', $tag_tmp, $matches) > 0) {
324            foreach ($matches[1] as $part) {
325                $firstChar = $part[0];
326                if ($firstChar === '"' || $firstChar === '\'') {
327                    $metaValue[] = $firstChar;
328                    $value[] = substr($part, 1, -1);
329                } else {
330                    $metaValue[] = '';
331                    $value[] = $part;
332                }
333            }
334        }
335        return [$value, $metaValue];
336    }
337
338    /*********************************
339     *
340     * Clean HTML code
341     *
342     *********************************/
343    /**
344     * Function that can clean up HTML content according to configuration given in the $tags array.
345     *
346     * Initializing the $tags array to allow a list of tags (in this case <B>,<I>,<U> and <A>), set it like this:		 $tags = array_flip(explode(',','b,a,i,u'))
347     * If the value of the $tags[$tagname] entry is an array, advanced processing of the tags is initialized. These are the options:
348     *
349     * $tags[$tagname] = Array(
350     * 'overrideAttribs' => ''		If set, this string is preset as the attributes of the tag
351     * 'allowedAttribs' =>   '0' (zero) = no attributes allowed, '[commalist of attributes]' = only allowed attributes. If blank, all attributes are allowed.
352     * 'fixAttrib' => Array(
353     * '[attribute name]' => Array (
354     * 'set' => Force the attribute value to this value.
355     * 'unset' => Boolean: If set, the attribute is unset.
356     * 'default' =>	 If no attribute exists by this name, this value is set as default value (if this value is not blank)
357     * 'always' =>	 Boolean. If set, the attribute is always processed. Normally an attribute is processed only if it exists
358     * 'trim,intval,lower,upper' =>	 All booleans. If any of these keys are set, the value is passed through the respective PHP-functions.
359     * 'range' => Array ('[low limit]','[high limit, optional]')		Setting integer range.
360     * 'list' => Array ('[value1/default]','[value2]','[value3]')		Attribute must be in this list. If not, the value is set to the first element.
361     * 'removeIfFalse' =>	 Boolean/'blank'.	If set, then the attribute is removed if it is 'FALSE'. If this value is set to 'blank' then the value must be a blank string (that means a 'zero' value will not be removed)
362     * 'removeIfEquals' =>	 [value]	If the attribute value matches the value set here, then it is removed.
363     * 'casesensitiveComp' => 1	If set, then the removeIfEquals and list comparisons will be case sensitive. Otherwise not.
364     * )
365     * ),
366     * 'protect' => '',	Boolean. If set, the tag <> is converted to &lt; and &gt;
367     * 'remap' => '',		String. If set, the tagname is remapped to this tagname
368     * 'rmTagIfNoAttrib' => '',	Boolean. If set, then the tag is removed if no attributes happened to be there.
369     * 'nesting' => '',	Boolean/'global'. If set TRUE, then this tag must have starting and ending tags in the correct order. Any tags not in this order will be discarded. Thus '</B><B><I></B></I></B>' will be converted to '<B><I></B></I>'. Is the value 'global' then true nesting in relation to other tags marked for 'global' nesting control is preserved. This means that if <B> and <I> are set for global nesting then this string '</B><B><I></B></I></B>' is converted to '<B></B>'
370     * )
371     *
372     * @param string $content Is the HTML-content being processed. This is also the result being returned.
373     * @param array $tags Is an array where each key is a tagname in lowercase. Only tags present as keys in this array are preserved. The value of the key can be an array with a vast number of options to configure.
374     * @param mixed $keepAll Boolean/'protect', if set, then all tags are kept regardless of tags present as keys in $tags-array. If 'protect' then the preserved tags have their <> converted to &lt; and &gt;
375     * @param int $hSC Values -1,0,1,2: Set to zero= disabled, set to 1 then the content BETWEEN tags is htmlspecialchar()'ed, set to -1 its the opposite and set to 2 the content will be HSC'ed BUT with preservation for real entities (eg. "&amp;" or "&#234;")
376     * @param array $addConfig Configuration array send along as $conf to the internal functions
377     * @return string Processed HTML content
378     */
379    public function HTMLcleaner($content, $tags = [], $keepAll = 0, $hSC = 0, $addConfig = [])
380    {
381        $newContent = [];
382        $tokArr = explode('<', $content);
383        $newContent[] = $this->bidir_htmlspecialchars(current($tokArr), $hSC);
384        // We skip the first element in foreach loop
385        $tokArrSliced = array_slice($tokArr, 1, null, true);
386        $c = 1;
387        $tagRegister = [];
388        $tagStack = [];
389        $inComment = false;
390        $inCdata = false;
391        $skipTag = false;
392        foreach ($tokArrSliced as $tok) {
393            if ($inComment) {
394                if (($eocPos = strpos($tok, '-->')) === false) {
395                    // End of comment is not found in the token. Go further until end of comment is found in other tokens.
396                    $newContent[$c++] = '<' . $tok;
397                    continue;
398                }
399                // Comment ends in the middle of the token: add comment and proceed with rest of the token
400                $newContent[$c++] = '<' . substr($tok, 0, $eocPos + 3);
401                $tok = substr($tok, $eocPos + 3);
402                $inComment = false;
403                $skipTag = true;
404            } elseif ($inCdata) {
405                if (($eocPos = strpos($tok, '/*]]>*/')) === false) {
406                    // End of comment is not found in the token. Go further until end of comment is found in other tokens.
407                    $newContent[$c++] = '<' . $tok;
408                    continue;
409                }
410                // Comment ends in the middle of the token: add comment and proceed with rest of the token
411                $newContent[$c++] = '<' . substr($tok, 0, $eocPos + 10);
412                $tok = substr($tok, $eocPos + 10);
413                $inCdata = false;
414                $skipTag = true;
415            } elseif (strpos($tok, '!--') === 0) {
416                if (($eocPos = strpos($tok, '-->')) === false) {
                    // Comment started in this token but it does not end in the same token. Set a flag to skip till the end of comment
418                    $newContent[$c++] = '<' . $tok;
419                    $inComment = true;
420                    continue;
421                }
422                // Start and end of comment are both in the current token. Add comment and proceed with rest of the token
423                $newContent[$c++] = '<' . substr($tok, 0, $eocPos + 3);
424                $tok = substr($tok, $eocPos + 3);
425                $skipTag = true;
426            } elseif (strpos($tok, '![CDATA[*/') === 0) {
427                if (($eocPos = strpos($tok, '/*]]>*/')) === false) {
                    // CDATA section started in this token but it does not end in the same token. Set a flag to skip till the end of the CDATA section
429                    $newContent[$c++] = '<' . $tok;
430                    $inCdata = true;
431                    continue;
432                }
433                // Start and end of comment are both in the current token. Add comment and proceed with rest of the token
434                $newContent[$c++] = '<' . substr($tok, 0, $eocPos + 10);
435                $tok = substr($tok, $eocPos + 10);
436                $skipTag = true;
437            }
438            $firstChar = $tok[0] ?? null;
439            // It is a tag... (first char is a-z0-9 or /) (fixed 19/01 2004). This also avoids triggering on <?xml..> and <!DOCTYPE..>
440            if (!$skipTag && preg_match('/[[:alnum:]\\/]/', $firstChar) === 1) {
441                $tagEnd = strpos($tok, '>');
442                // If there is and end-bracket...	tagEnd can't be 0 as the first character can't be a >
443                if ($tagEnd) {
444                    $endTag = $firstChar === '/' ? 1 : 0;
445                    $tagContent = substr($tok, $endTag, $tagEnd - $endTag);
446                    $tagParts = preg_split('/\\s+/s', $tagContent, 2);
447                    $tagName = strtolower($tagParts[0]);
448                    $emptyTag = 0;
449                    if (isset($tags[$tagName])) {
450                        // If there is processing to do for the tag:
451                        if (is_array($tags[$tagName])) {
452                            if (preg_match('/^(' . self::VOID_ELEMENTS . ' )$/i', $tagName)) {
453                                $emptyTag = 1;
454                            }
455                            // If NOT an endtag, do attribute processing (added dec. 2003)
456                            if (!$endTag) {
457                                // Override attributes
458                                if (isset($tags[$tagName]['overrideAttribs']) && (string)$tags[$tagName]['overrideAttribs'] !== '') {
459                                    $tagParts[1] = $tags[$tagName]['overrideAttribs'];
460                                }
461                                // Allowed tags
462                                if (isset($tags[$tagName]['allowedAttribs']) && (string)$tags[$tagName]['allowedAttribs'] !== '') {
463                                    // No attribs allowed
464                                    if ((string)$tags[$tagName]['allowedAttribs'] === '0') {
465                                        $tagParts[1] = '';
466                                    } elseif (isset($tagParts[1]) && trim($tagParts[1])) {
467                                        $tagAttrib = $this->get_tag_attributes($tagParts[1]);
468                                        $tagParts[1] = '';
469                                        $newTagAttrib = [];
470                                        $tList = (array)(
471                                            $tags[$tagName]['_allowedAttribs']
472                                            ?? GeneralUtility::trimExplode(',', strtolower($tags[$tagName]['allowedAttribs']), true)
473                                        );
474                                        foreach ($tList as $allowTag) {
475                                            if (isset($tagAttrib[0][$allowTag])) {
476                                                $newTagAttrib[$allowTag] = $tagAttrib[0][$allowTag];
477                                            }
478                                        }
479
480                                        $tagParts[1] = $this->compileTagAttribs($newTagAttrib, $tagAttrib[1]);
481                                    }
482                                }
483                                // Fixed attrib values
484                                if (isset($tags[$tagName]['fixAttrib']) && is_array($tags[$tagName]['fixAttrib'])) {
485                                    $tagAttrib = $this->get_tag_attributes($tagParts[1]);
486                                    $tagParts[1] = '';
487                                    foreach ($tags[$tagName]['fixAttrib'] as $attr => $params) {
488                                        if (isset($params['set']) && $params['set'] !== '') {
489                                            $tagAttrib[0][$attr] = $params['set'];
490                                        }
491                                        if (!empty($params['unset'])) {
492                                            unset($tagAttrib[0][$attr]);
493                                        }
494                                        if (!empty($params['default']) && !isset($tagAttrib[0][$attr])) {
495                                            $tagAttrib[0][$attr] = $params['default'];
496                                        }
497                                        if ($params['always'] || isset($tagAttrib[0][$attr])) {
498                                            if ($params['trim']) {
499                                                $tagAttrib[0][$attr] = trim($tagAttrib[0][$attr]);
500                                            }
501                                            if ($params['intval']) {
502                                                $tagAttrib[0][$attr] = (int)$tagAttrib[0][$attr];
503                                            }
504                                            if ($params['lower']) {
505                                                $tagAttrib[0][$attr] = strtolower($tagAttrib[0][$attr]);
506                                            }
507                                            if ($params['upper']) {
508                                                $tagAttrib[0][$attr] = strtoupper($tagAttrib[0][$attr]);
509                                            }
510                                            if ($params['range']) {
511                                                if (isset($params['range'][1])) {
512                                                    $tagAttrib[0][$attr] = MathUtility::forceIntegerInRange($tagAttrib[0][$attr], (int)$params['range'][0], (int)$params['range'][1]);
513                                                } else {
514                                                    $tagAttrib[0][$attr] = MathUtility::forceIntegerInRange($tagAttrib[0][$attr], (int)$params['range'][0]);
515                                                }
516                                            }
517                                            if (isset($params['list']) && is_array($params['list'])) {
518                                                // For the class attribute, remove from the attribute value any class not in the list
519                                                // Classes are case sensitive
520                                                if ($attr === 'class') {
521                                                    $newClasses = [];
522                                                    $classes = GeneralUtility::trimExplode(' ', $tagAttrib[0][$attr], true);
523                                                    foreach ($classes as $class) {
524                                                        if (in_array($class, $params['list'])) {
525                                                            $newClasses[] = $class;
526                                                        }
527                                                    }
528                                                    if (!empty($newClasses)) {
529                                                        $tagAttrib[0][$attr] = implode(' ', $newClasses);
530                                                    } else {
531                                                        $tagAttrib[0][$attr] = $params['list'][0];
532                                                    }
533                                                } else {
534                                                    if (!in_array($this->caseShift($tagAttrib[0][$attr], $params['casesensitiveComp']), $this->caseShift($params['list'], $params['casesensitiveComp'], $tagName))) {
535                                                        $tagAttrib[0][$attr] = $params['list'][0];
536                                                    }
537                                                }
538                                            }
539                                            if ($params['removeIfFalse'] && $params['removeIfFalse'] !== 'blank' && !$tagAttrib[0][$attr] || $params['removeIfFalse'] === 'blank' && (string)$tagAttrib[0][$attr] === '') {
540                                                unset($tagAttrib[0][$attr]);
541                                            }
542                                            if ((string)$params['removeIfEquals'] !== '' && $this->caseShift($tagAttrib[0][$attr], $params['casesensitiveComp']) === $this->caseShift($params['removeIfEquals'], $params['casesensitiveComp'])) {
543                                                unset($tagAttrib[0][$attr]);
544                                            }
545                                            if ($params['prefixLocalAnchors']) {
546                                                if ($tagAttrib[0][$attr][0] === '#') {
547                                                    if ($params['prefixLocalAnchors'] == 2) {
548                                                        /** @var ContentObjectRenderer $contentObjectRenderer */
549                                                        $contentObjectRenderer = GeneralUtility::makeInstance(ContentObjectRenderer::class);
550                                                        $prefix = $contentObjectRenderer->getUrlToCurrentLocation();
551                                                    } else {
552                                                        $prefix = GeneralUtility::getIndpEnv('TYPO3_REQUEST_URL');
553                                                    }
554                                                    $tagAttrib[0][$attr] = $prefix . $tagAttrib[0][$attr];
555                                                }
556                                            }
557                                            if ($params['prefixRelPathWith']) {
558                                                $urlParts = parse_url($tagAttrib[0][$attr]);
559                                                if (!$urlParts['scheme'] && $urlParts['path'][0] !== '/') {
560                                                    // If it is NOT an absolute URL (by http: or starting "/")
561                                                    $tagAttrib[0][$attr] = $params['prefixRelPathWith'] . $tagAttrib[0][$attr];
562                                                }
563                                            }
564                                            if ($params['userFunc']) {
565                                                if (is_array($params['userFunc.'])) {
566                                                    $params['userFunc.']['attributeValue'] = $tagAttrib[0][$attr];
567                                                } else {
568                                                    $params['userFunc.'] = $tagAttrib[0][$attr];
569                                                }
570                                                $tagAttrib[0][$attr] = GeneralUtility::callUserFunction($params['userFunc'], $params['userFunc.'], $this);
571                                            }
572                                        }
573                                    }
574                                    $tagParts[1] = $this->compileTagAttribs($tagAttrib[0], $tagAttrib[1]);
575                                }
576                            } else {
577                                // If endTag, remove any possible attributes:
578                                $tagParts[1] = '';
579                            }
580                            // Protecting the tag by converting < and > to &lt; and &gt; ??
581                            if (!empty($tags[$tagName]['protect'])) {
582                                $lt = '&lt;';
583                                $gt = '&gt;';
584                            } else {
585                                $lt = '<';
586                                $gt = '>';
587                            }
588                            // Remapping tag name?
589                            if (!empty($tags[$tagName]['remap'])) {
590                                $tagParts[0] = $tags[$tagName]['remap'];
591                            }
592                            // rmTagIfNoAttrib
593                            if ($endTag || empty($tags[$tagName]['rmTagIfNoAttrib']) || trim($tagParts[1] ?? '')) {
594                                $setTag = true;
595                                // Remove this closing tag if $tagName was among $TSconfig['removeTags']
596                                if ($endTag && isset($tags[$tagName]['allowedAttribs']) && $tags[$tagName]['allowedAttribs'] === 0 && $tags[$tagName]['rmTagIfNoAttrib'] === 1) {
597                                    $setTag = false;
598                                }
599                                if (isset($tags[$tagName]['nesting'])) {
600                                    if (!isset($tagRegister[$tagName])) {
601                                        $tagRegister[$tagName] = [];
602                                    }
603                                    if ($endTag) {
604                                        $correctTag = true;
605                                        if ($tags[$tagName]['nesting'] === 'global') {
606                                            $lastEl = end($tagStack);
607                                            if ($tagName !== $lastEl) {
608                                                if (in_array($tagName, $tagStack, true)) {
609                                                    while (!empty($tagStack) && $tagName !== $lastEl) {
610                                                        $elPos = end($tagRegister[$lastEl]);
611                                                        unset($newContent[$elPos]);
612                                                        array_pop($tagRegister[$lastEl]);
613                                                        array_pop($tagStack);
614                                                        $lastEl = end($tagStack);
615                                                    }
616                                                } else {
                                                    // In this case the end tag has no matching start tag on the stack, so it is dropped
618                                                    $correctTag = false;
619                                                }
620                                            }
621                                        }
622                                        if (empty($tagRegister[$tagName]) || !$correctTag) {
623                                            $setTag = false;
624                                        } else {
625                                            array_pop($tagRegister[$tagName]);
626                                            if ($tags[$tagName]['nesting'] === 'global') {
627                                                array_pop($tagStack);
628                                            }
629                                        }
630                                    } else {
631                                        $tagRegister[$tagName][] = $c;
632                                        if ($tags[$tagName]['nesting'] === 'global') {
633                                            $tagStack[] = $tagName;
634                                        }
635                                    }
636                                }
637                                if ($setTag) {
638                                    // Setting the tag
639                                    $newContent[$c++] = $lt . ($endTag ? '/' : '') . trim($tagParts[0] . ' ' . ($tagParts[1] ?? '')) . ($emptyTag ? ' /' : '') . $gt;
640                                }
641                            }
642                        } else {
643                            $newContent[$c++] = '<' . ($endTag ? '/' : '') . $tagContent . '>';
644                        }
645                    } elseif ($keepAll) {
646                        // This is if the tag was not defined in the array for processing:
647                        if ($keepAll === 'protect') {
648                            $lt = '&lt;';
649                            $gt = '&gt;';
650                        } else {
651                            $lt = '<';
652                            $gt = '>';
653                        }
654                        $newContent[$c++] = $lt . ($endTag ? '/' : '') . $tagContent . $gt;
655                    }
656                    $newContent[$c++] = $this->bidir_htmlspecialchars(substr($tok, $tagEnd + 1), $hSC);
657                } else {
658                    $newContent[$c++] = $this->bidir_htmlspecialchars('<' . $tok, $hSC);
659                }
660            } else {
661                $newContent[$c++] = $this->bidir_htmlspecialchars(($skipTag ? '' : '<') . $tok, $hSC);
662                // It was not a tag anyways
663                $skipTag = false;
664            }
665        }
666        // Unsetting tags:
667        foreach ($tagRegister as $tag => $positions) {
668            foreach ($positions as $pKey) {
669                unset($newContent[$pKey]);
670            }
671        }
672        $newContent = implode('', $newContent);
673        $newContent = $this->stripEmptyTagsIfConfigured($newContent, $addConfig);
674        return $newContent;
675    }
676
677    /**
678     * Converts htmlspecialchars forth ($dir=1) AND back ($dir=-1)
679     *
680     * @param string $value Input value
681     * @param int $dir Direction: forth ($dir=1, dir=2 for preserving entities) AND back ($dir=-1)
682     * @return string Output value
683     */
684    public function bidir_htmlspecialchars($value, $dir)
685    {
686        switch ((int)$dir) {
687            case 1:
688                return htmlspecialchars($value);
689            case 2:
690                return htmlspecialchars($value, ENT_COMPAT, 'UTF-8', false);
691            case -1:
692                return htmlspecialchars_decode($value);
693            default:
694                return $value;
695        }
696    }
697
698    /**
699     * Prefixes the relative paths of hrefs/src/action in the tags [td,table,body,img,input,form,link,script,a] in the $content with the $main_prefix or and alternative given by $alternatives
700     *
701     * @param string $main_prefix Prefix string
702     * @param string $content HTML content
703     * @param array $alternatives Array with alternative prefixes for certain of the tags. key=>value pairs where the keys are the tag element names in uppercase
704     * @param string $suffix Suffix string (put after the resource).
705     * @return string Processed HTML content
706     */
707    public function prefixResourcePath($main_prefix, $content, $alternatives = [], $suffix = '')
708    {
709        $parts = $this->splitTags('embed,td,table,body,img,input,form,link,script,a,param', $content);
710        foreach ($parts as $k => $v) {
711            if ($k % 2) {
712                $params = $this->get_tag_attributes($v);
713                // Detect tag-ending so that it is re-applied correctly.
714                $tagEnd = substr($v, -2) === '/>' ? ' />' : '>';
715                // The 'name' of the first tag
716                $firstTagName = $this->getFirstTagName($v);
717                $somethingDone = 0;
718                $prefix = $alternatives[strtoupper($firstTagName)] ?? $main_prefix;
719                switch (strtolower($firstTagName)) {
720                    case 'td':
721
722                    case 'body':
723
724                    case 'table':
725                        $src = $params[0]['background'];
726                        if ($src) {
727                            $params[0]['background'] = $this->prefixRelPath($prefix, $params[0]['background'], $suffix);
728                            $somethingDone = 1;
729                        }
730                        break;
731                    case 'img':
732
733                    case 'input':
734
735                    case 'script':
736
737                    case 'embed':
738                        $src = $params[0]['src'];
739                        if ($src) {
740                            $params[0]['src'] = $this->prefixRelPath($prefix, $params[0]['src'], $suffix);
741                            $somethingDone = 1;
742                        }
743                        break;
744                    case 'link':
745
746                    case 'a':
747                        $src = $params[0]['href'];
748                        if ($src) {
749                            $params[0]['href'] = $this->prefixRelPath($prefix, $params[0]['href'], $suffix);
750                            $somethingDone = 1;
751                        }
752                        break;
753                    case 'form':
754                        $src = $params[0]['action'];
755                        if ($src) {
756                            $params[0]['action'] = $this->prefixRelPath($prefix, $params[0]['action'], $suffix);
757                            $somethingDone = 1;
758                        }
759                        break;
760                    case 'param':
761                        $test = $params[0]['name'];
762                        if ($test && $test === 'movie') {
763                            if ($params[0]['value']) {
764                                $params[0]['value'] = $this->prefixRelPath($prefix, $params[0]['value'], $suffix);
765                                $somethingDone = 1;
766                            }
767                        }
768                        break;
769                }
770                if ($somethingDone) {
771                    $tagParts = preg_split('/\\s+/s', $v, 2);
772                    $tagParts[1] = $this->compileTagAttribs($params[0], $params[1]);
773                    $parts[$k] = '<' . trim(strtolower($firstTagName) . ' ' . $tagParts[1]) . $tagEnd;
774                }
775            }
776        }
777        $content = implode('', $parts);
778        // Fix <style> section:
779        $prefix = $alternatives['style'] ?? $main_prefix;
780        if ((string)$prefix !== '') {
781            $parts = $this->splitIntoBlock('style', $content);
782            foreach ($parts as $k => &$part) {
783                if ($k % 2) {
784                    $part = preg_replace('/(url[[:space:]]*\\([[:space:]]*["\']?)([^"\')]*)(["\']?[[:space:]]*\\))/i', '\\1' . $prefix . '\\2' . $suffix . '\\3', $part);
785                }
786            }
787            unset($part);
788            $content = implode('', $parts);
789        }
790        return $content;
791    }
792
793    /**
794     * Internal sub-function for ->prefixResourcePath()
795     *
796     * @param string $prefix Prefix string
797     * @param string $srcVal Relative path/URL
798     * @param string $suffix Suffix string
799     * @return string Output path, prefixed if no scheme in input string
800     * @internal
801     */
802    public function prefixRelPath($prefix, $srcVal, $suffix = '')
803    {
804        // Only prefix if it's not an absolute URL or
805        // only a link to a section within the page.
806        if ($srcVal[0] !== '/' && $srcVal[0] !== '#') {
807            $urlParts = parse_url($srcVal);
808            // Only prefix URLs without a scheme
809            if (!$urlParts['scheme']) {
810                $srcVal = $prefix . $srcVal . $suffix;
811            }
812        }
813        return $srcVal;
814    }
815
816    /**
817     * Internal function for case shifting of a string or whole array
818     *
819     * @param mixed $str Input string/array
820     * @param bool $caseSensitiveComparison If this value is FALSE, the string is returned in uppercase
821     * @param string $cacheKey Key string used for internal caching of the results. Could be an MD5 hash of the serialized version of the input $str if that is an array.
822     * @return string Output string, processed
823     * @internal
824     */
825    public function caseShift($str, $caseSensitiveComparison, $cacheKey = '')
826    {
827        if ($caseSensitiveComparison) {
828            return $str;
829        }
830        if (is_array($str)) {
831            // Fetch from runlevel cache
832            if ($cacheKey && isset($this->caseShift_cache[$cacheKey])) {
833                $str = $this->caseShift_cache[$cacheKey];
834            } else {
835                array_walk($str, function (&$value) {
836                    $value = strtoupper($value);
837                });
838                if ($cacheKey) {
839                    $this->caseShift_cache[$cacheKey] = $str;
840                }
841            }
842        } else {
843            $str = strtoupper($str);
844        }
845        return $str;
846    }
847
848    /**
849     * Compiling an array with tag attributes into a string
850     *
851     * @param array $tagAttrib Tag attributes
852     * @param array $meta Meta information about these attributes (like if they were quoted)
853     * @return string Imploded attributes, eg: 'attribute="value" attrib2="value2"'
854     * @internal
855     */
856    public function compileTagAttribs($tagAttrib, $meta = [])
857    {
858        $accu = [];
859        foreach ($tagAttrib as $k => $v) {
860            $attr = $meta[$k]['origTag'] ?: $k;
861            if (strcmp($v, '') || isset($meta[$k]['dashType'])) {
862                $dash = $meta[$k]['dashType'] ?: (MathUtility::canBeInterpretedAsInteger($v) ? '' : '"');
863                $attr .= '=' . $dash . $v . $dash;
864            }
865            $accu[] = $attr;
866        }
867        return implode(' ', $accu);
868    }
869
870    /**
871     * Converts TSconfig into an array for the HTMLcleaner function.
872     *
873     * @param array $TSconfig TSconfig for HTMLcleaner
874     * @param array $keepTags Array of tags to keep (?)
875     * @return array
876     * @internal
877     */
878    public function HTMLparserConfig($TSconfig, $keepTags = [])
879    {
880        // Allow tags (base list, merged with incoming array)
881        $alTags = array_flip(GeneralUtility::trimExplode(',', strtolower($TSconfig['allowTags'] ?? ''), true));
882        $keepTags = array_merge($alTags, $keepTags);
883        // Set config properties.
884        if (isset($TSconfig['tags.']) && is_array($TSconfig['tags.'])) {
885            foreach ($TSconfig['tags.'] as $key => $tagC) {
886                if (!is_array($tagC) && $key == strtolower($key)) {
887                    if ((string)$tagC === '0') {
888                        unset($keepTags[$key]);
889                    }
890                    if ((string)$tagC === '1' && !isset($keepTags[$key])) {
891                        $keepTags[$key] = 1;
892                    }
893                }
894            }
895            foreach ($TSconfig['tags.'] as $key => $tagC) {
896                if (is_array($tagC) && $key == strtolower($key)) {
897                    $key = substr($key, 0, -1);
898                    if (!is_array($keepTags[$key])) {
899                        $keepTags[$key] = [];
900                    }
901                    if (isset($tagC['fixAttrib.']) && is_array($tagC['fixAttrib.'])) {
902                        foreach ($tagC['fixAttrib.'] as $atName => $atConfig) {
903                            if (is_array($atConfig)) {
904                                $atName = substr($atName, 0, -1);
905                                if (!is_array($keepTags[$key]['fixAttrib'][$atName])) {
906                                    $keepTags[$key]['fixAttrib'][$atName] = [];
907                                }
908                                $keepTags[$key]['fixAttrib'][$atName] = array_merge($keepTags[$key]['fixAttrib'][$atName], $atConfig);
909                                if ((string)$keepTags[$key]['fixAttrib'][$atName]['range'] !== '') {
910                                    $keepTags[$key]['fixAttrib'][$atName]['range'] = GeneralUtility::trimExplode(',', $keepTags[$key]['fixAttrib'][$atName]['range']);
911                                }
912                                if ((string)$keepTags[$key]['fixAttrib'][$atName]['list'] !== '') {
913                                    $keepTags[$key]['fixAttrib'][$atName]['list'] = GeneralUtility::trimExplode(',', $keepTags[$key]['fixAttrib'][$atName]['list']);
914                                }
915                            }
916                        }
917                    }
918                    unset($tagC['fixAttrib.'], $tagC['fixAttrib']);
919                    if (!empty($tagC['rmTagIfNoAttrib']) && empty($tagC['nesting'])) {
920                        $tagC['nesting'] = 1;
921                    }
922                    $keepTags[$key] = array_merge($keepTags[$key], $tagC);
923                }
924            }
925        }
926        // LocalNesting
927        if (!empty($TSconfig['localNesting'])) {
928            $lN = GeneralUtility::trimExplode(',', strtolower($TSconfig['localNesting']), true);
929            foreach ($lN as $tn) {
930                if (isset($keepTags[$tn])) {
931                    if (!is_array($keepTags[$tn])) {
932                        $keepTags[$tn] = [];
933                    }
934                    $keepTags[$tn]['nesting'] = 1;
935                }
936            }
937        }
938        if (!empty($TSconfig['globalNesting'])) {
939            $lN = GeneralUtility::trimExplode(',', strtolower($TSconfig['globalNesting']), true);
940            foreach ($lN as $tn) {
941                if (isset($keepTags[$tn])) {
942                    if (!is_array($keepTags[$tn])) {
943                        $keepTags[$tn] = [];
944                    }
945                    $keepTags[$tn]['nesting'] = 'global';
946                }
947            }
948        }
949        if (!empty($TSconfig['rmTagIfNoAttrib'])) {
950            $lN = GeneralUtility::trimExplode(',', strtolower($TSconfig['rmTagIfNoAttrib']), true);
951            foreach ($lN as $tn) {
952                if (isset($keepTags[$tn])) {
953                    if (!is_array($keepTags[$tn])) {
954                        $keepTags[$tn] = [];
955                    }
956                    $keepTags[$tn]['rmTagIfNoAttrib'] = 1;
957                    if (empty($keepTags[$tn]['nesting'])) {
958                        $keepTags[$tn]['nesting'] = 1;
959                    }
960                }
961            }
962        }
963        if (!empty($TSconfig['noAttrib'])) {
964            $lN = GeneralUtility::trimExplode(',', strtolower($TSconfig['noAttrib']), true);
965            foreach ($lN as $tn) {
966                if (isset($keepTags[$tn])) {
967                    if (!is_array($keepTags[$tn])) {
968                        $keepTags[$tn] = [];
969                    }
970                    $keepTags[$tn]['allowedAttribs'] = 0;
971                }
972            }
973        }
974        if (!empty($TSconfig['removeTags'])) {
975            $lN = GeneralUtility::trimExplode(',', strtolower($TSconfig['removeTags']), true);
976            foreach ($lN as $tn) {
977                $keepTags[$tn] = [];
978                $keepTags[$tn]['allowedAttribs'] = 0;
979                $keepTags[$tn]['rmTagIfNoAttrib'] = 1;
980            }
981        }
982        // Create additional configuration:
983        $addConfig = [];
984        if (isset($TSconfig['stripEmptyTags'])) {
985            $addConfig['stripEmptyTags'] = $TSconfig['stripEmptyTags'];
986            if (isset($TSconfig['stripEmptyTags.'])) {
987                $addConfig['stripEmptyTags.'] = $TSconfig['stripEmptyTags.'];
988            }
989        }
990        return [
991            $keepTags,
992            '' . ($TSconfig['keepNonMatchedTags'] ?? ''),
993            (int)($TSconfig['htmlSpecialChars'] ?? 0),
994            $addConfig
995        ];
996    }
997
998    /**
999     * Strips empty tags from HTML.
1000     *
1001     * @param string $content The content to be stripped of empty tags
1002     * @param string $tagList The comma separated list of tags to be stripped.
1003     *                        If empty, all empty tags will be stripped
1004     * @param bool $treatNonBreakingSpaceAsEmpty If TRUE tags containing only &nbsp; entities will be treated as empty.
1005     * @param bool $keepTags If true, the provided tags will be kept instead of stripped.
1006     * @return string the stripped content
1007     */
1008    public function stripEmptyTags($content, $tagList = '', $treatNonBreakingSpaceAsEmpty = false, $keepTags = false)
1009    {
1010        if (!empty($tagList)) {
1011            $tagRegEx = implode('|', GeneralUtility::trimExplode(',', $tagList, true));
1012            if ($keepTags) {
1013                $tagRegEx = '(?!' . $tagRegEx . ')[^ >]+';
1014            }
1015        } else {
1016            $tagRegEx = '[^ >]+'; // all characters until you reach a > or space;
1017        }
1018        $count = 1;
1019        $nbspRegex = $treatNonBreakingSpaceAsEmpty ? '|(&nbsp;)' : '';
1020        $finalRegex = sprintf('/<(%s)[^>]*>( %s)*<\/\\1[^>]*>/i', $tagRegEx, $nbspRegex);
1021        while ($count !== 0) {
1022            $content = preg_replace($finalRegex, '', $content, -1, $count);
1023        }
1024        return $content;
1025    }
1026
1027    /**
1028     * Strips the configured empty tags from the HMTL code.
1029     *
1030     * @param string $value
1031     * @param array $configuration
1032     * @return string
1033     */
1034    protected function stripEmptyTagsIfConfigured($value, $configuration)
1035    {
1036        if (empty($configuration['stripEmptyTags'])) {
1037            return $value;
1038        }
1039
1040        $tags = null;
1041        $keepTags = false;
1042        if (!empty($configuration['stripEmptyTags.']['keepTags'])) {
1043            $tags = $configuration['stripEmptyTags.']['keepTags'];
1044            $keepTags = true;
1045        } elseif (!empty($configuration['stripEmptyTags.']['tags'])) {
1046            $tags = $configuration['stripEmptyTags.']['tags'];
1047        }
1048
1049        $treatNonBreakingSpaceAsEmpty = !empty($configuration['stripEmptyTags.']['treatNonBreakingSpaceAsEmpty']);
1050
1051        return $this->stripEmptyTags($value, $tags, $treatNonBreakingSpaceAsEmpty, $keepTags);
1052    }
1053}
1054