1<?php 2namespace TYPO3\CMS\Core\Html; 3 4/* 5 * This file is part of the TYPO3 CMS project. 6 * 7 * It is free software; you can redistribute it and/or modify it under 8 * the terms of the GNU General Public License, either version 2 9 * of the License, or any later version. 10 * 11 * For the full copyright and license information, please read the 12 * LICENSE.txt file that was distributed with this source code. 13 * 14 * The TYPO3 project - inspiring people to share! 15 */ 16 17use TYPO3\CMS\Core\Utility\GeneralUtility; 18use TYPO3\CMS\Core\Utility\MathUtility; 19use TYPO3\CMS\Frontend\ContentObject\ContentObjectRenderer; 20 21/** 22 * Functions for parsing HTML. 23 * You are encouraged to use this class in your own applications 24 */ 25class HtmlParser 26{ 27 /** 28 * @var array 29 */ 30 protected $caseShift_cache = []; 31 32 // Void elements that do not have closing tags, as defined by HTML5, except link element 33 const VOID_ELEMENTS = 'area|base|br|col|command|embed|hr|img|input|keygen|meta|param|source|track|wbr'; 34 35 /************************************ 36 * 37 * Parsing HTML code 38 * 39 ************************************/ 40 /** 41 * Returns an array with the $content divided by tag-blocks specified with the list of tags, $tag 42 * Even numbers in the array are outside the blocks, Odd numbers are block-content. 43 * Use ->removeFirstAndLastTag() to process the content if needed. 44 * 45 * @param string $tag List of tags, comma separated. 46 * @param string $content HTML-content 47 * @param bool $eliminateExtraEndTags If set, excessive end tags are ignored - you should probably set this in most cases. 48 * @return array Even numbers in the array are outside the blocks, Odd numbers are block-content. 
49 * @see splitTags(), removeFirstAndLastTag() 50 */ 51 public function splitIntoBlock($tag, $content, $eliminateExtraEndTags = false) 52 { 53 $tags = array_unique(GeneralUtility::trimExplode(',', $tag, true)); 54 array_walk($tags, function (&$tag) { 55 $tag = preg_quote($tag, '/'); 56 }); 57 $regexStr = '/\\<\\/?(' . implode('|', $tags) . ')(\\s*\\>|\\s[^\\>]*\\>)/si'; 58 $parts = preg_split($regexStr, $content); 59 $newParts = []; 60 $pointer = strlen($parts[0]); 61 $buffer = $parts[0]; 62 $nested = 0; 63 reset($parts); 64 // We skip the first element in foreach loop 65 $partsSliced = array_slice($parts, 1, null, true); 66 foreach ($partsSliced as $v) { 67 $isEndTag = substr($content, $pointer, 2) === '</'; 68 $tagLen = strcspn(substr($content, $pointer), '>') + 1; 69 // We meet a start-tag: 70 if (!$isEndTag) { 71 // Ground level: 72 if (!$nested) { 73 // Previous buffer stored 74 $newParts[] = $buffer; 75 $buffer = ''; 76 } 77 // We are inside now! 78 $nested++; 79 // New buffer set and pointer increased 80 $mbuffer = substr($content, $pointer, strlen($v) + $tagLen); 81 $pointer += strlen($mbuffer); 82 $buffer .= $mbuffer; 83 } else { 84 // If we meet an endtag: 85 // Decrease nested-level 86 $nested--; 87 $eliminated = 0; 88 if ($eliminateExtraEndTags && $nested < 0) { 89 $nested = 0; 90 $eliminated = 1; 91 } else { 92 // In any case, add the endtag to current buffer and increase pointer 93 $buffer .= substr($content, $pointer, $tagLen); 94 } 95 $pointer += $tagLen; 96 // if we're back on ground level, (and not by eliminating tags... 97 if (!$nested && !$eliminated) { 98 $newParts[] = $buffer; 99 $buffer = ''; 100 } 101 // New buffer set and pointer increased 102 $mbuffer = substr($content, $pointer, strlen($v)); 103 $pointer += strlen($mbuffer); 104 $buffer .= $mbuffer; 105 } 106 } 107 $newParts[] = $buffer; 108 return $newParts; 109 } 110 111 /** 112 * Splitting content into blocks *recursively* and processing tags/content with call back functions. 
113 * 114 * @param string $tag Tag list, see splitIntoBlock() 115 * @param string $content Content, see splitIntoBlock() 116 * @param object $procObj Object where call back methods are. 117 * @param string $callBackContent Name of call back method for content; "function callBackContent($str,$level) 118 * @param string $callBackTags Name of call back method for tags; "function callBackTags($tags,$level) 119 * @param int $level Indent level 120 * @return string Processed content 121 * @see splitIntoBlock() 122 */ 123 public function splitIntoBlockRecursiveProc($tag, $content, &$procObj, $callBackContent, $callBackTags, $level = 0) 124 { 125 $parts = $this->splitIntoBlock($tag, $content, true); 126 foreach ($parts as $k => $v) { 127 if ($k % 2) { 128 $firstTagName = $this->getFirstTagName($v, true); 129 $tagsArray = []; 130 $tagsArray['tag_start'] = $this->getFirstTag($v); 131 $tagsArray['tag_end'] = '</' . $firstTagName . '>'; 132 $tagsArray['tag_name'] = strtolower($firstTagName); 133 $tagsArray['content'] = $this->splitIntoBlockRecursiveProc($tag, $this->removeFirstAndLastTag($v), $procObj, $callBackContent, $callBackTags, $level + 1); 134 if ($callBackTags) { 135 $tagsArray = $procObj->{$callBackTags}($tagsArray, $level); 136 } 137 $parts[$k] = $tagsArray['tag_start'] . $tagsArray['content'] . $tagsArray['tag_end']; 138 } else { 139 if ($callBackContent) { 140 $parts[$k] = $procObj->{$callBackContent}($parts[$k], $level); 141 } 142 } 143 } 144 return implode('', $parts); 145 } 146 147 /** 148 * Returns an array with the $content divided by tag-blocks specified with the list of tags, $tag 149 * Even numbers in the array are outside the blocks, Odd numbers are block-content. 150 * Use ->removeFirstAndLastTag() to process the content if needed. 151 * 152 * @param string $tag List of tags 153 * @param string $content HTML-content 154 * @return array Even numbers in the array are outside the blocks, Odd numbers are block-content. 
155 * @see splitIntoBlock(), removeFirstAndLastTag() 156 */ 157 public function splitTags($tag, $content) 158 { 159 $tags = GeneralUtility::trimExplode(',', $tag, true); 160 array_walk($tags, function (&$tag) { 161 $tag = preg_quote($tag, '/'); 162 }); 163 $regexStr = '/\\<(' . implode('|', $tags) . ')(\\s[^>]*)?\\/?>/si'; 164 $parts = preg_split($regexStr, $content); 165 $pointer = strlen($parts[0]); 166 $newParts = []; 167 $newParts[] = $parts[0]; 168 reset($parts); 169 // We skip the first element in foreach loop 170 $partsSliced = array_slice($parts, 1, null, true); 171 foreach ($partsSliced as $v) { 172 $tagLen = strcspn(substr($content, $pointer), '>') + 1; 173 // Set tag: 174 // New buffer set and pointer increased 175 $tag = substr($content, $pointer, $tagLen); 176 $newParts[] = $tag; 177 $pointer += strlen($tag); 178 // Set content: 179 $newParts[] = $v; 180 $pointer += strlen($v); 181 } 182 return $newParts; 183 } 184 185 /** 186 * Removes the first and last tag in the string 187 * Anything before the first and after the last tags respectively is also removed 188 * 189 * @param string $str String to process 190 * @return string 191 */ 192 public function removeFirstAndLastTag($str) 193 { 194 $parser = SimpleParser::fromString($str); 195 $first = $parser->getFirstNode(SimpleNode::TYPE_ELEMENT); 196 $last = $parser->getLastNode(SimpleNode::TYPE_ELEMENT); 197 if ($first === null || $first === $last) { 198 return ''; 199 } 200 $sequence = array_slice( 201 $parser->getNodes(), 202 $first->getIndex() + 1, 203 $last->getIndex() - $first->getIndex() - 1 204 ); 205 return implode('', array_map('strval', $sequence)); 206 } 207 208 /** 209 * Returns the first tag in $str 210 * Actually everything from the beginning of the $str is returned, so you better make sure the tag is the first thing... 
211 * 212 * @param string $str HTML string with tags 213 * @return string 214 */ 215 public function getFirstTag($str) 216 { 217 $parser = SimpleParser::fromString($str); 218 $first = $parser->getFirstNode(SimpleNode::TYPE_ELEMENT); 219 if ($first === null) { 220 return ''; 221 } 222 $sequence = array_slice( 223 $parser->getNodes(), 224 0, 225 $first->getIndex() + 1 226 ); 227 return implode('', array_map('strval', $sequence)); 228 } 229 230 /** 231 * Returns the NAME of the first tag in $str 232 * 233 * @param string $str HTML tag (The element name MUST be separated from the attributes by a space character! Just *whitespace* will not do) 234 * @param bool $preserveCase If set, then the tag is NOT converted to uppercase by case is preserved. 235 * @return string Tag name in upper case 236 * @see getFirstTag() 237 */ 238 public function getFirstTagName($str, $preserveCase = false) 239 { 240 $parser = SimpleParser::fromString($str); 241 $elements = $parser->getNodes(SimpleNode::TYPE_ELEMENT); 242 foreach ($elements as $element) { 243 $name = $element->getElementName(); 244 if ($name === null) { 245 continue; 246 } 247 return $preserveCase ? $name : strtoupper($name); 248 } 249 return ''; 250 } 251 252 /** 253 * Returns an array with all attributes as keys. Attributes are only lowercase a-z 254 * If an attribute is empty (shorthand), then the value for the key is empty. You can check if it existed with isset() 255 * 256 * Compared to the method in GeneralUtility::get_tag_attributes this method also returns meta data about each 257 * attribute, e.g. if it is a shorthand attribute, and what the quotation is. Also, since all attribute keys 258 * are lower-cased, the meta information contains the original attribute name. 259 * 260 * @param string $tag Tag: $tag is either a whole tag (eg '<TAG OPTION ATTRIB=VALUE>') or the parameterlist (ex ' OPTION ATTRIB=VALUE>') 261 * @param bool $deHSC If set, the attribute values are de-htmlspecialchar'ed. 
Should actually always be set! 262 * @return array array(Tag attributes,Attribute meta-data) 263 */ 264 public function get_tag_attributes($tag, $deHSC = false) 265 { 266 list($components, $metaC) = $this->split_tag_attributes($tag); 267 // Attribute name is stored here 268 $name = ''; 269 $valuemode = false; 270 $attributes = []; 271 $attributesMeta = []; 272 if (is_array($components)) { 273 foreach ($components as $key => $val) { 274 // Only if $name is set (if there is an attribute, that waits for a value), that valuemode is enabled. This ensures that the attribute is assigned it's value 275 if ($val !== '=') { 276 if ($valuemode) { 277 if ($name) { 278 $attributes[$name] = $deHSC ? htmlspecialchars_decode($val) : $val; 279 $attributesMeta[$name]['dashType'] = $metaC[$key]; 280 $name = ''; 281 } 282 } else { 283 if ($namekey = preg_replace('/[^[:alnum:]_\\:\\-]/', '', $val)) { 284 $name = strtolower($namekey); 285 $attributesMeta[$name] = []; 286 $attributesMeta[$name]['origTag'] = $namekey; 287 $attributes[$name] = ''; 288 } 289 } 290 $valuemode = false; 291 } else { 292 $valuemode = true; 293 } 294 } 295 return [$attributes, $attributesMeta]; 296 } 297 } 298 299 /** 300 * Returns an array with the 'components' from an attribute list. 301 * The result is normally analyzed by get_tag_attributes 302 * Removes tag-name if found. 303 * 304 * The difference between this method and the one in GeneralUtility is that this method actually determines 305 * more information on the attribute, e.g. if the value is enclosed by a " or ' character. 306 * That's why this method returns two arrays, the "components" and the "meta-information" of the "components". 
307 * 308 * @param string $tag The tag or attributes 309 * @return array 310 * @internal 311 * @see \TYPO3\CMS\Core\Utility\GeneralUtility::split_tag_attributes() 312 */ 313 public function split_tag_attributes($tag) 314 { 315 $matches = []; 316 if (preg_match('/(\\<[^\\s]+\\s+)?(.*?)\\s*(\\>)?$/s', $tag, $matches) !== 1) { 317 return [[], []]; 318 } 319 $tag_tmp = $matches[2]; 320 $metaValue = []; 321 $value = []; 322 $matches = []; 323 if (preg_match_all('/("[^"]*"|\'[^\']*\'|[^\\s"\'\\=]+|\\=)/s', $tag_tmp, $matches) > 0) { 324 foreach ($matches[1] as $part) { 325 $firstChar = $part[0]; 326 if ($firstChar === '"' || $firstChar === '\'') { 327 $metaValue[] = $firstChar; 328 $value[] = substr($part, 1, -1); 329 } else { 330 $metaValue[] = ''; 331 $value[] = $part; 332 } 333 } 334 } 335 return [$value, $metaValue]; 336 } 337 338 /********************************* 339 * 340 * Clean HTML code 341 * 342 *********************************/ 343 /** 344 * Function that can clean up HTML content according to configuration given in the $tags array. 345 * 346 * Initializing the $tags array to allow a list of tags (in this case <B>,<I>,<U> and <A>), set it like this: $tags = array_flip(explode(',','b,a,i,u')) 347 * If the value of the $tags[$tagname] entry is an array, advanced processing of the tags is initialized. These are the options: 348 * 349 * $tags[$tagname] = Array( 350 * 'overrideAttribs' => '' If set, this string is preset as the attributes of the tag 351 * 'allowedAttribs' => '0' (zero) = no attributes allowed, '[commalist of attributes]' = only allowed attributes. If blank, all attributes are allowed. 352 * 'fixAttrib' => Array( 353 * '[attribute name]' => Array ( 354 * 'set' => Force the attribute value to this value. 355 * 'unset' => Boolean: If set, the attribute is unset. 356 * 'default' => If no attribute exists by this name, this value is set as default value (if this value is not blank) 357 * 'always' => Boolean. If set, the attribute is always processed. 
     *                         Normally an attribute is processed only if it exists
     *             'trim,intval,lower,upper' => All booleans. If any of these keys are set, the value is passed through the respective PHP-functions.
     *             'range' => Array ('[low limit]','[high limit, optional]') Setting integer range.
     *             'list' => Array ('[value1/default]','[value2]','[value3]') Attribute must be in this list. If not, the value is set to the first element.
     *             'removeIfFalse' => Boolean/'blank'. If set, then the attribute is removed if it is 'FALSE'. If this value is set to 'blank' then the value must be a blank string (that means a 'zero' value will not be removed)
     *             'removeIfEquals' => [value] If the attribute value matches the value set here, then it is removed.
     *             'casesensitiveComp' => 1 If set, then the removeIfEquals and list comparisons will be case sensitive. Otherwise not.
     *         )
     *     ),
     *     'protect' => '', Boolean. If set, the tag <> is converted to &lt; and &gt;
     *     'remap' => '', String. If set, the tagname is remapped to this tagname
     *     'rmTagIfNoAttrib' => '', Boolean. If set, then the tag is removed if no attributes happened to be there.
     *     'nesting' => '', Boolean/'global'. If set TRUE, then this tag must have starting and ending tags in the correct order. Any tags not in this order will be discarded. Thus '</B><B><I></B></I></B>' will be converted to '<B><I></B></I>'. Is the value 'global' then true nesting in relation to other tags marked for 'global' nesting control is preserved. This means that if <B> and <I> are set for global nesting then this string '</B><B><I></B></I></B>' is converted to '<B></B>'
     * )
     *
     * @param string $content Is the HTML-content being processed. This is also the result being returned.
     * @param array $tags Is an array where each key is a tagname in lowercase. Only tags present as keys in this array are preserved. The value of the key can be an array with a vast number of options to configure.
374 * @param mixed $keepAll Boolean/'protect', if set, then all tags are kept regardless of tags present as keys in $tags-array. If 'protect' then the preserved tags have their <> converted to < and > 375 * @param int $hSC Values -1,0,1,2: Set to zero= disabled, set to 1 then the content BETWEEN tags is htmlspecialchar()'ed, set to -1 its the opposite and set to 2 the content will be HSC'ed BUT with preservation for real entities (eg. "&" or "ê") 376 * @param array $addConfig Configuration array send along as $conf to the internal functions 377 * @return string Processed HTML content 378 */ 379 public function HTMLcleaner($content, $tags = [], $keepAll = 0, $hSC = 0, $addConfig = []) 380 { 381 $newContent = []; 382 $tokArr = explode('<', $content); 383 $newContent[] = $this->bidir_htmlspecialchars(current($tokArr), $hSC); 384 // We skip the first element in foreach loop 385 $tokArrSliced = array_slice($tokArr, 1, null, true); 386 $c = 1; 387 $tagRegister = []; 388 $tagStack = []; 389 $inComment = false; 390 $inCdata = false; 391 $skipTag = false; 392 foreach ($tokArrSliced as $tok) { 393 if ($inComment) { 394 if (($eocPos = strpos($tok, '-->')) === false) { 395 // End of comment is not found in the token. Go further until end of comment is found in other tokens. 396 $newContent[$c++] = '<' . $tok; 397 continue; 398 } 399 // Comment ends in the middle of the token: add comment and proceed with rest of the token 400 $newContent[$c++] = '<' . substr($tok, 0, $eocPos + 3); 401 $tok = substr($tok, $eocPos + 3); 402 $inComment = false; 403 $skipTag = true; 404 } elseif ($inCdata) { 405 if (($eocPos = strpos($tok, '/*]]>*/')) === false) { 406 // End of comment is not found in the token. Go further until end of comment is found in other tokens. 407 $newContent[$c++] = '<' . $tok; 408 continue; 409 } 410 // Comment ends in the middle of the token: add comment and proceed with rest of the token 411 $newContent[$c++] = '<' . 
substr($tok, 0, $eocPos + 10); 412 $tok = substr($tok, $eocPos + 10); 413 $inCdata = false; 414 $skipTag = true; 415 } elseif (strpos($tok, '!--') === 0) { 416 if (($eocPos = strpos($tok, '-->')) === false) { 417 // Comment started in this token but it does end in the same token. Set a flag to skip till the end of comment 418 $newContent[$c++] = '<' . $tok; 419 $inComment = true; 420 continue; 421 } 422 // Start and end of comment are both in the current token. Add comment and proceed with rest of the token 423 $newContent[$c++] = '<' . substr($tok, 0, $eocPos + 3); 424 $tok = substr($tok, $eocPos + 3); 425 $skipTag = true; 426 } elseif (strpos($tok, '![CDATA[*/') === 0) { 427 if (($eocPos = strpos($tok, '/*]]>*/')) === false) { 428 // Comment started in this token but it does end in the same token. Set a flag to skip till the end of comment 429 $newContent[$c++] = '<' . $tok; 430 $inCdata = true; 431 continue; 432 } 433 // Start and end of comment are both in the current token. Add comment and proceed with rest of the token 434 $newContent[$c++] = '<' . substr($tok, 0, $eocPos + 10); 435 $tok = substr($tok, $eocPos + 10); 436 $skipTag = true; 437 } 438 $firstChar = $tok[0] ?? null; 439 // It is a tag... (first char is a-z0-9 or /) (fixed 19/01 2004). This also avoids triggering on <?xml..> and <!DOCTYPE..> 440 if (!$skipTag && preg_match('/[[:alnum:]\\/]/', $firstChar) === 1) { 441 $tagEnd = strpos($tok, '>'); 442 // If there is and end-bracket... tagEnd can't be 0 as the first character can't be a > 443 if ($tagEnd) { 444 $endTag = $firstChar === '/' ? 1 : 0; 445 $tagContent = substr($tok, $endTag, $tagEnd - $endTag); 446 $tagParts = preg_split('/\\s+/s', $tagContent, 2); 447 $tagName = strtolower($tagParts[0]); 448 $emptyTag = 0; 449 if (isset($tags[$tagName])) { 450 // If there is processing to do for the tag: 451 if (is_array($tags[$tagName])) { 452 if (preg_match('/^(' . self::VOID_ELEMENTS . 
' )$/i', $tagName)) { 453 $emptyTag = 1; 454 } 455 // If NOT an endtag, do attribute processing (added dec. 2003) 456 if (!$endTag) { 457 // Override attributes 458 if (isset($tags[$tagName]['overrideAttribs']) && (string)$tags[$tagName]['overrideAttribs'] !== '') { 459 $tagParts[1] = $tags[$tagName]['overrideAttribs']; 460 } 461 // Allowed tags 462 if (isset($tags[$tagName]['allowedAttribs']) && (string)$tags[$tagName]['allowedAttribs'] !== '') { 463 // No attribs allowed 464 if ((string)$tags[$tagName]['allowedAttribs'] === '0') { 465 $tagParts[1] = ''; 466 } elseif (isset($tagParts[1]) && trim($tagParts[1])) { 467 $tagAttrib = $this->get_tag_attributes($tagParts[1]); 468 $tagParts[1] = ''; 469 $newTagAttrib = []; 470 $tList = (array)( 471 $tags[$tagName]['_allowedAttribs'] 472 ?? GeneralUtility::trimExplode(',', strtolower($tags[$tagName]['allowedAttribs']), true) 473 ); 474 foreach ($tList as $allowTag) { 475 if (isset($tagAttrib[0][$allowTag])) { 476 $newTagAttrib[$allowTag] = $tagAttrib[0][$allowTag]; 477 } 478 } 479 480 $tagParts[1] = $this->compileTagAttribs($newTagAttrib, $tagAttrib[1]); 481 } 482 } 483 // Fixed attrib values 484 if (isset($tags[$tagName]['fixAttrib']) && is_array($tags[$tagName]['fixAttrib'])) { 485 $tagAttrib = $this->get_tag_attributes($tagParts[1]); 486 $tagParts[1] = ''; 487 foreach ($tags[$tagName]['fixAttrib'] as $attr => $params) { 488 if (isset($params['set']) && $params['set'] !== '') { 489 $tagAttrib[0][$attr] = $params['set']; 490 } 491 if (!empty($params['unset'])) { 492 unset($tagAttrib[0][$attr]); 493 } 494 if (!empty($params['default']) && !isset($tagAttrib[0][$attr])) { 495 $tagAttrib[0][$attr] = $params['default']; 496 } 497 if ($params['always'] || isset($tagAttrib[0][$attr])) { 498 if ($params['trim']) { 499 $tagAttrib[0][$attr] = trim($tagAttrib[0][$attr]); 500 } 501 if ($params['intval']) { 502 $tagAttrib[0][$attr] = (int)$tagAttrib[0][$attr]; 503 } 504 if ($params['lower']) { 505 $tagAttrib[0][$attr] = 
strtolower($tagAttrib[0][$attr]); 506 } 507 if ($params['upper']) { 508 $tagAttrib[0][$attr] = strtoupper($tagAttrib[0][$attr]); 509 } 510 if ($params['range']) { 511 if (isset($params['range'][1])) { 512 $tagAttrib[0][$attr] = MathUtility::forceIntegerInRange($tagAttrib[0][$attr], (int)$params['range'][0], (int)$params['range'][1]); 513 } else { 514 $tagAttrib[0][$attr] = MathUtility::forceIntegerInRange($tagAttrib[0][$attr], (int)$params['range'][0]); 515 } 516 } 517 if (isset($params['list']) && is_array($params['list'])) { 518 // For the class attribute, remove from the attribute value any class not in the list 519 // Classes are case sensitive 520 if ($attr === 'class') { 521 $newClasses = []; 522 $classes = GeneralUtility::trimExplode(' ', $tagAttrib[0][$attr], true); 523 foreach ($classes as $class) { 524 if (in_array($class, $params['list'])) { 525 $newClasses[] = $class; 526 } 527 } 528 if (!empty($newClasses)) { 529 $tagAttrib[0][$attr] = implode(' ', $newClasses); 530 } else { 531 $tagAttrib[0][$attr] = $params['list'][0]; 532 } 533 } else { 534 if (!in_array($this->caseShift($tagAttrib[0][$attr], $params['casesensitiveComp']), $this->caseShift($params['list'], $params['casesensitiveComp'], $tagName))) { 535 $tagAttrib[0][$attr] = $params['list'][0]; 536 } 537 } 538 } 539 if ($params['removeIfFalse'] && $params['removeIfFalse'] !== 'blank' && !$tagAttrib[0][$attr] || $params['removeIfFalse'] === 'blank' && (string)$tagAttrib[0][$attr] === '') { 540 unset($tagAttrib[0][$attr]); 541 } 542 if ((string)$params['removeIfEquals'] !== '' && $this->caseShift($tagAttrib[0][$attr], $params['casesensitiveComp']) === $this->caseShift($params['removeIfEquals'], $params['casesensitiveComp'])) { 543 unset($tagAttrib[0][$attr]); 544 } 545 if ($params['prefixLocalAnchors']) { 546 if ($tagAttrib[0][$attr][0] === '#') { 547 if ($params['prefixLocalAnchors'] == 2) { 548 /** @var ContentObjectRenderer $contentObjectRenderer */ 549 $contentObjectRenderer = 
GeneralUtility::makeInstance(ContentObjectRenderer::class); 550 $prefix = $contentObjectRenderer->getUrlToCurrentLocation(); 551 } else { 552 $prefix = GeneralUtility::getIndpEnv('TYPO3_REQUEST_URL'); 553 } 554 $tagAttrib[0][$attr] = $prefix . $tagAttrib[0][$attr]; 555 } 556 } 557 if ($params['prefixRelPathWith']) { 558 $urlParts = parse_url($tagAttrib[0][$attr]); 559 if (!$urlParts['scheme'] && $urlParts['path'][0] !== '/') { 560 // If it is NOT an absolute URL (by http: or starting "/") 561 $tagAttrib[0][$attr] = $params['prefixRelPathWith'] . $tagAttrib[0][$attr]; 562 } 563 } 564 if ($params['userFunc']) { 565 if (is_array($params['userFunc.'])) { 566 $params['userFunc.']['attributeValue'] = $tagAttrib[0][$attr]; 567 } else { 568 $params['userFunc.'] = $tagAttrib[0][$attr]; 569 } 570 $tagAttrib[0][$attr] = GeneralUtility::callUserFunction($params['userFunc'], $params['userFunc.'], $this); 571 } 572 } 573 } 574 $tagParts[1] = $this->compileTagAttribs($tagAttrib[0], $tagAttrib[1]); 575 } 576 } else { 577 // If endTag, remove any possible attributes: 578 $tagParts[1] = ''; 579 } 580 // Protecting the tag by converting < and > to < and > ?? 581 if (!empty($tags[$tagName]['protect'])) { 582 $lt = '<'; 583 $gt = '>'; 584 } else { 585 $lt = '<'; 586 $gt = '>'; 587 } 588 // Remapping tag name? 589 if (!empty($tags[$tagName]['remap'])) { 590 $tagParts[0] = $tags[$tagName]['remap']; 591 } 592 // rmTagIfNoAttrib 593 if ($endTag || empty($tags[$tagName]['rmTagIfNoAttrib']) || trim($tagParts[1] ?? 
'')) { 594 $setTag = true; 595 // Remove this closing tag if $tagName was among $TSconfig['removeTags'] 596 if ($endTag && isset($tags[$tagName]['allowedAttribs']) && $tags[$tagName]['allowedAttribs'] === 0 && $tags[$tagName]['rmTagIfNoAttrib'] === 1) { 597 $setTag = false; 598 } 599 if (isset($tags[$tagName]['nesting'])) { 600 if (!isset($tagRegister[$tagName])) { 601 $tagRegister[$tagName] = []; 602 } 603 if ($endTag) { 604 $correctTag = true; 605 if ($tags[$tagName]['nesting'] === 'global') { 606 $lastEl = end($tagStack); 607 if ($tagName !== $lastEl) { 608 if (in_array($tagName, $tagStack, true)) { 609 while (!empty($tagStack) && $tagName !== $lastEl) { 610 $elPos = end($tagRegister[$lastEl]); 611 unset($newContent[$elPos]); 612 array_pop($tagRegister[$lastEl]); 613 array_pop($tagStack); 614 $lastEl = end($tagStack); 615 } 616 } else { 617 // In this case the 618 $correctTag = false; 619 } 620 } 621 } 622 if (empty($tagRegister[$tagName]) || !$correctTag) { 623 $setTag = false; 624 } else { 625 array_pop($tagRegister[$tagName]); 626 if ($tags[$tagName]['nesting'] === 'global') { 627 array_pop($tagStack); 628 } 629 } 630 } else { 631 $tagRegister[$tagName][] = $c; 632 if ($tags[$tagName]['nesting'] === 'global') { 633 $tagStack[] = $tagName; 634 } 635 } 636 } 637 if ($setTag) { 638 // Setting the tag 639 $newContent[$c++] = $lt . ($endTag ? '/' : '') . trim($tagParts[0] . ' ' . ($tagParts[1] ?? '')) . ($emptyTag ? ' /' : '') . $gt; 640 } 641 } 642 } else { 643 $newContent[$c++] = '<' . ($endTag ? '/' : '') . $tagContent . '>'; 644 } 645 } elseif ($keepAll) { 646 // This is if the tag was not defined in the array for processing: 647 if ($keepAll === 'protect') { 648 $lt = '<'; 649 $gt = '>'; 650 } else { 651 $lt = '<'; 652 $gt = '>'; 653 } 654 $newContent[$c++] = $lt . ($endTag ? '/' : '') . $tagContent . 
$gt; 655 } 656 $newContent[$c++] = $this->bidir_htmlspecialchars(substr($tok, $tagEnd + 1), $hSC); 657 } else { 658 $newContent[$c++] = $this->bidir_htmlspecialchars('<' . $tok, $hSC); 659 } 660 } else { 661 $newContent[$c++] = $this->bidir_htmlspecialchars(($skipTag ? '' : '<') . $tok, $hSC); 662 // It was not a tag anyways 663 $skipTag = false; 664 } 665 } 666 // Unsetting tags: 667 foreach ($tagRegister as $tag => $positions) { 668 foreach ($positions as $pKey) { 669 unset($newContent[$pKey]); 670 } 671 } 672 $newContent = implode('', $newContent); 673 $newContent = $this->stripEmptyTagsIfConfigured($newContent, $addConfig); 674 return $newContent; 675 } 676 677 /** 678 * Converts htmlspecialchars forth ($dir=1) AND back ($dir=-1) 679 * 680 * @param string $value Input value 681 * @param int $dir Direction: forth ($dir=1, dir=2 for preserving entities) AND back ($dir=-1) 682 * @return string Output value 683 */ 684 public function bidir_htmlspecialchars($value, $dir) 685 { 686 switch ((int)$dir) { 687 case 1: 688 return htmlspecialchars($value); 689 case 2: 690 return htmlspecialchars($value, ENT_COMPAT, 'UTF-8', false); 691 case -1: 692 return htmlspecialchars_decode($value); 693 default: 694 return $value; 695 } 696 } 697 698 /** 699 * Prefixes the relative paths of hrefs/src/action in the tags [td,table,body,img,input,form,link,script,a] in the $content with the $main_prefix or and alternative given by $alternatives 700 * 701 * @param string $main_prefix Prefix string 702 * @param string $content HTML content 703 * @param array $alternatives Array with alternative prefixes for certain of the tags. key=>value pairs where the keys are the tag element names in uppercase 704 * @param string $suffix Suffix string (put after the resource). 
705 * @return string Processed HTML content 706 */ 707 public function prefixResourcePath($main_prefix, $content, $alternatives = [], $suffix = '') 708 { 709 $parts = $this->splitTags('embed,td,table,body,img,input,form,link,script,a,param', $content); 710 foreach ($parts as $k => $v) { 711 if ($k % 2) { 712 $params = $this->get_tag_attributes($v); 713 // Detect tag-ending so that it is re-applied correctly. 714 $tagEnd = substr($v, -2) === '/>' ? ' />' : '>'; 715 // The 'name' of the first tag 716 $firstTagName = $this->getFirstTagName($v); 717 $somethingDone = 0; 718 $prefix = $alternatives[strtoupper($firstTagName)] ?? $main_prefix; 719 switch (strtolower($firstTagName)) { 720 case 'td': 721 722 case 'body': 723 724 case 'table': 725 $src = $params[0]['background']; 726 if ($src) { 727 $params[0]['background'] = $this->prefixRelPath($prefix, $params[0]['background'], $suffix); 728 $somethingDone = 1; 729 } 730 break; 731 case 'img': 732 733 case 'input': 734 735 case 'script': 736 737 case 'embed': 738 $src = $params[0]['src']; 739 if ($src) { 740 $params[0]['src'] = $this->prefixRelPath($prefix, $params[0]['src'], $suffix); 741 $somethingDone = 1; 742 } 743 break; 744 case 'link': 745 746 case 'a': 747 $src = $params[0]['href']; 748 if ($src) { 749 $params[0]['href'] = $this->prefixRelPath($prefix, $params[0]['href'], $suffix); 750 $somethingDone = 1; 751 } 752 break; 753 case 'form': 754 $src = $params[0]['action']; 755 if ($src) { 756 $params[0]['action'] = $this->prefixRelPath($prefix, $params[0]['action'], $suffix); 757 $somethingDone = 1; 758 } 759 break; 760 case 'param': 761 $test = $params[0]['name']; 762 if ($test && $test === 'movie') { 763 if ($params[0]['value']) { 764 $params[0]['value'] = $this->prefixRelPath($prefix, $params[0]['value'], $suffix); 765 $somethingDone = 1; 766 } 767 } 768 break; 769 } 770 if ($somethingDone) { 771 $tagParts = preg_split('/\\s+/s', $v, 2); 772 $tagParts[1] = $this->compileTagAttribs($params[0], $params[1]); 773 
$parts[$k] = '<' . trim(strtolower($firstTagName) . ' ' . $tagParts[1]) . $tagEnd; 774 } 775 } 776 } 777 $content = implode('', $parts); 778 // Fix <style> section: 779 $prefix = $alternatives['style'] ?? $main_prefix; 780 if ((string)$prefix !== '') { 781 $parts = $this->splitIntoBlock('style', $content); 782 foreach ($parts as $k => &$part) { 783 if ($k % 2) { 784 $part = preg_replace('/(url[[:space:]]*\\([[:space:]]*["\']?)([^"\')]*)(["\']?[[:space:]]*\\))/i', '\\1' . $prefix . '\\2' . $suffix . '\\3', $part); 785 } 786 } 787 unset($part); 788 $content = implode('', $parts); 789 } 790 return $content; 791 } 792 793 /** 794 * Internal sub-function for ->prefixResourcePath() 795 * 796 * @param string $prefix Prefix string 797 * @param string $srcVal Relative path/URL 798 * @param string $suffix Suffix string 799 * @return string Output path, prefixed if no scheme in input string 800 * @internal 801 */ 802 public function prefixRelPath($prefix, $srcVal, $suffix = '') 803 { 804 // Only prefix if it's not an absolute URL or 805 // only a link to a section within the page. 806 if ($srcVal[0] !== '/' && $srcVal[0] !== '#') { 807 $urlParts = parse_url($srcVal); 808 // Only prefix URLs without a scheme 809 if (!$urlParts['scheme']) { 810 $srcVal = $prefix . $srcVal . $suffix; 811 } 812 } 813 return $srcVal; 814 } 815 816 /** 817 * Internal function for case shifting of a string or whole array 818 * 819 * @param mixed $str Input string/array 820 * @param bool $caseSensitiveComparison If this value is FALSE, the string is returned in uppercase 821 * @param string $cacheKey Key string used for internal caching of the results. Could be an MD5 hash of the serialized version of the input $str if that is an array. 
* @return mixed The input unchanged if $caseSensitiveComparison is set, otherwise the uppercased string (or the array with every value uppercased)
     * @internal
     */
    public function caseShift($str, $caseSensitiveComparison, $cacheKey = '')
    {
        if ($caseSensitiveComparison) {
            return $str;
        }
        if (!is_array($str)) {
            return strtoupper($str);
        }
        // Fetch from runlevel cache
        if ($cacheKey && isset($this->caseShift_cache[$cacheKey])) {
            return $this->caseShift_cache[$cacheKey];
        }
        $shifted = array_map('strtoupper', $str);
        if ($cacheKey) {
            $this->caseShift_cache[$cacheKey] = $shifted;
        }
        return $shifted;
    }

    /**
     * Compiling an array with tag attributes into a string
     *
     * For each attribute the name is taken from $meta[<name>]['origTag'] if that
     * is set and non-empty, otherwise the array key is used. A value is appended
     * when it is a non-empty string or when $meta[<name>]['dashType'] is set.
     * The value is wrapped in the recorded dashType; if none is recorded it is
     * wrapped in double quotes, unless MathUtility::canBeInterpretedAsInteger()
     * accepts the value, in which case it is written without quotes.
     *
     * @param array $tagAttrib Tag attributes
     * @param array $meta Meta information about these attributes (like if they were quoted)
     * @return string Imploded attributes, eg: 'attribute="value" attrib2="value2"'
     * @internal
     */
    public function compileTagAttribs($tagAttrib, $meta = [])
    {
        $accu = [];
        foreach ($tagAttrib as $k => $v) {
            // empty() keeps the truthiness semantics of the former "?:" but does not
            // raise notices when no meta information exists for this attribute.
            $attr = !empty($meta[$k]['origTag']) ? $meta[$k]['origTag'] : $k;
            if ((string)$v !== '' || isset($meta[$k]['dashType'])) {
                if (!empty($meta[$k]['dashType'])) {
                    $dash = $meta[$k]['dashType'];
                } else {
                    $dash = MathUtility::canBeInterpretedAsInteger($v) ? '' : '"';
                }
                $attr .= '=' . $dash . $v . $dash;
            }
            $accu[] = $attr;
        }
        return implode(' ', $accu);
    }

    /**
     * Converts TSconfig into an array for the HTMLcleaner function.
     *
     * Options read from $TSconfig: allowTags, tags., localNesting, globalNesting,
     * rmTagIfNoAttrib, noAttrib, removeTags, stripEmptyTags, stripEmptyTags.,
     * keepNonMatchedTags and htmlSpecialChars.
     *
     * @param array $TSconfig TSconfig for HTMLcleaner
     * @param array $keepTags Initial tag configuration, keyed by tag name; entries from "allowTags" are merged in before it
     * @return array Four entries: [0] the tag configuration array, [1] the "keepNonMatchedTags" option as string, [2] the "htmlSpecialChars" option as integer, [3] additional configuration (stripEmptyTags settings)
     * @internal
     */
    public function HTMLparserConfig($TSconfig, $keepTags = [])
    {
        // Allow tags (base list, merged with incoming array)
        $alTags = array_flip(GeneralUtility::trimExplode(',', strtolower($TSconfig['allowTags'] ?? ''), true));
        $keepTags = array_merge($alTags, $keepTags);
        // Set config properties.
        if (isset($TSconfig['tags.']) && is_array($TSconfig['tags.'])) {
            // First pass: scalar switches ("0" removes a tag, "1" adds it if missing).
            foreach ($TSconfig['tags.'] as $key => $tagC) {
                if (!is_array($tagC) && $key == strtolower($key)) {
                    if ((string)$tagC === '0') {
                        unset($keepTags[$key]);
                    }
                    if ((string)$tagC === '1' && !isset($keepTags[$key])) {
                        $keepTags[$key] = 1;
                    }
                }
            }
            // Second pass: per-tag sub configuration (keys carrying a trailing dot).
            foreach ($TSconfig['tags.'] as $key => $tagC) {
                if (!is_array($tagC) || $key != strtolower($key)) {
                    continue;
                }
                // Strip the trailing dot from the key
                $key = substr($key, 0, -1);
                // The tag may not be part of $keepTags yet, so test with isset() first
                if (!isset($keepTags[$key]) || !is_array($keepTags[$key])) {
                    $keepTags[$key] = [];
                }
                if (isset($tagC['fixAttrib.']) && is_array($tagC['fixAttrib.'])) {
                    foreach ($tagC['fixAttrib.'] as $atName => $atConfig) {
                        if (!is_array($atConfig)) {
                            continue;
                        }
                        $atName = substr($atName, 0, -1);
                        $existing = $keepTags[$key]['fixAttrib'][$atName] ?? null;
                        $merged = array_merge(is_array($existing) ? $existing : [], $atConfig);
                        // "range" and "list" are comma separated lists which are stored exploded.
                        // Missing options are skipped without notice and values which already
                        // are arrays are left alone.
                        foreach (['range', 'list'] as $csvOption) {
                            $csv = $merged[$csvOption] ?? '';
                            if (!is_array($csv) && (string)$csv !== '') {
                                $merged[$csvOption] = GeneralUtility::trimExplode(',', (string)$csv);
                            }
                        }
                        $keepTags[$key]['fixAttrib'][$atName] = $merged;
                    }
                }
                unset($tagC['fixAttrib.'], $tagC['fixAttrib']);
                if (!empty($tagC['rmTagIfNoAttrib']) && empty($tagC['nesting'])) {
                    $tagC['nesting'] = 1;
                }
                $keepTags[$key] = array_merge($keepTags[$key], $tagC);
            }
        }
        // Options holding a comma separated tag list. They only affect tags that are
        // already present in $keepTags. Per option: [0] properties which are always
        // set, [1] properties which are only set if currently empty. The order
        // matters: rmTagIfNoAttrib must see the nesting set by the two options before it.
        $listOptions = [
            'localNesting' => [['nesting' => 1], []],
            'globalNesting' => [['nesting' => 'global'], []],
            'rmTagIfNoAttrib' => [['rmTagIfNoAttrib' => 1], ['nesting' => 1]],
            'noAttrib' => [['allowedAttribs' => 0], []],
        ];
        foreach ($listOptions as $option => list($forced, $defaults)) {
            if (empty($TSconfig[$option])) {
                continue;
            }
            foreach (GeneralUtility::trimExplode(',', strtolower($TSconfig[$option]), true) as $tn) {
                if (!isset($keepTags[$tn])) {
                    continue;
                }
                if (!is_array($keepTags[$tn])) {
                    $keepTags[$tn] = [];
                }
                foreach ($forced as $property => $value) {
                    $keepTags[$tn][$property] = $value;
                }
                foreach ($defaults as $property => $value) {
                    if (empty($keepTags[$tn][$property])) {
                        $keepTags[$tn][$property] = $value;
                    }
                }
            }
        }
        // removeTags replaces the configuration of the listed tags, whether or not
        // they were configured before.
        if (!empty($TSconfig['removeTags'])) {
            foreach (GeneralUtility::trimExplode(',', strtolower($TSconfig['removeTags']), true) as $tn) {
                $keepTags[$tn] = [
                    'allowedAttribs' => 0,
                    'rmTagIfNoAttrib' => 1,
                ];
            }
        }
        // Create additional configuration:
        $addConfig = [];
        if (isset($TSconfig['stripEmptyTags'])) {
            $addConfig['stripEmptyTags'] = $TSconfig['stripEmptyTags'];
            if (isset($TSconfig['stripEmptyTags.'])) {
                $addConfig['stripEmptyTags.'] = $TSconfig['stripEmptyTags.'];
            }
        }
        return [
            $keepTags,
            (string)($TSconfig['keepNonMatchedTags'] ?? ''),
            (int)($TSconfig['htmlSpecialChars'] ?? 0),
            $addConfig
        ];
    }

    /**
     * Strips empty tags from HTML.
     *
     * @param string $content The content to be stripped of empty tags
     * @param string $tagList The comma separated list of tags to be stripped.
*                        If empty, all empty tags will be stripped
     * @param bool $treatNonBreakingSpaceAsEmpty If TRUE tags containing only spaces and non-breaking spaces (the &nbsp; entity or the UTF-8 encoded character) will be treated as empty.
     * @param bool $keepTags If true, the provided tags will be kept instead of stripped.
     * @return string the stripped content
     */
    public function stripEmptyTags($content, $tagList = '', $treatNonBreakingSpaceAsEmpty = false, $keepTags = false)
    {
        $tagNames = [];
        if (!empty($tagList)) {
            // Tag names end up inside a regular expression, so quote them
            // (same as splitIntoBlock() does).
            $tagNames = array_map(
                static function ($tagName) {
                    return preg_quote($tagName, '/');
                },
                GeneralUtility::trimExplode(',', $tagList, true)
            );
        }
        if ($tagNames === []) {
            // all characters until you reach a > or space;
            $tagRegEx = '[^ >]+';
        } else {
            $tagRegEx = implode('|', $tagNames);
            if ($keepTags) {
                // Any tag name except the listed ones. The inner lookahead makes sure only
                // complete names are excluded: keeping "b" must not protect "<big>".
                $tagRegEx = '(?!(?:' . $tagRegEx . ')(?=[\s\/>]))[^ >]+';
            }
        }
        // Content that still counts as empty: plain spaces, optionally the &nbsp; entity
        // and the UTF-8 encoded non-breaking space character.
        $nbspRegex = $treatNonBreakingSpaceAsEmpty ? '|&nbsp;|\xC2\xA0' : '';
        // The lookaheads after the tag name anchor it, so that the list "b" does not
        // match "<big></big>" as well.
        $finalRegex = sprintf('/<(%s)(?=[\s\/>])[^>]*>(?: %s)*<\/\1(?=[\s>])[^>]*>/i', $tagRegEx, $nbspRegex);
        // Removing an empty tag can leave its parent empty, so repeat until nothing changes.
        $count = 0;
        do {
            $stripped = preg_replace($finalRegex, '', $content, -1, $count);
            if ($stripped === null) {
                // PCRE error: keep what we have instead of wiping the content
                break;
            }
            $content = $stripped;
        } while ($count !== 0);
        return $content;
    }

    /**
     * Strips the configured empty tags from the HTML code.
     *
     * Does nothing unless $configuration['stripEmptyTags'] is set and not empty.
     * Reads the sub options "keepTags", "tags" and "treatNonBreakingSpaceAsEmpty"
     * from $configuration['stripEmptyTags.']; "keepTags" wins over "tags".
     *
     * @param string $value
     * @param array $configuration
     * @return string
     */
    protected function stripEmptyTagsIfConfigured($value, $configuration)
    {
        if (empty($configuration['stripEmptyTags'])) {
            return $value;
        }

        $options = $configuration['stripEmptyTags.'] ?? [];
        $tags = '';
        $keepTags = false;
        if (!empty($options['keepTags'])) {
            $tags = $options['keepTags'];
            $keepTags = true;
        } elseif (!empty($options['tags'])) {
            $tags = $options['tags'];
        }

        return $this->stripEmptyTags($value, $tags, !empty($options['treatNonBreakingSpaceAsEmpty']), $keepTags);
    }
}