<?php

namespace Pelago;

use Pelago\Emogrifier\Utilities\CssConcatenator;

/**
 * This class provides functions for converting CSS styles into inline style attributes in your HTML code.
 *
 * For more information, please see the README.md file.
 *
 * @deprecated Will be removed for version 4.0.0. Please use the CssInliner class instead.
 *
 * @author Cameron Brooks
 * @author Jaime Prado
 * @author Oliver Klee <github@oliverklee.de>
 * @author Roman Ožana <ozana@omdesign.cz>
 * @author Sander Kruger <s.kruger@invessel.com>
 * @author Zoli Szabó <zoli.szabo+github@gmail.com>
 */
class Emogrifier
{
    /**
     * Key of the bucket in `$caches` that holds the parsed CSS rules (written and read by `parseCssRules()`).
     *
     * @var int
     */
    const CACHE_KEY_CSS = 0;

    /**
     * Key of a bucket in `$caches`.
     * NOTE(review): presumably used for selector-related results (e.g. precedence values) - confirm against the
     * method that reads it.
     *
     * @var int
     */
    const CACHE_KEY_SELECTOR = 1;

    /**
     * Key of a bucket in `$caches`.
     * NOTE(review): presumably used for CSS selectors already translated to XPath - confirm against the method
     * that reads it.
     *
     * @var int
     */
    const CACHE_KEY_XPATH = 2;

    /**
     * Key of a bucket in `$caches`.
     * NOTE(review): presumably used for parsed CSS declarations blocks - confirm against the method that reads it.
     *
     * @var int
     */
    const CACHE_KEY_CSS_DECLARATIONS_BLOCK = 3;

    /**
     * Key of the bucket in `$caches` that holds the generated inline style strings
     * (written and read by `generateStyleStringFromDeclarationsArrays()`).
     *
     * @var int
     */
    const CACHE_KEY_COMBINED_STYLES = 4;

    /**
     * for calculating nth-of-type and nth-child selectors
     *
     * @var int
     */
    const INDEX = 0;

    /**
     * for calculating nth-of-type and nth-child selectors
     *
     * @var int
     */
    const MULTIPLIER = 1;

    /**
     * Regular expression matching an ID selector component: an optional word (capturing group 1, e.g. a tag name)
     * directly followed by "#" and the ID (capturing group 2).
     *
     * @var string
     */
    const ID_ATTRIBUTE_MATCHER = '/(\\w+)?\\#([\\w\\-]+)/';

    /**
     * Regular expression matching a class selector component: an optional word, "*" or "]" (capturing group 1)
     * directly followed by one or more ".class" parts (capturing group 2).
     *
     * @var string
     */
    const CLASS_ATTRIBUTE_MATCHER = '/(\\w+|[\\*\\]])?((\\.[\\w\\-]+)+)/';

    /**
     * Regular expression component matching a static pseudo class in a selector, without the preceding ":",
     * for which the applicable elements can be determined (by converting the selector to an XPath expression).
     * (Contains alternation without a group and is intended to be placed within a capturing, non-capturing or lookahead
     * group, as appropriate for the usage context.)
*
     * @var string
     */
    const PSEUDO_CLASS_MATCHER = '(?:first|last|nth)-child|nth-of-type|not\\([[:ascii:]]*\\)';

    /**
     * A `<meta>` tag that declares the content type "text/html" with the UTF-8 charset.
     *
     * @var string
     */
    const CONTENT_TYPE_META_TAG = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">';

    /**
     * The HTML5 document type declaration.
     *
     * @var string
     */
    const DEFAULT_DOCUMENT_TYPE = '<!DOCTYPE html>';

    /**
     * @var string Regular expression part to match tag names that PHP's DOMDocument implementation is not aware are
     *      self-closing. These are mostly HTML5 elements, but for completeness <command> (obsolete) and <keygen>
     *      (deprecated) are also included.
     *
     * @see https://bugs.php.net/bug.php?id=73175
     */
    const PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER = '(?:command|embed|keygen|source|track|wbr)';

    /**
     * the DOM representation of the HTML to process; null until some HTML has been set via `setHtml()`
     *
     * @var \DOMDocument|null
     */
    protected $domDocument = null;

    /**
     * the XPath instance for `$domDocument`; created together with the DOM document
     *
     * @var \DOMXPath|null
     */
    protected $xPath = null;

    /**
     * the CSS to merge with the HTML, as provided via `setCss()`
     *
     * @var string
     */
    private $css = '';

    /**
     * the selectors to exclude from emogrification as array keys (the values are always true),
     * maintained via `addExcludedSelector()` and `removeExcludedSelector()`
     *
     * @var bool[]
     */
    private $excludedSelectors = [];

    /**
     * the names of the tags marked for removal,
     * maintained via `addUnprocessableHtmlTag()` and `removeUnprocessableHtmlTag()`
     *
     * @var string[]
     */
    private $unprocessableHtmlTags = ['wbr'];

    /**
     * the media types to keep as array keys (the values are always true),
     * maintained via `addAllowedMediaType()` and `removeAllowedMediaType()`
     *
     * @var bool[]
     */
    private $allowedMediaTypes = ['all' => true, 'screen' => true, 'print' => true];

    /**
     * the caches, with the CACHE_KEY_* constants as keys of the outer array; emptied by `clearAllCaches()`
     *
     * @var mixed[]
     */
    private $caches = [
        self::CACHE_KEY_CSS => [],
        self::CACHE_KEY_SELECTOR => [],
        self::CACHE_KEY_XPATH => [],
        self::CACHE_KEY_CSS_DECLARATIONS_BLOCK => [],
        self::CACHE_KEY_COMBINED_STYLES => [],
    ];

    /**
     * the visited nodes with the XPath paths as array keys
     *
     * @var \DOMElement[]
     */
    private $visitedNodes = [];

    /**
     * the styles to apply to the nodes with the XPath paths as array keys for the outer array
     * and the attribute names/values as key/value pairs for the inner array
     *
     * @var string[][]
     */
    private $styleAttributesForNodes = [];

    /**
     * Determines whether the "style" attributes of tags in the
HTML passed to this class should be preserved.
     * If set to false, the value of the style attributes will be discarded.
     *
     * @var bool
     */
    private $isInlineStyleAttributesParsingEnabled = true;

    /**
     * Determines whether the <style> blocks in the HTML passed to this class should be parsed.
     *
     * If set to true, the <style> blocks will be removed from the HTML and their contents will be applied to the HTML
     * via inline styles.
     *
     * If set to false, the <style> blocks will be left as they are in the HTML.
     *
     * @var bool
     */
    private $isStyleBlocksParsingEnabled = true;

    /**
     * For calculating selector precedence order.
     * Keys are a regular expression part to match before a CSS name.
     * Values are a multiplier factor per match to weight specificity.
     *
     * @var int[]
     */
    private $selectorPrecedenceMatchers = [
        // IDs: worth 10000
        '\\#' => 10000,
        // classes, attributes, pseudo-classes (not pseudo-elements) except `:not`: worth 100
        '(?:\\.|\\[|(?<!:):(?!not\\())' => 100,
        // elements (not attribute values or `:not`), pseudo-elements: worth 1
        '(?:(?<![="\':\\w\\-])|::)' => 1,
    ];

    /**
     * Translation table from CSS selector fragments to XPath: the keys are regular expressions, the values are the
     * corresponding replacement strings (with "\\1" etc. as back-references to the capturing groups).
     *
     * The order of the entries matters (see the comment in front of the last entry).
     * NOTE(review): presumably the entries are applied one after another to the selector when it is translated
     * to XPath - confirm against the translating method.
     *
     * @var string[]
     */
    private $xPathRules = [
        // attribute presence
        '/^\\[(\\w+|\\w+\\=[\'"]?\\w+[\'"]?)\\]/' => '*[@\\1]',
        // type and attribute exact value
        '/(\\w)\\[(\\w+)\\=[\'"]?([\\w\\s]+)[\'"]?\\]/' => '\\1[@\\2="\\3"]',
        // type and attribute value with ~ (one word within a whitespace-separated list of words)
        '/([\\w\\*]+)\\[(\\w+)[\\s]*\\~\\=[\\s]*[\'"]?([\\w\\-_\\/]+)[\'"]?\\]/'
            => '\\1[contains(concat(" ", @\\2, " "), concat(" ", "\\3", " "))]',
        // type and attribute value with | (either exact value match or prefix followed by a hyphen)
        '/([\\w\\*]+)\\[(\\w+)[\\s]*\\|\\=[\\s]*[\'"]?([\\w\\-_\\s\\/]+)[\'"]?\\]/'
            => '\\1[@\\2="\\3" or starts-with(@\\2, concat("\\3", "-"))]',
        // type and attribute value with ^ (prefix match)
        '/([\\w\\*]+)\\[(\\w+)[\\s]*\\^\\=[\\s]*[\'"]?([\\w\\-_\\/]+)[\'"]?\\]/' => '\\1[starts-with(@\\2, "\\3")]',
        // type and attribute value with * (substring match)
        '/([\\w\\*]+)\\[(\\w+)[\\s]*\\*\\=[\\s]*[\'"]?([\\w\\-_\\s\\/:;]+)[\'"]?\\]/' => '\\1[contains(@\\2, "\\3")]',
        // adjacent sibling
        '/\\s*\\+\\s*/' => '/following-sibling::*[1]/self::',
        // child
        '/\\s*>\\s*/' => '/',
        // descendant (don't match spaces within already translated XPath predicates)
        '/\\s+(?![^\\[\\]]*+\\])/' => '//',
        // type and :first-child
        '/([^\\/]+):first-child/i' => '*[1]/self::\\1',
        // type and :last-child
        '/([^\\/]+):last-child/i' => '*[last()]/self::\\1',

        // The following matcher will break things if it is placed before the adjacent matcher.
        // So one of the matchers matches either too much or not enough.
        // type and attribute value with $ (suffix match)
        '/([\\w\\*]+)\\[(\\w+)[\\s]*\\$\\=[\\s]*[\'"]?([\\w\\-_\\s\\/]+)[\'"]?\\]/'
            => '\\1[substring(@\\2, string-length(@\\2) - string-length("\\3") + 1) = "\\3"]',
    ];

    /**
     * If set to true, Emogrifier will throw exceptions when it encounters an error instead of silently ignoring
     * them (e.g. `process()` rethrows the exception for a selector it cannot query only if this flag is set).
     *
     * @var bool
     */
    private $debug = false;

    /**
     * Creates an instance and stores the CSS. The HTML is only set if it is not an empty string; in that case
     * it can be provided later via `setHtml()`.
     *
     * @param string $unprocessedHtml the HTML to process, must be UTF-8-encoded
     * @param string $css the CSS to merge, must be UTF-8-encoded
     */
    public function __construct($unprocessedHtml = '', $css = '')
    {
        // setHtml() rejects empty strings, so it must only be called if some HTML has actually been provided
        if ($unprocessedHtml !== '') {
            $this->setHtml($unprocessedHtml);
        }
        $this->setCss($css);
    }

    /**
     * Sets the HTML to process.
*
     * @param string $html the HTML to process, must be UTF-8-encoded, must not be empty
     *
     * @return void
     *
     * @throws \InvalidArgumentException if $html is anything other than a non-empty string
     */
    public function setHtml($html)
    {
        $isString = \is_string($html);
        if ($isString && $html !== '') {
            $this->createUnifiedDomDocument($html);

            return;
        }

        if (!$isString) {
            throw new \InvalidArgumentException('The provided HTML must be a string.', 1540403913);
        }

        throw new \InvalidArgumentException('The provided HTML must not be empty.', 1540403910);
    }

    /**
     * Gives access to the DOMDocument this instance works on, in its current state.
     *
     * @return \DOMDocument|null the document, or null if no HTML has been set yet
     */
    public function getDomDocument()
    {
        return $this->domDocument;
    }

    /**
     * Stores the CSS that will be merged with the HTML.
     *
     * @param string $css the CSS to merge, must be UTF-8-encoded
     *
     * @return void
     */
    public function setCss($css)
    {
        $this->css = $css;
    }

    /**
     * Serializes the complete (normalized and processed) document to HTML.
     *
     * @return string
     */
    protected function render()
    {
        // DOMDocument may emit closing tags for some void elements; these are stripped again
        return $this->removeSelfClosingTagsClosingTags($this->domDocument->saveHTML());
    }

    /**
     * Serializes only the content of the BODY element of the (normalized and processed) document to HTML.
     *
     * @return string
     */
    protected function renderBodyContent()
    {
        $bodyElement = $this->getBodyElement();
        $bodyHtml = $this->removeSelfClosingTagsClosingTags($this->domDocument->saveHTML($bodyElement));

        // drop the opening and the closing BODY tag so that only the content remains
        return \preg_replace('%</?+body(?:\\s[^>]*+)?+>%', '', $bodyHtml);
    }

    /**
     * Eliminates any invalid closing tags for void elements from the given HTML.
*
     * @param string $html
     *
     * @return string
     */
    private function removeSelfClosingTagsClosingTags($html)
    {
        $closingTagPattern = '%</' . self::PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER . '>%';

        return \preg_replace($closingTagPattern, '', $html);
    }

    /**
     * Finds the BODY element of the document.
     *
     * This method assumes that there always is a BODY element.
     *
     * @return \DOMElement
     */
    private function getBodyElement()
    {
        $bodyElements = $this->domDocument->getElementsByTagName('body');

        return $bodyElements->item(0);
    }

    /**
     * Finds the HEAD element of the document.
     *
     * This method assumes that there always is a HEAD element.
     *
     * @return \DOMElement
     */
    private function getHeadElement()
    {
        $headElements = $this->domDocument->getElementsByTagName('head');

        return $headElements->item(0);
    }

    /**
     * Inlines $this->css into the HTML that has been set and returns the complete resulting HTML document.
     *
     * @return string
     *
     * @throws \BadMethodCallException if no HTML has been set
     */
    public function emogrify()
    {
        $this->assertExistenceOfHtml();
        $this->process();

        return $this->render();
    }

    /**
     * Inlines $this->css into the HTML that has been set and returns only the HTML content
     * within the <body> tag.
     *
     * @return string
     *
     * @throws \BadMethodCallException if no HTML has been set
     */
    public function emogrifyBodyContent()
    {
        $this->assertExistenceOfHtml();
        $this->process();

        return $this->renderBodyContent();
    }

    /**
     * Makes sure that some HTML has been set, i.e., that there is a DOM document to work on.
     *
     * @return void
     *
     * @throws \BadMethodCallException if no HTML has been set
     */
    private function assertExistenceOfHtml()
    {
        if ($this->domDocument !== null) {
            return;
        }

        throw new \BadMethodCallException('Please set some HTML first.', 1390393096);
    }

    /**
     * Creates a DOM document from the given HTML and stores it in $this->domDocument.
*
     * The DOM document will always have a BODY element.
     *
     * @param string $html
     *
     * @return void
     */
    private function createUnifiedDomDocument($html)
    {
        $this->createRawDomDocument($html);
        $this->ensureExistenceOfBodyElement();
    }

    /**
     * Parses the given HTML into a new DOMDocument and stores it in $this->domDocument, together with a
     * matching DOMXPath instance in $this->xPath.
     *
     * @param string $html
     *
     * @return void
     */
    private function createRawDomDocument($html)
    {
        $document = new \DOMDocument();
        $document->strictErrorChecking = false;
        $document->formatOutput = true;

        // collect libxml errors internally while loading, then discard them and restore the previous setting
        $previousLibXmlErrorSetting = \libxml_use_internal_errors(true);
        $preparedHtml = $this->prepareHtmlForDomConversion($html);
        $document->loadHTML($preparedHtml);
        \libxml_clear_errors();
        \libxml_use_internal_errors($previousLibXmlErrorSetting);

        $this->domDocument = $document;
        $this->xPath = new \DOMXPath($document);
    }

    /**
     * Prepares the HTML for the conversion to a DOM document by passing it, in this order, through
     * `ensurePhpUnrecognizedSelfClosingTagsAreXml()`, `ensureDocumentType()` and `addContentTypeMetaTag()`.
     *
     * @param string $html
     *
     * @return string the unified HTML
     */
    private function prepareHtmlForDomConversion($html)
    {
        return $this->addContentTypeMetaTag(
            $this->ensureDocumentType(
                $this->ensurePhpUnrecognizedSelfClosingTagsAreXml($html)
            )
        );
    }

    /**
     * Applies $this->css to $this->domDocument.
     *
     * This method places the CSS inline.
*
     * While this method runs, `handleXpathQueryWarnings()` is installed as error handler for E_WARNING. The
     * previous error handler is always restored when this method is left, also if it is left via an exception.
     *
     * @return void
     *
     * @throws \InvalidArgumentException e.g. in debug mode if the XPath query for a CSS selector fails
     */
    protected function process()
    {
        $this->clearAllCaches();
        $this->purgeVisitedNodes();

        \set_error_handler([$this, 'handleXpathQueryWarnings'], E_WARNING);
        try {
            $this->removeUnprocessableTags();
            $this->normalizeStyleAttributesOfAllNodes();

            // grab any existing style blocks from the html and append them to the existing CSS
            // (these blocks should be appended so as to have precedence over conflicting styles in the existing CSS)
            $allCss = $this->css;
            if ($this->isStyleBlocksParsingEnabled) {
                $allCss .= $this->getCssFromAllStyleNodes();
            }

            $cssWithoutComments = $this->removeCssComments($allCss);
            list($cssWithoutCommentsCharsetOrImport, $cssImportRules)
                = $this->extractImportAndCharsetRules($cssWithoutComments);

            $excludedNodes = $this->getNodesToExclude();
            $cssRules = $this->parseCssRules($cssWithoutCommentsCharsetOrImport);
            foreach ($cssRules['inlinable'] as $cssRule) {
                // There's no real way to test "PHP Warning" output generated by the following XPath query unless
                // PHPUnit converts it to an exception. Unfortunately, this would only apply to tests and not work
                // for production executions, which can still flood logs/output unnecessarily. Instead, Emogrifier's
                // error handler should always throw an exception and it must be caught here and only rethrown if
                // in debug mode.
                try {
                    // \DOMXPath::query will always return a DOMNodeList or throw an exception when errors are
                    // caught.
                    $nodesMatchingCssSelectors = $this->xPath->query(
                        $this->translateCssToXpath($cssRule['selector'])
                    );
                } catch (\InvalidArgumentException $e) {
                    if ($this->debug) {
                        throw $e;
                    }
                    continue;
                }

                /** @var \DOMElement $node */
                foreach ($nodesMatchingCssSelectors as $node) {
                    if (\in_array($node, $excludedNodes, true)) {
                        continue;
                    }
                    $this->copyInlinableCssToStyleAttribute($node, $cssRule);
                }
            }

            if ($this->isInlineStyleAttributesParsingEnabled) {
                $this->fillStyleAttributesWithMergedStyles();
            }

            $this->removeImportantAnnotationFromAllInlineStyles();

            $this->copyUninlinableCssToStyleNode($cssRules['uninlinable'], $cssImportRules);
        } finally {
            // Without the "finally", an exception (e.g. the rethrow in debug mode) would leave the handler
            // installed for the rest of the request and turn unrelated warnings into exceptions.
            \restore_error_handler();
        }
    }

    /**
     * Searches for all nodes with a style attribute and removes the "!important" annotations out of
     * the inline style declarations, eventually by rearranging declarations.
     *
     * @return void
     */
    private function removeImportantAnnotationFromAllInlineStyles()
    {
        foreach ($this->getAllNodesWithStyleAttribute() as $node) {
            $this->removeImportantAnnotationFromNodeInlineStyle($node);
        }
    }

    /**
     * Removes the "!important" annotations out of the inline style declarations,
     * eventually by rearranging declarations.
     * Rearranging needed when !important shorthand properties are followed by some of their
     * not !important expanded-version properties.
     * For example "font: 12px serif !important; font-size: 13px;" must be reordered
     * to "font-size: 13px; font: 12px serif;" in order to remain correct.
*
     * The annotation is removed regardless of its letter case (e.g. also "!IMPORTANT"), consistent with the
     * case-insensitive check in `attributeValueIsImportant()`.
     *
     * @param \DOMElement $node
     *
     * @return void
     */
    private function removeImportantAnnotationFromNodeInlineStyle(\DOMElement $node)
    {
        $inlineStyleDeclarations = $this->parseCssDeclarationsBlock($node->getAttribute('style'));
        $regularStyleDeclarations = [];
        $importantStyleDeclarations = [];
        foreach ($inlineStyleDeclarations as $property => $value) {
            if ($this->attributeValueIsImportant($value)) {
                // Case-insensitive removal: a case-sensitive one would detect "!IMPORTANT" as important,
                // but leave the annotation in the resulting inline style.
                $importantStyleDeclarations[$property] = \trim(\str_ireplace('!important', '', $value));
            } else {
                $regularStyleDeclarations[$property] = $value;
            }
        }
        // the formerly important declarations go last so that they keep winning over the regular ones
        $inlineStyleDeclarationsInNewOrder = \array_merge(
            $regularStyleDeclarations,
            $importantStyleDeclarations
        );
        $node->setAttribute(
            'style',
            $this->generateStyleStringFromSingleDeclarationsArray($inlineStyleDeclarationsInNewOrder)
        );
    }

    /**
     * Returns a list with all DOM nodes that have a style attribute.
     *
     * @return \DOMNodeList
     */
    private function getAllNodesWithStyleAttribute()
    {
        return $this->xPath->query('//*[@style]');
    }

    /**
     * Extracts and parses the individual rules from a CSS string.
     *
     * The result is cached per CSS string (using its MD5 hash as cache key).
     *
     * @param string $css a string of raw CSS code with comments removed
     *
     * @return string[][][] A 2-entry array with the key "inlinable" containing rules which can be inlined as `style`
     *         attributes and the key "uninlinable" containing rules which cannot. Each value is an array of string
     *         sub-arrays with the keys
     *         "media" (the media query string, e.g. "@media screen and (max-width: 480px)",
     *         or an empty string if not from a `@media` rule),
     *         "selector" (the CSS selector, e.g., "*" or "header h1"),
     *         "hasUnmatchablePseudo" (true if that selector contains pseudo-elements or dynamic pseudo-classes
     *         such that the declarations cannot be applied inline),
     *         "declarationsBlock" (the semicolon-separated CSS declarations for that selector,
     *         e.g., "color: red; height: 4px;"),
     *         and "line" (the position of the rule set within the CSS, e.g. 42)
     */
    private function parseCssRules($css)
    {
        $cssKey = \md5($css);
        if (!isset($this->caches[self::CACHE_KEY_CSS][$cssKey])) {
            $matches = $this->getCssRuleMatches($css);

            $cssRules = [
                'inlinable' => [],
                'uninlinable' => [],
            ];
            /** @var string[][] $matches */
            /** @var string[] $cssRule */
            foreach ($matches as $key => $cssRule) {
                $cssDeclaration = \trim($cssRule['declarations']);
                if ($cssDeclaration === '') {
                    continue;
                }

                // a rule set with several comma-separated selectors results in one rule per selector
                foreach (\explode(',', $cssRule['selectors']) as $selector) {
                    // don't process pseudo-elements and behavioral (dynamic) pseudo-classes;
                    // only allow structural pseudo-classes
                    $hasPseudoElement = \strpos($selector, '::') !== false;
                    $hasUnsupportedPseudoClass = (bool)\preg_match(
                        '/:(?!' . self::PSEUDO_CLASS_MATCHER . ')[\\w\\-]/i',
                        $selector
                    );
                    $hasUnmatchablePseudo = $hasPseudoElement || $hasUnsupportedPseudoClass;

                    $parsedCssRule = [
                        'media' => $cssRule['media'],
                        'selector' => \trim($selector),
                        'hasUnmatchablePseudo' => $hasUnmatchablePseudo,
                        'declarationsBlock' => $cssDeclaration,
                        // keep track of where it appears in the file, since order is important
                        'line' => $key,
                    ];
                    // only rules outside of media queries and without unmatchable pseudo components can be inlined
                    $ruleType = ($cssRule['media'] === '' && !$hasUnmatchablePseudo) ? 'inlinable' : 'uninlinable';
                    $cssRules[$ruleType][] = $parsedCssRule;
                }
            }

            \usort($cssRules['inlinable'], [$this, 'sortBySelectorPrecedence']);

            $this->caches[self::CACHE_KEY_CSS][$cssKey] = $cssRules;
        }

        return $this->caches[self::CACHE_KEY_CSS][$cssKey];
    }

    /**
     * Parses a string of CSS into the media query, selectors and declarations for each ruleset in order.
     *
     * @param string $css CSS with comments removed
     *
     * @return string[][] Array of string sub-arrays with the keys
     *         "media" (the media query string, e.g. "@media screen and (max-width: 480px)",
     *         or an empty string if not from an `@media` rule),
     *         "selectors" (the CSS selector(s), e.g., "*" or "h1, h2"),
     *         "declarations" (the semicolon-separated CSS declarations for that/those selector(s),
     *         e.g., "color: red; height: 4px;")
     */
    private function getCssRuleMatches($css)
    {
        $splitCss = $this->splitCssAndMediaQuery($css);

        $ruleMatches = [];
        foreach ($splitCss as $cssPart) {
            // process each part for selectors and definitions
            \preg_match_all('/(?:^|[\\s^{}]*)([^{]+){([^}]*)}/mi', $cssPart['css'], $matches, PREG_SET_ORDER);

            /** @var string[][] $matches */
            foreach ($matches as $cssRule) {
                $ruleMatches[] = [
                    'media' => $cssPart['media'],
                    'selectors' => $cssRule[1],
                    'declarations' => $cssRule[2],
                ];
            }
        }

        return $ruleMatches;
    }

    /**
     * Disables the parsing of inline styles.
     *
     * @return void
     */
    public function disableInlineStyleAttributesParsing()
    {
        $this->isInlineStyleAttributesParsingEnabled = false;
    }

    /**
     * Disables the parsing of <style> blocks.
     *
     * @return void
     */
    public function disableStyleBlocksParsing()
    {
        $this->isStyleBlocksParsingEnabled = false;
    }

    /**
     * Clears all caches.
*
     * @return void
     */
    private function clearAllCaches()
    {
        $cacheKeys = [
            self::CACHE_KEY_CSS,
            self::CACHE_KEY_SELECTOR,
            self::CACHE_KEY_XPATH,
            self::CACHE_KEY_CSS_DECLARATIONS_BLOCK,
            self::CACHE_KEY_COMBINED_STYLES,
        ];

        // every cache starts out as an empty array
        $this->caches = \array_fill_keys($cacheKeys, []);
    }

    /**
     * Forgets the visited nodes and the style attributes stored for them.
     *
     * @return void
     */
    private function purgeVisitedNodes()
    {
        $this->styleAttributesForNodes = [];
        $this->visitedNodes = [];
    }

    /**
     * Marks a tag for removal.
     *
     * There are some HTML tags that DOMDocument cannot process, and it will throw an error if it encounters them.
     * In particular, DOMDocument will complain if you try to use HTML5 tags in an XHTML document.
     *
     * Note: The tags will not be removed if they have any content.
     *
     * @param string $tagName the tag name, e.g., "p"
     *
     * @return void
     */
    public function addUnprocessableHtmlTag($tagName)
    {
        $this->unprocessableHtmlTags[] = $tagName;
    }

    /**
     * Takes a tag off the removal list. Tags that are not on the list are silently ignored.
     *
     * @param string $tagName the tag name, e.g., "p"
     *
     * @return void
     */
    public function removeUnprocessableHtmlTag($tagName)
    {
        $position = \array_search($tagName, $this->unprocessableHtmlTags, true);
        if ($position === false) {
            return;
        }

        /** @var int|string $position */
        unset($this->unprocessableHtmlTags[$position]);
    }

    /**
     * Marks a media query type to keep.
     *
     * @param string $mediaName the media type name, e.g., "braille"
     *
     * @return void
     */
    public function addAllowedMediaType($mediaName)
    {
        $this->allowedMediaTypes[$mediaName] = true;
    }

    /**
     * Drops a media query type from the allowed list.
*
     * @param string $mediaName the media type name, e.g., "braille"
     *
     * @return void
     */
    public function removeAllowedMediaType($mediaName)
    {
        if (!isset($this->allowedMediaTypes[$mediaName])) {
            return;
        }

        unset($this->allowedMediaTypes[$mediaName]);
    }

    /**
     * Registers a selector whose matching nodes are excluded from emogrification.
     *
     * Any nodes that match the selector will not have their style altered.
     *
     * @param string $selector the selector to exclude, e.g., ".editor"
     *
     * @return void
     */
    public function addExcludedSelector($selector)
    {
        $this->excludedSelectors[$selector] = true;
    }

    /**
     * Unregisters a selector so that the nodes matching it are no longer excluded from emogrification.
     *
     * @param string $selector the selector to no longer exclude, e.g., ".editor"
     *
     * @return void
     */
    public function removeExcludedSelector($selector)
    {
        if (!isset($this->excludedSelectors[$selector])) {
            return;
        }

        unset($this->excludedSelectors[$selector]);
    }

    /**
     * Parses the document and normalizes all existing CSS attributes.
     * This changes 'DISPLAY: none' to 'display: none'.
     * We wouldn't have to do this if DOMXPath supported XPath 2.0.
     * Also stores a reference of nodes with existing inline styles so we don't overwrite them.
*
     * @return void
     */
    private function normalizeStyleAttributesOfAllNodes()
    {
        $shouldRememberInlineStyles = $this->isInlineStyleAttributesParsingEnabled;

        /** @var \DOMElement $styledNode */
        foreach ($this->getAllNodesWithStyleAttribute() as $styledNode) {
            if ($shouldRememberInlineStyles) {
                $this->normalizeStyleAttributes($styledNode);
            }
            // The style attribute is removed in any case. If inline style attributes parsing is enabled, the
            // remembered styles get added back at the end of the style list later on, which keeps the right
            // priority of the CSS rules. Otherwise the original inline styles would remain at the beginning of
            // the final inline style of the node, which may not give the desired results.
            $styledNode->removeAttribute('style');
        }
    }

    /**
     * Lowercases the property names in the "style" attribute of the node, writes the result back to the node
     * and remembers the parsed declarations (and the node itself) under the node's XPath path.
     *
     * @param \DOMElement $node
     *
     * @return void
     */
    private function normalizeStyleAttributes(\DOMElement $node)
    {
        $lowercaseMatch = static function (array $match) {
            return \strtolower($match[0]);
        };
        $originalStyle = $node->getAttribute('style');
        $normalizedStyle = \preg_replace_callback('/-?+[_a-zA-Z][\\w\\-]*+(?=:)/S', $lowercaseMatch, $originalStyle);

        // the original inline styles are saved so that they do not get overwritten by the styles from the CSS
        $pathOfNode = $node->getNodePath();
        if (!isset($this->styleAttributesForNodes[$pathOfNode])) {
            $this->styleAttributesForNodes[$pathOfNode] = $this->parseCssDeclarationsBlock($normalizedStyle);
            $this->visitedNodes[$pathOfNode] = $node;
        }

        $node->setAttribute('style', $normalizedStyle);
    }

    /**
     * For each remembered node, merges the remembered (original inline) styles into the styles the node
     * currently has and writes the result to the node's style attribute.
     *
     * @return void
     */
    private function fillStyleAttributesWithMergedStyles()
    {
        foreach ($this->styleAttributesForNodes as $pathOfNode => $rememberedStyles) {
            $node = $this->visitedNodes[$pathOfNode];
            $stylesFromCss = $this->parseCssDeclarationsBlock($node->getAttribute('style'));
            $mergedStyle = $this->generateStyleStringFromDeclarationsArrays($stylesFromCss, $rememberedStyles);

            $node->setAttribute('style', $mergedStyle);
        }
    }

    /**
     * Merges the old (existing) property/value array with the new one and creates a string of the combined
     * styles that is suitable for placing inline.
     *
     * If both arrays contain a property, the new value wins, unless the old value is marked as !important
     * and the new value is not.
     *
     * All inline CSS strings are generated here so that the output is consistent no matter where the CSS
     * originally came from. The results are cached.
     *
     * @param string[] $oldStyles
     * @param string[] $newStyles
     *
     * @return string
     */
    private function generateStyleStringFromDeclarationsArrays(array $oldStyles, array $newStyles)
    {
        $cacheKey = \serialize([$oldStyles, $newStyles]);
        if (isset($this->caches[self::CACHE_KEY_COMBINED_STYLES][$cacheKey])) {
            return $this->caches[self::CACHE_KEY_COMBINED_STYLES][$cacheKey];
        }

        // The losing declaration is dropped from its own array (instead of being overwritten in place) in order
        // to preserve the order, which is important if shorthand and individual properties are mixed.
        foreach ($oldStyles as $property => $oldValue) {
            if (isset($newStyles[$property])) {
                $oldValueWins = $this->attributeValueIsImportant($oldValue)
                    && !$this->attributeValueIsImportant($newStyles[$property]);
                if ($oldValueWins) {
                    unset($newStyles[$property]);
                } else {
                    unset($oldStyles[$property]);
                }
            }
        }

        $declarations = [];
        foreach (\array_merge($oldStyles, $newStyles) as $property => $value) {
            $declarations[] = \strtolower(\trim($property)) . ': ' . \trim($value) . ';';
        }
        $styleString = \implode(' ', $declarations);

        $this->caches[self::CACHE_KEY_COMBINED_STYLES][$cacheKey] = $styleString;

        return $styleString;
    }

    /**
     * Creates a string suitable for placing inline from a single property => value array.
     *
     * @param string[] $styleDeclarations
     *
     * @return string
     */
    private function generateStyleStringFromSingleDeclarationsArray(array $styleDeclarations)
    {
        return $this->generateStyleStringFromDeclarationsArrays([], $styleDeclarations);
    }

    /**
     * Checks whether $attributeValue ends with "!important" (in any letter case, ignoring surrounding whitespace).
     *
     * @param string $attributeValue
     *
     * @return bool
     */
    private function attributeValueIsImportant($attributeValue)
    {
        // "!important" is 10 characters long
        $lastTenCharacters = \substr(\trim($attributeValue), -10);

        return \strtolower($lastTenCharacters) === '!important';
    }

    /**
     * Copies $cssRule into the style attribute of $node.
     *
     * Note: This method does not check whether $cssRule matches $node.
*
     * @param \DOMElement $node
     * @param mixed[] $cssRule a rule as returned by `parseCssRules`; only the "declarationsBlock" element is used
     *
     * @return void
     */
    private function copyInlinableCssToStyleAttribute(\DOMElement $node, array $cssRule)
    {
        $declarationsFromRule = $this->parseCssDeclarationsBlock($cssRule['declarationsBlock']);
        if ($declarationsFromRule === []) {
            return;
        }

        // the declarations the node already has come first; those from the rule are appended or overwrite them
        $declarationsOfNode = $node->hasAttribute('style')
            ? $this->parseCssDeclarationsBlock($node->getAttribute('style'))
            : [];
        $combinedStyle = $this->generateStyleStringFromDeclarationsArrays($declarationsOfNode, $declarationsFromRule);

        $node->setAttribute('style', $combinedStyle);
    }

    /**
     * Applies $cssRules to $this->domDocument, limited to the rules that actually apply to the document, by placing
     * them as CSS in a `<style>` element.
     *
     * @param string[][] $cssRules the "uninlinable" array of CSS rules returned by `parseCssRules`
     * @param string $cssImportRules This may contain any `@import` rules that should precede the CSS placed in the
     *        `<style>` element. If there are no uninlinable CSS rules to copy there, a `<style>` element will be
     *        created containing just `$cssImportRules`. `$cssImportRules` may be an empty string; if it is, and there
     *        are no uninlinable CSS rules, an empty `<style>` element will not be created.
*
     * @return void
     */
    private function copyUninlinableCssToStyleNode(array $cssRules, $cssImportRules)
    {
        $relevantRules = \array_filter($cssRules, [$this, 'existsMatchForSelectorInCssRule']);

        $concatenatedRulesCss = '';
        // the concatenator class is only needed (and loaded) if there are any rules to concatenate
        if ($relevantRules !== []) {
            // support use without autoload
            if (!\class_exists(CssConcatenator::class)) {
                require_once __DIR__ . '/Emogrifier/Utilities/CssConcatenator.php';
            }

            $concatenator = new CssConcatenator();
            foreach ($relevantRules as $rule) {
                $concatenator->append([$rule['selector']], $rule['declarationsBlock'], $rule['media']);
            }

            $concatenatedRulesCss = $concatenator->getCss();
        }

        $styleElementContent = $cssImportRules . $concatenatedRulesCss;
        // no empty style element is added
        if ($styleElementContent === '') {
            return;
        }

        $this->addStyleElementToDocument($styleElementContent);
    }

    /**
     * Checks whether the document has at least one element matching the CSS selector in the `selector` element
     * of the provided CSS rule.
     *
     * Any dynamic pseudo-classes will be assumed to apply. If the selector matches a pseudo-element,
     * it will test for a match with its originating element.
     *
     * @param string[] $cssRule
     *
     * @return bool
     *
     * @throws \InvalidArgumentException
     */
    private function existsMatchForSelectorInCssRule(array $cssRule)
    {
        $selectorToTest = $cssRule['hasUnmatchablePseudo']
            ? $this->removeUnmatchablePseudoComponents($cssRule['selector'])
            : $cssRule['selector'];

        return $this->existsMatchForCssSelector($selectorToTest);
    }

    /**
     * Removes pseudo-elements and dynamic pseudo-classes from a CSS selector, replacing them with "*" if necessary.
     * If such a pseudo-component is within the argument of `:not`, the entire `:not` component is removed or replaced.
*
     * @param string $selector
     *
     * @return string Selector which will match the relevant DOM elements if the pseudo-classes are assumed to apply,
     *         or in the case of pseudo-elements will match their originating element.
     */
    private function removeUnmatchablePseudoComponents($selector)
    {
        // Step 1: handle `:not(...)` components. The pattern supports nested brackets through the recursive
        // reference `(?2)`. A space is prepended for the duration of the replacement (and trimmed off again),
        // because the callback has no other way of knowing whether a match was at the very start of the selector.
        $notComponentPattern = '/(\\s?+):not(\\([^()]*+(?:(?2)[^()]*+)*+\\))/i';
        $selectorWithoutNots = \ltrim(
            \preg_replace_callback($notComponentPattern, [$this, 'replaceUnmatchableNotComponent'], ' ' . $selector)
        );

        // Step 2: handle all remaining pseudo-components that are not covered by PSEUDO_CLASS_MATCHER.
        $pseudoComponentMatcher = ':(?!' . self::PSEUDO_CLASS_MATCHER . '):?+[\\w\\-]++(?:\\([^\\)]*+\\))?+';
        // A component at the start or after whitespace is replaced with "*"; any other one is simply dropped.
        $patterns = [
            '/(\\s|^)' . $pseudoComponentMatcher . '/i',
            '/' . $pseudoComponentMatcher . '/i',
        ];
        $replacements = ['$1*', ''];

        return \preg_replace($patterns, $replacements, $selectorWithoutNots);
    }

    /**
     * Callback for `removeUnmatchablePseudoComponents()`: decides what a matched `:not(...)` component of a selector
     * is replaced with, depending on whether its argument contains pseudo-elements or dynamic pseudo-classes.
     *
     * @param string[] $matches array of elements matched by the regular expression: the full match, the optional
     *        preceding whitespace, and the bracketed argument of `:not`
     *
     * @return string the full match if there were no unmatchable pseudo components within; otherwise, any preceding
     *         whitespace followed by "*", or an empty string if there was no preceding whitespace
     */
    private function replaceUnmatchableNotComponent(array $matches)
    {
        $fullMatch = $matches[0];
        $leadingWhitespace = $matches[1];
        $bracketedArgument = $matches[2];

        $containsUnmatchablePseudo = \preg_match(
            '/:(?!' . self::PSEUDO_CLASS_MATCHER . ')[\\w\\-:]/i',
            $bracketedArgument
        );
        if (!$containsUnmatchablePseudo) {
            return $fullMatch;
        }

        if ($leadingWhitespace === '') {
            return '';
        }

        return $leadingWhitespace . '*';
    }

    /**
     * Tells whether $cssSelector matches at least one element of the document.
     *
     * If the selector cannot be processed and debug mode is off, true is returned (the selector may be valid,
     * just not implemented/recognized yet by Emogrifier). In debug mode, the exception is passed on instead.
     *
     * @param string $cssSelector
     *
     * @return bool
     *
     * @throws \InvalidArgumentException
     */
    private function existsMatchForCssSelector($cssSelector)
    {
        try {
            $matchedNodes = $this->xPath->query($this->translateCssToXpath($cssSelector));
        } catch (\InvalidArgumentException $exception) {
            if (!$this->debug) {
                return true;
            }
            throw $exception;
        }

        if ($matchedNodes === false) {
            return false;
        }

        return $matchedNodes->length !== 0;
    }

    /**
     * Collects the contents of all `<style>` elements of the document and removes these elements from it.
     *
     * @return string the concatenated CSS, with the content of each style element preceded by two linefeeds;
     *         an empty string if the query for the style elements fails or finds nothing
     */
    private function getCssFromAllStyleNodes()
    {
        $styleElements = $this->xPath->query('//style');
        if ($styleElements === false) {
            return '';
        }

        $cssChunks = [];
        /** @var \DOMNode $styleElement */
        foreach ($styleElements as $styleElement) {
            $cssChunks[] = "\n\n" . $styleElement->nodeValue;
            $styleElement->parentNode->removeChild($styleElement);
        }

        return \implode('', $cssChunks);
    }

    /**
     * Adds a style element with $css to $this->domDocument.
     *
     * This method is protected to allow overriding.
*
     * @see https://github.com/MyIntervals/emogrifier/issues/103
     *
     * @param string $css
     *
     * @return void
     */
    protected function addStyleElementToDocument($css)
    {
        $styleElement = $this->domDocument->createElement('style');
        $styleElement->setAttribute('type', 'text/css');
        // The CSS is added as a text node instead of as the second argument of `createElement()`: that argument
        // is not escaped, so an "&" in the CSS (e.g., within a URL) would be parsed as the start of an entity
        // reference, triggering a warning and truncating the CSS.
        $styleElement->appendChild($this->domDocument->createTextNode($css));

        $headElement = $this->getHeadElement();
        $headElement->appendChild($styleElement);
    }

    /**
     * Checks that $this->domDocument has a BODY element and adds it (as the last child of the HTML element)
     * if it is missing.
     *
     * @return void
     *
     * @throws \UnexpectedValueException if the document has no HTML element to which the BODY could be added
     */
    private function ensureExistenceOfBodyElement()
    {
        if ($this->domDocument->getElementsByTagName('body')->item(0) !== null) {
            return;
        }

        $htmlElement = $this->domDocument->getElementsByTagName('html')->item(0);
        if ($htmlElement === null) {
            throw new \UnexpectedValueException('There is no HTML element although there should be one.', 1569930874);
        }
        $htmlElement->appendChild($this->domDocument->createElement('body'));
    }

    /**
     * Removes comments from the supplied CSS.
     *
     * @param string $css
     *
     * @return string CSS with the comments removed
     */
    private function removeCssComments($css)
    {
        // Matches "/*", then anything up to the first "*/" (an "*" within the comment must not be followed by "/").
        return \preg_replace('%/\\*[^*]*+(?:\\*(?!/)[^*]*+)*+\\*/%', '', $css);
    }

    /**
     * Extracts `@import` and `@charset` rules from the supplied CSS. These rules must not be preceded by any other
     * rules, or they will be ignored. (From the CSS 2.1 specification: "CSS 2.1 user agents must ignore any '@import'
     * rule that occurs inside a block or after any non-ignored statement other than an @charset or an @import rule."
     * Note also that `@charset` is case sensitive whereas `@import` is not.)
*
     * @param string $css CSS with comments removed
     *
     * @return string[] The first element is the CSS with the valid `@import` and `@charset` rules removed. The second
     *         element contains a concatenation of the valid `@import` rules, each followed by whatever whitespace
     *         followed it in the original CSS (so that either unminified or minified formatting is preserved); if
     *         there were no `@import` rules, it will be an empty string. The (valid) `@charset` rules are discarded.
     */
    private function extractImportAndCharsetRules($css)
    {
        // Matches one `@import` (case-insensitive) or `@charset` (case-sensitive) rule at the very start of the CSS,
        // together with the whitespace around it.
        $leadingAtRuleMatcher = '/^\\s*+(@((?i)import(?-i)|charset)\\s[^;]++;\\s*+)/';

        $remainingCss = $css;
        $collectedImportRules = '';

        // Strip one leading at-rule per iteration until the CSS starts with something else.
        while (\preg_match($leadingAtRuleMatcher, $remainingCss, $ruleMatch)) {
            // $ruleMatch: [0] full match, [1] the rule with its trailing whitespace, [2] the rule name
            if (\strtolower($ruleMatch[2]) === 'import') {
                $collectedImportRules .= $ruleMatch[1];
            }

            $remainingCss = \substr($remainingCss, \strlen($ruleMatch[0]));
        }

        return [$remainingCss, $collectedImportRules];
    }

    /**
     * Splits input CSS code into an array of parts for different media queries, in order.
* Each part is an array where:
     *
     * - key "css" will contain clean CSS code (for @media rules this will be the group rule body within "{...}")
     * - key "media" will contain "@media " followed by the media query list, for all allowed media queries,
     *   or an empty string for CSS not within a media query
     *
     * Example:
     *
     * The CSS code
     *
     *   "@import "file.css"; h1 { color:red; } @media { h1 {}} @media tv { h1 {}}"
     *
     * will be parsed into the following array:
     *
     *   0 => [
     *     "css" => "h1 { color:red; }",
     *     "media" => ""
     *   ],
     *   1 => [
     *     "css" => " h1 {}",
     *     "media" => "@media "
     *   ]
     *
     * @param string $css
     *
     * @return string[][]
     */
    private function splitCssAndMediaQuery($css)
    {
        $mediaTypesExpression = empty($this->allowedMediaTypes)
            ? ''
            : '|' . \implode('|', \array_keys($this->allowedMediaTypes));

        $mediaRuleBodyMatcher = '[^{]*+{(?:[^{}]*+{.*})?\\s*+}\\s*+';
        $allowedMediaRuleMatcher = '#(@media\\s++(?:only\\s++)?+(?:(?=[{(])' . $mediaTypesExpression . ')'
            . $mediaRuleBodyMatcher . ')#misU';

        // As the delimiter (an allowed @media rule) is captured, the resulting array alternates between
        // CSS outside of allowed @media rules (even indexes) and allowed @media rules (odd indexes).
        $alternatingParts = \preg_split($allowedMediaRuleMatcher, $css, -1, PREG_SPLIT_DELIM_CAPTURE);

        // used to filter the CSS outside/between allowed @media rules
        $cssCleaningMatchers = [
            'import/charset directives' => '/\\s*+@(?:import|charset)\\s[^;]++;/i',
            'remaining media enclosures' => '/\\s*+@media\\s' . $mediaRuleBodyMatcher . '/isU',
        ];

        $splitCss = [];
        foreach ($alternatingParts as $position => $part) {
            if ($position % 2 === 0) {
                // CSS that is not within an allowed @media rule; it is only kept if something is left after cleaning
                $cleanedCss = \trim(\preg_replace($cssCleaningMatchers, '', $part));
                if ($cleanedCss !== '') {
                    $splitCss[] = ['css' => $cleanedCss, 'media' => ''];
                }
                continue;
            }

            // An allowed @media rule: separate the part before the first "{" from the rule body within "{...}".
            \preg_match('/^([^{]*+){(.*)}[^}]*+$/s', $part, $mediaRuleParts);
            $splitCss[] = ['css' => $mediaRuleParts[2], 'media' => $mediaRuleParts[1]];
        }

        return $splitCss;
    }

    /**
     * Removes those elements listed in $this->unprocessableHtmlTags from the DOM document that have no child nodes.
     *
     * @return void
     */
    private function removeUnprocessableTags()
    {
        foreach ($this->unprocessableHtmlTags as $tagName) {
            // The node list is 'live', and removing nodes from it would invalidate the iteration on it.
            // So the nodes are first copied to an array, which then is iterated instead.
            $elements = [];
            foreach ($this->domDocument->getElementsByTagName($tagName) as $element) {
                $elements[] = $element;
            }

            /** @var \DOMNode $element */
            foreach ($elements as $element) {
                if ($element->hasChildNodes()) {
                    continue;
                }
                $element->parentNode->removeChild($element);
            }
        }
    }

    /**
     * Prepends the default document type to the passed HTML unless it already contains a document type.
     *
     * @param string $html
     *
     * @return string HTML with document type
     */
    private function ensureDocumentType($html)
    {
        if (\stripos($html, '<!DOCTYPE') === false) {
            return self::DEFAULT_DOCUMENT_TYPE . $html;
        }

        return $html;
    }

    /**
     * Adds a Content-Type meta tag for the charset.
     *
     * This method also ensures that there is a HEAD element.
*
     * @param string $html
     *
     * @return string the HTML with the meta tag added
     */
    private function addContentTypeMetaTag($html)
    {
        $hasContentTypeMetaTag = \stripos($html, 'Content-Type') !== false;
        if ($hasContentTypeMetaTag) {
            return $html;
        }

        // We are trying to insert the meta tag to the right spot in the DOM.
        // If we just prepended it to the HTML, we would lose attributes set to the HTML tag.
        // The tag name must be followed by whitespace, "/" or ">" so that other tags starting with "head"
        // (e.g., <header>) are neither mistaken for the HEAD tag nor get the meta tag inserted after them.
        $hasHeadTag = \preg_match('/<head[\\s\\/>]/i', $html) === 1;
        $hasHtmlTag = \stripos($html, '<html') !== false;

        if ($hasHeadTag) {
            $reworkedHtml = \preg_replace(
                '/<head(?=[\\s\\/>])([^>]*+)>/i',
                '<head$1>' . self::CONTENT_TYPE_META_TAG,
                $html
            );
        } elseif ($hasHtmlTag) {
            $reworkedHtml = \preg_replace(
                '/<html(.*?)>/i',
                '<html$1><head>' . self::CONTENT_TYPE_META_TAG . '</head>',
                $html
            );
        } else {
            $reworkedHtml = self::CONTENT_TYPE_META_TAG . $html;
        }

        return $reworkedHtml;
    }

    /**
     * Makes sure that any self-closing tags not recognized as such by PHP's DOMDocument implementation have a
     * self-closing slash.
     *
     * @param string $html
     *
     * @return string HTML with problematic tags converted.
     */
    private function ensurePhpUnrecognizedSelfClosingTagsAreXml($html)
    {
        // Matches the affected tags up to (but not including) the closing ">", provided it is not preceded by "/".
        return \preg_replace(
            '%<' . self::PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER . '\\b[^>]*+(?<!/)(?=>)%',
            '$0/',
            $html
        );
    }

    /**
     * Comparison callback for sorting CSS rules by the precedence of their selectors, in ascending order.
     * Rules with equal selector precedence are ordered by their `line` element.
     *
     * @param string[] $a CSS rule with (at least) the elements `selector` and `line`
     * @param string[] $b CSS rule with (at least) the elements `selector` and `line`
     *
     * @return int -1 if $a is to be sorted before $b, 1 otherwise
     */
    private function sortBySelectorPrecedence(array $a, array $b)
    {
        $precedenceA = $this->getCssSelectorPrecedence($a['selector']);
        $precedenceB = $this->getCssSelectorPrecedence($b['selector']);

        // We want these sorted in ascending order so selectors with lesser precedence get processed first and
        // selectors with greater precedence get sorted last.
        if ($precedenceA === $precedenceB) {
            return $a['line'] < $b['line'] ? -1 : 1;
        }

        return $precedenceA < $precedenceB ? -1 : 1;
    }

    /**
     * Calculates the precedence of $selector using $this->selectorPrecedenceMatchers: for each matcher, the number
     * of its occurrences in the selector is multiplied with the value assigned to the matcher, and the results are
     * added up. The result is cached per selector.
     *
     * @param string $selector
     *
     * @return int
     */
    private function getCssSelectorPrecedence($selector)
    {
        $selectorKey = \md5($selector);
        if (!isset($this->caches[self::CACHE_KEY_SELECTOR][$selectorKey])) {
            $precedence = 0;
            foreach ($this->selectorPrecedenceMatchers as $matcher => $value) {
                // Each matcher removes its matches from the selector; stop as soon as nothing is left to match.
                if (\trim($selector) === '') {
                    break;
                }
                $number = 0;
                $selector = \preg_replace('/' . $matcher . '\\w+/', '', $selector, -1, $number);
                $precedence += ($value * $number);
            }
            $this->caches[self::CACHE_KEY_SELECTOR][$selectorKey] = $precedence;
        }

        return $this->caches[self::CACHE_KEY_SELECTOR][$selectorKey];
    }

    /**
     * Maps a CSS selector to an XPath query string. The result is cached per (normalized) selector.
     *
     * @see http://plasmasturm.org/log/444/
     *
     * @param string $cssSelector a CSS selector
     *
     * @return string the corresponding XPath selector
     */
    private function translateCssToXpath($cssSelector)
    {
        // The selector is padded with spaces so that words at its start and end are also surrounded by whitespace
        // and hence get lowercased as well.
        $paddedSelector = ' ' . $cssSelector . ' ';
        $lowercasePaddedSelector = \preg_replace_callback(
            '/\\s+\\w+\\s+/',
            static function (array $matches) {
                return \strtolower($matches[0]);
            },
            $paddedSelector
        );
        $trimmedLowercaseSelector = \trim($lowercasePaddedSelector);

        // The XPath expressions have a cache of their own. They must not be stored in the selector cache, which
        // holds the selector precedences (integers) and uses the same kind of key (the MD5 hash of a selector).
        $xPathKey = \md5($trimmedLowercaseSelector);
        if (isset($this->caches[self::CACHE_KEY_XPATH][$xPathKey])) {
            return $this->caches[self::CACHE_KEY_XPATH][$xPathKey];
        }

        // A `:not(...)` at the very end of the selector gets translated separately from the part before it.
        $hasNotSelector = (bool)\preg_match(
            '/^([^:]+):not\\(\\s*([[:ascii:]]+)\\s*\\)$/',
            $trimmedLowercaseSelector,
            $matches
        );
        if ($hasNotSelector) {
            /** @var string[] $matches */
            list(, $partBeforeNot, $notContents) = $matches;
            $xPath = '//' . $this->translateCssToXpathPass($partBeforeNot) .
                '[not(' . $this->translateCssToXpathPassInline($notContents) . ')]';
        } else {
            $xPath = '//' . $this->translateCssToXpathPass($trimmedLowercaseSelector);
        }
        $this->caches[self::CACHE_KEY_XPATH][$xPathKey] = $xPath;

        return $xPath;
    }

    /**
     * Flexibly translates the CSS selector $trimmedLowercaseSelector to an xPath selector.
     *
     * @param string $trimmedLowercaseSelector
     *
     * @return string
     */
    private function translateCssToXpathPass($trimmedLowercaseSelector)
    {
        return $this->translateCssToXpathPassWithMatchClassAttributesCallback(
            $trimmedLowercaseSelector,
            [$this, 'matchClassAttributes']
        );
    }

    /**
     * Flexibly translates the CSS selector $trimmedLowercaseSelector to an xPath selector for inline usage.
*
     * @param string $trimmedLowercaseSelector
     *
     * @return string
     */
    private function translateCssToXpathPassInline($trimmedLowercaseSelector)
    {
        return $this->translateCssToXpathPassWithMatchClassAttributesCallback(
            $trimmedLowercaseSelector,
            [$this, 'matchClassAttributesInline']
        );
    }

    /**
     * Flexibly translates the CSS selector $trimmedLowercaseSelector to an xPath selector while using
     * $matchClassAttributesCallback as to match the class attributes.
     *
     * @param string $trimmedLowercaseSelector
     * @param callable $matchClassAttributesCallback
     *
     * @return string
     */
    private function translateCssToXpathPassWithMatchClassAttributesCallback(
        $trimmedLowercaseSelector,
        callable $matchClassAttributesCallback
    ) {
        // The translation is done in passes: first the general rules, then IDs, then classes.
        $roughXpath = \preg_replace(\array_keys($this->xPathRules), $this->xPathRules, $trimmedLowercaseSelector);
        $xPathWithIdAttributeMatchers = \preg_replace_callback(
            self::ID_ATTRIBUTE_MATCHER,
            [$this, 'matchIdAttributes'],
            $roughXpath
        );
        $xPathWithIdAttributeAndClassMatchers = \preg_replace_callback(
            self::CLASS_ATTRIBUTE_MATCHER,
            $matchClassAttributesCallback,
            $xPathWithIdAttributeMatchers
        );

        // Advanced selectors are going to require a bit more advanced emogrification.
        $xPathWithIdAttributeAndClassMatchers = \preg_replace_callback(
            '/([^\\/]+):nth-child\\(\\s*(odd|even|[+\\-]?\\d|[+\\-]?\\d?n(\\s*[+\\-]\\s*\\d)?)\\s*\\)/i',
            [$this, 'translateNthChild'],
            $xPathWithIdAttributeAndClassMatchers
        );
        $finalXpath = \preg_replace_callback(
            '/([^\\/]+):nth-of-type\\(\\s*(odd|even|[+\\-]?\\d|[+\\-]?\\d?n(\\s*[+\\-]\\s*\\d)?)\\s*\\)/i',
            [$this, 'translateNthOfType'],
            $xPathWithIdAttributeAndClassMatchers
        );

        return $finalXpath;
    }

    /**
     * Callback for ID_ATTRIBUTE_MATCHER.
     *
     * @param string[] $match the optional element name in element 1 and the ID in element 2
     *
     * @return string xPath ID attribute query wrapped in element selector ("*" if there is no element name)
     */
    private function matchIdAttributes(array $match)
    {
        return ($match[1] !== '' ? $match[1] : '*') . '[@id="' . $match[2] . '"]';
    }

    /**
     * Callback for CLASS_ATTRIBUTE_MATCHER.
     *
     * @param string[] $match the optional element name in element 1 and the class names (each one preceded by ".")
     *        in element 2
     *
     * @return string xPath class attribute query wrapped in element selector
     */
    private function matchClassAttributes(array $match)
    {
        return ($match[1] !== '' ? $match[1] : '*') . '[' . $this->matchClassAttributesInline($match) . ']';
    }

    /**
     * Callback for CLASS_ATTRIBUTE_MATCHER that creates the class attribute query without the wrapping
     * element selector.
     *
     * @param string[] $match the class names (each one preceded by ".") in element 2
     *
     * @return string xPath class attribute query
     */
    private function matchClassAttributesInline(array $match)
    {
        // The leading "." is cut off, and every further "." starts the predicate for the next class name.
        return 'contains(concat(" ",@class," "),concat(" ","' .
            \str_replace('.', '"," "))][contains(concat(" ",@class," "),concat(" ","', \substr($match[2], 1)) .
            '"," "))';
    }

    /**
     * Callback translating an `:nth-child(...)` component to XPath.
     *
     * @param string[] $match the part before `:nth-child` in element 1, the argument in element 2 and, if present,
     *        the signed offset of an "an+b" argument in element 3
     *
     * @return string
     */
    private function translateNthChild(array $match)
    {
        $parseResult = $this->parseNth($match);

        if (isset($parseResult[self::MULTIPLIER])) {
            if ($parseResult[self::MULTIPLIER] < 0) {
                $parseResult[self::MULTIPLIER] = \abs($parseResult[self::MULTIPLIER]);
                $xPathExpression = \sprintf(
                    '*[(last() - position()) mod %1$u = %2$u]/self::%3$s',
                    $parseResult[self::MULTIPLIER],
                    $parseResult[self::INDEX],
                    $match[1]
                );
            } else {
                $xPathExpression = \sprintf(
                    '*[position() mod %1$u = %2$u]/self::%3$s',
                    $parseResult[self::MULTIPLIER],
                    $parseResult[self::INDEX],
                    $match[1]
                );
            }
        } else {
            $xPathExpression = \sprintf('*[%1$u]/self::%2$s', $parseResult[self::INDEX], $match[1]);
        }

        return $xPathExpression;
    }

    /**
     * Callback translating an `:nth-of-type(...)` component to XPath.
     *
     * @param string[] $match the part before `:nth-of-type` in element 1, the argument in element 2 and, if
     *        present, the signed offset of an "an+b" argument in element 3
     *
     * @return string
     */
    private function translateNthOfType(array $match)
    {
        $parseResult = $this->parseNth($match);

        if (isset($parseResult[self::MULTIPLIER])) {
            if ($parseResult[self::MULTIPLIER] < 0) {
                $parseResult[self::MULTIPLIER] = \abs($parseResult[self::MULTIPLIER]);
                $xPathExpression = \sprintf(
                    '%1$s[(last() - position()) mod %2$u = %3$u]',
                    $match[1],
                    $parseResult[self::MULTIPLIER],
                    $parseResult[self::INDEX]
                );
            } else {
                $xPathExpression = \sprintf(
                    '%1$s[position() mod %2$u = %3$u]',
                    $match[1],
                    $parseResult[self::MULTIPLIER],
                    $parseResult[self::INDEX]
                );
            }
        } else {
            $xPathExpression = \sprintf('%1$s[%2$u]', $match[1], $parseResult[self::INDEX]);
        }

        return $xPathExpression;
    }

    /**
     * Parses the argument of an `:nth-child` or `:nth-of-type` pseudo-class ("odd", "even", a plain number or
     * an "an+b" expression).
     *
     * @param string[] $match the argument in element 2 and, if present, the signed offset of an "an+b" argument
     *        in element 3
     *
     * @return int[] the index with the key self::INDEX and, unless the argument has no (or a zero) multiplier,
     *         the multiplier with the key self::MULTIPLIER
     */
    private function parseNth(array $match)
    {
        $lowercaseArgument = \strtolower($match[2]);
        if (\in_array($lowercaseArgument, ['even', 'odd'], true)) {
            // we have "even" or "odd"
            $index = $lowercaseArgument === 'even' ? 0 : 1;
            return [self::MULTIPLIER => 2, self::INDEX => $index];
        }
        if (\stripos($match[2], 'n') === false) {
            // there is no multiplier, just a plain index
            $index = (int)\str_replace(' ', '', $match[2]);
            return [self::INDEX => $index];
        }

        if (isset($match[3])) {
            $multipleTerm = \str_replace($match[3], '', $match[2]);
            $index = (int)\str_replace(' ', '', $match[3]);
        } else {
            $multipleTerm = $match[2];
            $index = 0;
        }

        $multiplier = \str_ireplace('n', '', $multipleTerm);

        // A missing number means a multiplier of 1, with the sign (if any) retained: "n" and "+n" stand for 1,
        // and "-n" for -1. (Casting a sole sign to int would yield 0.)
        if ($multiplier === '' || $multiplier === '+') {
            $multiplier = 1;
        } elseif ($multiplier === '-') {
            $multiplier = -1;
        } else {
            $multiplier = (int)$multiplier;
        }

        // This also covers "+0" and "-0". Without this guard, a zero multiplier would result in "mod 0" in the
        // XPath expression, and the loop below would never terminate for a negative index.
        if ($multiplier === 0) {
            return [self::INDEX => $index];
        }

        while ($index < 0) {
            $index += \abs($multiplier);
        }

        return [self::MULTIPLIER => $multiplier, self::INDEX => $index];
    }

    /**
     * Parses a CSS declaration block into property name/value pairs.
*
     * Example:
     *
     * The declaration block
     *
     *   "color: #000; font-weight: bold;"
     *
     * will be parsed into the following array:
     *
     *   "color" => "#000"
     *   "font-weight" => "bold"
     *
     * The result is cached per declarations block. Property names are converted to lowercase, and parts of the
     * block that do not have the form "name: value" are skipped.
     *
     * @param string $cssDeclarationsBlock the CSS declarations block without the curly braces, may be empty
     *
     * @return string[]
     *         the CSS declarations with the property names as array keys and the property values as array values
     */
    private function parseCssDeclarationsBlock($cssDeclarationsBlock)
    {
        $cachedBlocks = $this->caches[self::CACHE_KEY_CSS_DECLARATIONS_BLOCK];
        if (isset($cachedBlocks[$cssDeclarationsBlock])) {
            return $cachedBlocks[$cssDeclarationsBlock];
        }

        // A semicolon directly followed by "base64" or "charset" does not separate declarations.
        $rawDeclarations = \preg_split('/;(?!base64|charset)/', $cssDeclarationsBlock);

        $declarations = [];
        foreach ($rawDeclarations as $rawDeclaration) {
            $nameAndValue = [];
            if (\preg_match('/^([A-Za-z\\-]+)\\s*:\\s*(.+)$/s', \trim($rawDeclaration), $nameAndValue)) {
                // A property that occurs more than once keeps the value of its last occurrence.
                $declarations[\strtolower($nameAndValue[1])] = $nameAndValue[2];
            }
        }

        $this->caches[self::CACHE_KEY_CSS_DECLARATIONS_BLOCK][$cssDeclarationsBlock] = $declarations;

        return $declarations;
    }

    /**
     * Find the nodes that are not to be emogrified.
*
     * @return \DOMElement[]
     *
     * @throws \InvalidArgumentException
     */
    private function getNodesToExclude()
    {
        $excludedNodes = [];
        // Note: `handleXpathQueryWarnings()` looks up the local variable `$selectorToExclude` by its name in the
        // error context, so it must not be renamed.
        foreach (\array_keys($this->excludedSelectors) as $selectorToExclude) {
            try {
                $matchingNodes = $this->xPath->query($this->translateCssToXpath($selectorToExclude));
            } catch (\InvalidArgumentException $e) {
                if ($this->debug) {
                    throw $e;
                }
                continue;
            }
            // The query returns false for an expression that cannot be evaluated; there is nothing to exclude then.
            if ($matchingNodes === false) {
                continue;
            }
            foreach ($matchingNodes as $node) {
                $excludedNodes[] = $node;
            }
        }

        return $excludedNodes;
    }

    /**
     * Handles invalid xPath expression warnings, generated during the process() method,
     * during querying \DOMDocument and trigger an \InvalidArgumentException with an invalid selector
     * or \RuntimeException, depending on the source of the warning.
     *
     * The source of the warning is determined from the local variables in $context. If none of the expected
     * variables is present (which is always the case if $context is not provided), no exception is thrown.
     *
     * @param int $type
     * @param string $message
     * @param string $file
     * @param int $line
     * @param array $context the local variables at the point where the warning was raised, with the variable names
     *        as keys; optional, as PHP 8.0 and later do not pass this argument to error handlers anymore
     *
     * @return bool always false
     *
     * @throws \InvalidArgumentException
     * @throws \RuntimeException
     */
    public function handleXpathQueryWarnings(// phpcs:ignore Generic.CodeAnalysis.UnusedFunctionParameter
        $type,
        $message,
        $file,
        $line,
        array $context = []
    ) {
        $selector = '';
        if (isset($context['cssRule']['selector'])) {
            // warnings generated by invalid/unrecognized selectors in method process()
            $selector = $context['cssRule']['selector'];
        } elseif (isset($context['selectorToExclude'])) {
            // warnings generated by invalid/unrecognized selectors in method getNodesToExclude()
            $selector = $context['selectorToExclude'];
        } elseif (isset($context['cssSelector'])) {
            // warnings generated by invalid/unrecognized selectors in method existsMatchForCssSelector()
            $selector = $context['cssSelector'];
        }

        if ($selector !== '') {
            throw new \InvalidArgumentException(
                \sprintf('%1$s in selector >> %2$s << in %3$s on line %4$u', $message, $selector, $file, $line),
                1509279985
            );
        }

        // Catches eventual warnings generated by method getAllNodesWithStyleAttribute()
        if (isset($context['xPath'])) {
            throw new \RuntimeException(
                \sprintf('%1$s in %2$s on line %3$u', $message, $file, $line),
                1509280067
            );
        }

        // the normal error handling continues when handler return false
        return false;
    }

    /**
     * Sets the debug mode.
     *
     * @param bool $debug set to true to enable debug mode
     *
     * @return void
     */
    public function setDebug($debug)
    {
        $this->debug = $debug;
    }
}