1<?php
2
3namespace Pelago;
4
5use Pelago\Emogrifier\Utilities\CssConcatenator;
6
7/**
8 * This class provides functions for converting CSS styles into inline style attributes in your HTML code.
9 *
10 * For more information, please see the README.md file.
11 *
12 * @deprecated Will be removed for version 4.0.0. Please use the CssInliner class instead.
13 *
14 * @author Cameron Brooks
15 * @author Jaime Prado
16 * @author Oliver Klee <github@oliverklee.de>
17 * @author Roman Ožana <ozana@omdesign.cz>
18 * @author Sander Kruger <s.kruger@invessel.com>
19 * @author Zoli Szabó <zoli.szabo+github@gmail.com>
20 */
21class Emogrifier
22{
23    /**
24     * @var int
25     */
26    const CACHE_KEY_CSS = 0;
27
28    /**
29     * @var int
30     */
31    const CACHE_KEY_SELECTOR = 1;
32
33    /**
34     * @var int
35     */
36    const CACHE_KEY_XPATH = 2;
37
38    /**
39     * @var int
40     */
41    const CACHE_KEY_CSS_DECLARATIONS_BLOCK = 3;
42
43    /**
44     * @var int
45     */
46    const CACHE_KEY_COMBINED_STYLES = 4;
47
48    /**
49     * for calculating nth-of-type and nth-child selectors
50     *
51     * @var int
52     */
53    const INDEX = 0;
54
55    /**
56     * for calculating nth-of-type and nth-child selectors
57     *
58     * @var int
59     */
60    const MULTIPLIER = 1;
61
62    /**
63     * @var string
64     */
65    const ID_ATTRIBUTE_MATCHER = '/(\\w+)?\\#([\\w\\-]+)/';
66
67    /**
68     * @var string
69     */
70    const CLASS_ATTRIBUTE_MATCHER = '/(\\w+|[\\*\\]])?((\\.[\\w\\-]+)+)/';
71
72    /**
73     * Regular expression component matching a static pseudo class in a selector, without the preceding ":",
74     * for which the applicable elements can be determined (by converting the selector to an XPath expression).
75     * (Contains alternation without a group and is intended to be placed within a capturing, non-capturing or lookahead
76     * group, as appropriate for the usage context.)
77     *
78     * @var string
79     */
80    const PSEUDO_CLASS_MATCHER = '(?:first|last|nth)-child|nth-of-type|not\\([[:ascii:]]*\\)';
81
82    /**
83     * @var string
84     */
85    const CONTENT_TYPE_META_TAG = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">';
86
87    /**
88     * @var string
89     */
90    const DEFAULT_DOCUMENT_TYPE = '<!DOCTYPE html>';
91
92    /**
93     * @var string Regular expression part to match tag names that PHP's DOMDocument implementation is not aware are
94     *      self-closing. These are mostly HTML5 elements, but for completeness <command> (obsolete) and <keygen>
95     *      (deprecated) are also included.
96     *
97     * @see https://bugs.php.net/bug.php?id=73175
98     */
99    const PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER = '(?:command|embed|keygen|source|track|wbr)';
100
101    /**
102     * @var \DOMDocument
103     */
104    protected $domDocument = null;
105
106    /**
107     * @var \DOMXPath
108     */
109    protected $xPath = null;
110
111    /**
112     * @var string
113     */
114    private $css = '';
115
116    /**
117     * @var bool[]
118     */
119    private $excludedSelectors = [];
120
121    /**
122     * @var string[]
123     */
124    private $unprocessableHtmlTags = ['wbr'];
125
126    /**
127     * @var bool[]
128     */
129    private $allowedMediaTypes = ['all' => true, 'screen' => true, 'print' => true];
130
131    /**
132     * @var mixed[]
133     */
134    private $caches = [
135        self::CACHE_KEY_CSS => [],
136        self::CACHE_KEY_SELECTOR => [],
137        self::CACHE_KEY_XPATH => [],
138        self::CACHE_KEY_CSS_DECLARATIONS_BLOCK => [],
139        self::CACHE_KEY_COMBINED_STYLES => [],
140    ];
141
142    /**
143     * the visited nodes with the XPath paths as array keys
144     *
145     * @var \DOMElement[]
146     */
147    private $visitedNodes = [];
148
149    /**
150     * the styles to apply to the nodes with the XPath paths as array keys for the outer array
151     * and the attribute names/values as key/value pairs for the inner array
152     *
153     * @var string[][]
154     */
155    private $styleAttributesForNodes = [];
156
157    /**
     * Determines whether the "style" attributes of tags in the HTML passed to this class should be preserved.
159     * If set to false, the value of the style attributes will be discarded.
160     *
161     * @var bool
162     */
163    private $isInlineStyleAttributesParsingEnabled = true;
164
165    /**
166     * Determines whether the <style> blocks in the HTML passed to this class should be parsed.
167     *
168     * If set to true, the <style> blocks will be removed from the HTML and their contents will be applied to the HTML
169     * via inline styles.
170     *
171     * If set to false, the <style> blocks will be left as they are in the HTML.
172     *
173     * @var bool
174     */
175    private $isStyleBlocksParsingEnabled = true;
176
177    /**
178     * For calculating selector precedence order.
179     * Keys are a regular expression part to match before a CSS name.
180     * Values are a multiplier factor per match to weight specificity.
181     *
182     * @var int[]
183     */
184    private $selectorPrecedenceMatchers = [
185        // IDs: worth 10000
186        '\\#' => 10000,
187        // classes, attributes, pseudo-classes (not pseudo-elements) except `:not`: worth 100
188        '(?:\\.|\\[|(?<!:):(?!not\\())' => 100,
189        // elements (not attribute values or `:not`), pseudo-elements: worth 1
190        '(?:(?<![="\':\\w\\-])|::)' => 1,
191    ];
192
193    /**
194     * @var string[]
195     */
196    private $xPathRules = [
197        // attribute presence
198        '/^\\[(\\w+|\\w+\\=[\'"]?\\w+[\'"]?)\\]/' => '*[@\\1]',
199        // type and attribute exact value
200        '/(\\w)\\[(\\w+)\\=[\'"]?([\\w\\s]+)[\'"]?\\]/' => '\\1[@\\2="\\3"]',
201        // type and attribute value with ~ (one word within a whitespace-separated list of words)
202        '/([\\w\\*]+)\\[(\\w+)[\\s]*\\~\\=[\\s]*[\'"]?([\\w\\-_\\/]+)[\'"]?\\]/'
203        => '\\1[contains(concat(" ", @\\2, " "), concat(" ", "\\3", " "))]',
204        // type and attribute value with | (either exact value match or prefix followed by a hyphen)
205        '/([\\w\\*]+)\\[(\\w+)[\\s]*\\|\\=[\\s]*[\'"]?([\\w\\-_\\s\\/]+)[\'"]?\\]/'
206        => '\\1[@\\2="\\3" or starts-with(@\\2, concat("\\3", "-"))]',
207        // type and attribute value with ^ (prefix match)
208        '/([\\w\\*]+)\\[(\\w+)[\\s]*\\^\\=[\\s]*[\'"]?([\\w\\-_\\/]+)[\'"]?\\]/' => '\\1[starts-with(@\\2, "\\3")]',
209        // type and attribute value with * (substring match)
210        '/([\\w\\*]+)\\[(\\w+)[\\s]*\\*\\=[\\s]*[\'"]?([\\w\\-_\\s\\/:;]+)[\'"]?\\]/' => '\\1[contains(@\\2, "\\3")]',
211        // adjacent sibling
212        '/\\s*\\+\\s*/' => '/following-sibling::*[1]/self::',
213        // child
214        '/\\s*>\\s*/' => '/',
215        // descendant (don't match spaces within already translated XPath predicates)
216        '/\\s+(?![^\\[\\]]*+\\])/' => '//',
217        // type and :first-child
218        '/([^\\/]+):first-child/i' => '*[1]/self::\\1',
219        // type and :last-child
220        '/([^\\/]+):last-child/i' => '*[last()]/self::\\1',
221
222        // The following matcher will break things if it is placed before the adjacent matcher.
223        // So one of the matchers matches either too much or not enough.
224        // type and attribute value with $ (suffix match)
225        '/([\\w\\*]+)\\[(\\w+)[\\s]*\\$\\=[\\s]*[\'"]?([\\w\\-_\\s\\/]+)[\'"]?\\]/'
226        => '\\1[substring(@\\2, string-length(@\\2) - string-length("\\3") + 1) = "\\3"]',
227    ];
228
229    /**
230     * Emogrifier will throw Exceptions when it encounters an error instead of silently ignoring them.
231     *
232     * @var bool
233     */
234    private $debug = false;
235
236    /**
237     * @param string $unprocessedHtml the HTML to process, must be UTF-8-encoded
238     * @param string $css the CSS to merge, must be UTF-8-encoded
239     */
240    public function __construct($unprocessedHtml = '', $css = '')
241    {
242        if ($unprocessedHtml !== '') {
243            $this->setHtml($unprocessedHtml);
244        }
245        $this->setCss($css);
246    }
247
248    /**
249     * Sets the HTML to process.
250     *
     * @param string $html the HTML to process, must be UTF-8-encoded, must not be empty
252     *
253     * @return void
254     *
     * @throws \InvalidArgumentException if $html is anything other than a non-empty string
256     */
257    public function setHtml($html)
258    {
259        if (!\is_string($html)) {
260            throw new \InvalidArgumentException('The provided HTML must be a string.', 1540403913);
261        }
262        if ($html === '') {
263            throw new \InvalidArgumentException('The provided HTML must not be empty.', 1540403910);
264        }
265
266        $this->createUnifiedDomDocument($html);
267    }
268
269    /**
270     * Provides access to the internal DOMDocument representation of the HTML in its current state.
271     *
272     * @return \DOMDocument
273     */
274    public function getDomDocument()
275    {
276        return $this->domDocument;
277    }
278
279    /**
280     * Sets the CSS to merge with the HTML.
281     *
282     * @param string $css the CSS to merge, must be UTF-8-encoded
283     *
284     * @return void
285     */
286    public function setCss($css)
287    {
288        $this->css = $css;
289    }
290
291    /**
292     * Renders the normalized and processed HTML.
293     *
294     * @return string
295     */
296    protected function render()
297    {
298        $htmlWithPossibleErroneousClosingTags = $this->domDocument->saveHTML();
299
300        return $this->removeSelfClosingTagsClosingTags($htmlWithPossibleErroneousClosingTags);
301    }
302
303    /**
304     * Renders the content of the BODY element of the normalized and processed HTML.
305     *
306     * @return string
307     */
308    protected function renderBodyContent()
309    {
310        $htmlWithPossibleErroneousClosingTags = $this->domDocument->saveHTML($this->getBodyElement());
311        $bodyNodeHtml = $this->removeSelfClosingTagsClosingTags($htmlWithPossibleErroneousClosingTags);
312
313        return \preg_replace('%</?+body(?:\\s[^>]*+)?+>%', '', $bodyNodeHtml);
314    }
315
316    /**
317     * Eliminates any invalid closing tags for void elements from the given HTML.
318     *
319     * @param string $html
320     *
321     * @return string
322     */
323    private function removeSelfClosingTagsClosingTags($html)
324    {
325        return \preg_replace('%</' . self::PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER . '>%', '', $html);
326    }
327
328    /**
329     * Returns the BODY element.
330     *
331     * This method assumes that there always is a BODY element.
332     *
333     * @return \DOMElement
334     */
335    private function getBodyElement()
336    {
337        return $this->domDocument->getElementsByTagName('body')->item(0);
338    }
339
340    /**
341     * Returns the HEAD element.
342     *
343     * This method assumes that there always is a HEAD element.
344     *
345     * @return \DOMElement
346     */
347    private function getHeadElement()
348    {
349        return $this->domDocument->getElementsByTagName('head')->item(0);
350    }
351
352    /**
353     * Applies $this->css to the given HTML and returns the HTML with the CSS
354     * applied.
355     *
356     * This method places the CSS inline.
357     *
358     * @return string
359     *
360     * @throws \BadMethodCallException
361     */
362    public function emogrify()
363    {
364        $this->assertExistenceOfHtml();
365
366        $this->process();
367
368        return $this->render();
369    }
370
371    /**
372     * Applies $this->css to the given HTML and returns only the HTML content
373     * within the <body> tag.
374     *
375     * This method places the CSS inline.
376     *
377     * @return string
378     *
379     * @throws \BadMethodCallException
380     */
381    public function emogrifyBodyContent()
382    {
383        $this->assertExistenceOfHtml();
384
385        $this->process();
386
387        return $this->renderBodyContent();
388    }
389
390    /**
391     * Checks that some HTML has been set, and throws an exception otherwise.
392     *
393     * @return void
394     *
395     * @throws \BadMethodCallException
396     */
397    private function assertExistenceOfHtml()
398    {
399        if ($this->domDocument === null) {
400            throw new \BadMethodCallException('Please set some HTML first.', 1390393096);
401        }
402    }
403
404    /**
405     * Creates a DOM document from the given HTML and stores it in $this->domDocument.
406     *
407     * The DOM document will always have a BODY element.
408     *
409     * @param string $html
410     *
411     * @return void
412     */
413    private function createUnifiedDomDocument($html)
414    {
415        $this->createRawDomDocument($html);
416        $this->ensureExistenceOfBodyElement();
417    }
418
419    /**
420     * Creates a DOMDocument instance from the given HTML and stores it in $this->domDocument.
421     *
422     * @param string $html
423     *
424     * @return void
425     */
426    private function createRawDomDocument($html)
427    {
428        $domDocument = new \DOMDocument();
429        $domDocument->strictErrorChecking = false;
430        $domDocument->formatOutput = true;
431        $libXmlState = \libxml_use_internal_errors(true);
432        $domDocument->loadHTML($this->prepareHtmlForDomConversion($html));
433        \libxml_clear_errors();
434        \libxml_use_internal_errors($libXmlState);
435
436        $this->domDocument = $domDocument;
437        $this->xPath = new \DOMXPath($this->domDocument);
438    }
439
440    /**
441     * Returns the HTML with added document type, Content-Type meta tag, and self-closing slashes, if needed,
442     * ensuring that the HTML will be good for creating a DOM document from it.
443     *
444     * @param string $html
445     *
446     * @return string the unified HTML
447     */
448    private function prepareHtmlForDomConversion($html)
449    {
450        $htmlWithSelfClosingSlashes = $this->ensurePhpUnrecognizedSelfClosingTagsAreXml($html);
451        $htmlWithDocumentType = $this->ensureDocumentType($htmlWithSelfClosingSlashes);
452
453        return $this->addContentTypeMetaTag($htmlWithDocumentType);
454    }
455
456    /**
457     * Applies $this->css to $this->domDocument.
458     *
459     * This method places the CSS inline.
460     *
461     * @return void
462     *
463     * @throws \InvalidArgumentException
464     */
465    protected function process()
466    {
467        $this->clearAllCaches();
468        $this->purgeVisitedNodes();
469
470        \set_error_handler([$this, 'handleXpathQueryWarnings'], E_WARNING);
471        $this->removeUnprocessableTags();
472        $this->normalizeStyleAttributesOfAllNodes();
473
474        // grab any existing style blocks from the html and append them to the existing CSS
475        // (these blocks should be appended so as to have precedence over conflicting styles in the existing CSS)
476        $allCss = $this->css;
477        if ($this->isStyleBlocksParsingEnabled) {
478            $allCss .= $this->getCssFromAllStyleNodes();
479        }
480
481        $cssWithoutComments = $this->removeCssComments($allCss);
482        list($cssWithoutCommentsCharsetOrImport, $cssImportRules)
483            = $this->extractImportAndCharsetRules($cssWithoutComments);
484
485        $excludedNodes = $this->getNodesToExclude();
486        $cssRules = $this->parseCssRules($cssWithoutCommentsCharsetOrImport);
487        foreach ($cssRules['inlinable'] as $cssRule) {
488            // There's no real way to test "PHP Warning" output generated by the following XPath query unless PHPUnit
489            // converts it to an exception. Unfortunately, this would only apply to tests and not work for production
490            // executions, which can still flood logs/output unnecessarily. Instead, Emogrifier's error handler should
491            // always throw an exception and it must be caught here and only rethrown if in debug mode.
492            try {
493                // \DOMXPath::query will always return a DOMNodeList or throw an exception when errors are caught.
494                $nodesMatchingCssSelectors = $this->xPath->query($this->translateCssToXpath($cssRule['selector']));
495            } catch (\InvalidArgumentException $e) {
496                if ($this->debug) {
497                    throw $e;
498                }
499                continue;
500            }
501
502            /** @var \DOMElement $node */
503            foreach ($nodesMatchingCssSelectors as $node) {
504                if (\in_array($node, $excludedNodes, true)) {
505                    continue;
506                }
507                $this->copyInlinableCssToStyleAttribute($node, $cssRule);
508            }
509        }
510
511        if ($this->isInlineStyleAttributesParsingEnabled) {
512            $this->fillStyleAttributesWithMergedStyles();
513        }
514
515        $this->removeImportantAnnotationFromAllInlineStyles();
516
517        $this->copyUninlinableCssToStyleNode($cssRules['uninlinable'], $cssImportRules);
518
519        \restore_error_handler();
520    }
521
522    /**
523     * Searches for all nodes with a style attribute and removes the "!important" annotations out of
     * the inline style declarations, rearranging declarations where necessary.
525     *
526     * @return void
527     */
528    private function removeImportantAnnotationFromAllInlineStyles()
529    {
530        foreach ($this->getAllNodesWithStyleAttribute() as $node) {
531            $this->removeImportantAnnotationFromNodeInlineStyle($node);
532        }
533    }
534
535    /**
536     * Removes the "!important" annotations out of the inline style declarations,
     * rearranging declarations where necessary.
538     * Rearranging needed when !important shorthand properties are followed by some of their
539     * not !important expanded-version properties.
540     * For example "font: 12px serif !important; font-size: 13px;" must be reordered
541     * to "font-size: 13px; font: 12px serif;" in order to remain correct.
542     *
543     * @param \DOMElement $node
544     *
545     * @return void
546     */
547    private function removeImportantAnnotationFromNodeInlineStyle(\DOMElement $node)
548    {
549        $inlineStyleDeclarations = $this->parseCssDeclarationsBlock($node->getAttribute('style'));
550        $regularStyleDeclarations = [];
551        $importantStyleDeclarations = [];
552        foreach ($inlineStyleDeclarations as $property => $value) {
553            if ($this->attributeValueIsImportant($value)) {
554                $importantStyleDeclarations[$property] = \trim(\str_replace('!important', '', $value));
555            } else {
556                $regularStyleDeclarations[$property] = $value;
557            }
558        }
559        $inlineStyleDeclarationsInNewOrder = \array_merge(
560            $regularStyleDeclarations,
561            $importantStyleDeclarations
562        );
563        $node->setAttribute(
564            'style',
565            $this->generateStyleStringFromSingleDeclarationsArray($inlineStyleDeclarationsInNewOrder)
566        );
567    }
568
569    /**
570     * Returns a list with all DOM nodes that have a style attribute.
571     *
572     * @return \DOMNodeList
573     */
574    private function getAllNodesWithStyleAttribute()
575    {
576        return $this->xPath->query('//*[@style]');
577    }
578
579    /**
580     * Extracts and parses the individual rules from a CSS string.
581     *
582     * @param string $css a string of raw CSS code with comments removed
583     *
584     * @return string[][][] A 2-entry array with the key "inlinable" containing rules which can be inlined as `style`
585     *         attributes and the key "uninlinable" containing rules which cannot.  Each value is an array of string
586     *         sub-arrays with the keys
587     *         "media" (the media query string, e.g. "@media screen and (max-width: 480px)",
588     *         or an empty string if not from a `@media` rule),
589     *         "selector" (the CSS selector, e.g., "*" or "header h1"),
590     *         "hasUnmatchablePseudo" (true if that selector contains pseudo-elements or dynamic pseudo-classes
591     *         such that the declarations cannot be applied inline),
592     *         "declarationsBlock" (the semicolon-separated CSS declarations for that selector,
593     *         e.g., "color: red; height: 4px;"),
594     *         and "line" (the line number e.g. 42)
595     */
596    private function parseCssRules($css)
597    {
598        $cssKey = \md5($css);
599        if (!isset($this->caches[self::CACHE_KEY_CSS][$cssKey])) {
600            $matches = $this->getCssRuleMatches($css);
601
602            $cssRules = [
603                'inlinable' => [],
604                'uninlinable' => [],
605            ];
606            /** @var string[][] $matches */
607            /** @var string[] $cssRule */
608            foreach ($matches as $key => $cssRule) {
609                $cssDeclaration = \trim($cssRule['declarations']);
610                if ($cssDeclaration === '') {
611                    continue;
612                }
613
614                foreach (\explode(',', $cssRule['selectors']) as $selector) {
615                    // don't process pseudo-elements and behavioral (dynamic) pseudo-classes;
616                    // only allow structural pseudo-classes
617                    $hasPseudoElement = \strpos($selector, '::') !== false;
618                    $hasUnsupportedPseudoClass = (bool)\preg_match(
619                        '/:(?!' . self::PSEUDO_CLASS_MATCHER . ')[\\w\\-]/i',
620                        $selector
621                    );
622                    $hasUnmatchablePseudo = $hasPseudoElement || $hasUnsupportedPseudoClass;
623
624                    $parsedCssRule = [
625                        'media' => $cssRule['media'],
626                        'selector' => \trim($selector),
627                        'hasUnmatchablePseudo' => $hasUnmatchablePseudo,
628                        'declarationsBlock' => $cssDeclaration,
629                        // keep track of where it appears in the file, since order is important
630                        'line' => $key,
631                    ];
632                    $ruleType = ($cssRule['media'] === '' && !$hasUnmatchablePseudo) ? 'inlinable' : 'uninlinable';
633                    $cssRules[$ruleType][] = $parsedCssRule;
634                }
635            }
636
637            \usort($cssRules['inlinable'], [$this, 'sortBySelectorPrecedence']);
638
639            $this->caches[self::CACHE_KEY_CSS][$cssKey] = $cssRules;
640        }
641
642        return $this->caches[self::CACHE_KEY_CSS][$cssKey];
643    }
644
645    /**
646     * Parses a string of CSS into the media query, selectors and declarations for each ruleset in order.
647     *
648     * @param string $css CSS with comments removed
649     *
650     * @return string[][] Array of string sub-arrays with the keys
651     *         "media" (the media query string, e.g. "@media screen and (max-width: 480px)",
652     *         or an empty string if not from an `@media` rule),
653     *         "selectors" (the CSS selector(s), e.g., "*" or "h1, h2"),
654     *         "declarations" (the semicolon-separated CSS declarations for that/those selector(s),
655     *         e.g., "color: red; height: 4px;"),
656     */
657    private function getCssRuleMatches($css)
658    {
659        $splitCss = $this->splitCssAndMediaQuery($css);
660
661        $ruleMatches = [];
662        foreach ($splitCss as $cssPart) {
663            // process each part for selectors and definitions
664            \preg_match_all('/(?:^|[\\s^{}]*)([^{]+){([^}]*)}/mi', $cssPart['css'], $matches, PREG_SET_ORDER);
665
666            /** @var string[][] $matches */
667            foreach ($matches as $cssRule) {
668                $ruleMatches[] = [
669                    'media' => $cssPart['media'],
670                    'selectors' => $cssRule[1],
671                    'declarations' => $cssRule[2],
672                ];
673            }
674        }
675
676        return $ruleMatches;
677    }
678
679    /**
680     * Disables the parsing of inline styles.
681     *
682     * @return void
683     */
684    public function disableInlineStyleAttributesParsing()
685    {
686        $this->isInlineStyleAttributesParsingEnabled = false;
687    }
688
689    /**
690     * Disables the parsing of <style> blocks.
691     *
692     * @return void
693     */
694    public function disableStyleBlocksParsing()
695    {
696        $this->isStyleBlocksParsingEnabled = false;
697    }
698
699    /**
700     * Clears all caches.
701     *
702     * @return void
703     */
704    private function clearAllCaches()
705    {
706        $this->caches = [
707            self::CACHE_KEY_CSS => [],
708            self::CACHE_KEY_SELECTOR => [],
709            self::CACHE_KEY_XPATH => [],
710            self::CACHE_KEY_CSS_DECLARATIONS_BLOCK => [],
711            self::CACHE_KEY_COMBINED_STYLES => [],
712        ];
713    }
714
715    /**
716     * Purges the visited nodes.
717     *
718     * @return void
719     */
720    private function purgeVisitedNodes()
721    {
722        $this->visitedNodes = [];
723        $this->styleAttributesForNodes = [];
724    }
725
726    /**
727     * Marks a tag for removal.
728     *
729     * There are some HTML tags that DOMDocument cannot process, and it will throw an error if it encounters them.
730     * In particular, DOMDocument will complain if you try to use HTML5 tags in an XHTML document.
731     *
732     * Note: The tags will not be removed if they have any content.
733     *
734     * @param string $tagName the tag name, e.g., "p"
735     *
736     * @return void
737     */
738    public function addUnprocessableHtmlTag($tagName)
739    {
740        $this->unprocessableHtmlTags[] = $tagName;
741    }
742
743    /**
744     * Drops a tag from the removal list.
745     *
746     * @param string $tagName the tag name, e.g., "p"
747     *
748     * @return void
749     */
750    public function removeUnprocessableHtmlTag($tagName)
751    {
752        $key = \array_search($tagName, $this->unprocessableHtmlTags, true);
753        if ($key !== false) {
754            /** @var int|string $key */
755            unset($this->unprocessableHtmlTags[$key]);
756        }
757    }
758
759    /**
760     * Marks a media query type to keep.
761     *
762     * @param string $mediaName the media type name, e.g., "braille"
763     *
764     * @return void
765     */
766    public function addAllowedMediaType($mediaName)
767    {
768        $this->allowedMediaTypes[$mediaName] = true;
769    }
770
771    /**
772     * Drops a media query type from the allowed list.
773     *
     * @param string $mediaName the media type name, e.g., "braille"
775     *
776     * @return void
777     */
778    public function removeAllowedMediaType($mediaName)
779    {
780        if (isset($this->allowedMediaTypes[$mediaName])) {
781            unset($this->allowedMediaTypes[$mediaName]);
782        }
783    }
784
785    /**
786     * Adds a selector to exclude nodes from emogrification.
787     *
788     * Any nodes that match the selector will not have their style altered.
789     *
790     * @param string $selector the selector to exclude, e.g., ".editor"
791     *
792     * @return void
793     */
794    public function addExcludedSelector($selector)
795    {
796        $this->excludedSelectors[$selector] = true;
797    }
798
799    /**
800     * No longer excludes the nodes matching this selector from emogrification.
801     *
802     * @param string $selector the selector to no longer exclude, e.g., ".editor"
803     *
804     * @return void
805     */
806    public function removeExcludedSelector($selector)
807    {
808        if (isset($this->excludedSelectors[$selector])) {
809            unset($this->excludedSelectors[$selector]);
810        }
811    }
812
813    /**
814     * Parses the document and normalizes all existing CSS attributes.
815     * This changes 'DISPLAY: none' to 'display: none'.
816     * We wouldn't have to do this if DOMXPath supported XPath 2.0.
817     * Also stores a reference of nodes with existing inline styles so we don't overwrite them.
818     *
819     * @return void
820     */
821    private function normalizeStyleAttributesOfAllNodes()
822    {
823        /** @var \DOMElement $node */
824        foreach ($this->getAllNodesWithStyleAttribute() as $node) {
825            if ($this->isInlineStyleAttributesParsingEnabled) {
826                $this->normalizeStyleAttributes($node);
827            }
828            // Remove style attribute in every case, so we can add them back (if inline style attributes
829            // parsing is enabled) to the end of the style list, thus keeping the right priority of CSS rules;
830            // else original inline style rules may remain at the beginning of the final inline style definition
831            // of a node, which may give not the desired results
832            $node->removeAttribute('style');
833        }
834    }
835
836    /**
837     * Normalizes the value of the "style" attribute and saves it.
838     *
839     * @param \DOMElement $node
840     *
841     * @return void
842     */
843    private function normalizeStyleAttributes(\DOMElement $node)
844    {
845        $normalizedOriginalStyle = \preg_replace_callback(
846            '/-?+[_a-zA-Z][\\w\\-]*+(?=:)/S',
847            static function (array $m) {
848                return \strtolower($m[0]);
849            },
850            $node->getAttribute('style')
851        );
852
853        // in order to not overwrite existing style attributes in the HTML, we
854        // have to save the original HTML styles
855        $nodePath = $node->getNodePath();
856        if (!isset($this->styleAttributesForNodes[$nodePath])) {
857            $this->styleAttributesForNodes[$nodePath] = $this->parseCssDeclarationsBlock($normalizedOriginalStyle);
858            $this->visitedNodes[$nodePath] = $node;
859        }
860
861        $node->setAttribute('style', $normalizedOriginalStyle);
862    }
863
864    /**
865     * Merges styles from styles attributes and style nodes and applies them to the attribute nodes
866     *
867     * @return void
868     */
869    private function fillStyleAttributesWithMergedStyles()
870    {
871        foreach ($this->styleAttributesForNodes as $nodePath => $styleAttributesForNode) {
872            $node = $this->visitedNodes[$nodePath];
873            $currentStyleAttributes = $this->parseCssDeclarationsBlock($node->getAttribute('style'));
874            $node->setAttribute(
875                'style',
876                $this->generateStyleStringFromDeclarationsArrays(
877                    $currentStyleAttributes,
878                    $styleAttributesForNode
879                )
880            );
881        }
882    }
883
884    /**
885     * This method merges old or existing name/value array with new name/value array
886     * and then generates a string of the combined style suitable for placing inline.
887     * This becomes the single point for CSS string generation allowing for consistent
888     * CSS output no matter where the CSS originally came from.
889     *
890     * @param string[] $oldStyles
891     * @param string[] $newStyles
892     *
893     * @return string
894     */
895    private function generateStyleStringFromDeclarationsArrays(array $oldStyles, array $newStyles)
896    {
897        $cacheKey = \serialize([$oldStyles, $newStyles]);
898        if (isset($this->caches[self::CACHE_KEY_COMBINED_STYLES][$cacheKey])) {
899            return $this->caches[self::CACHE_KEY_COMBINED_STYLES][$cacheKey];
900        }
901
902        // Unset the overridden styles to preserve order, important if shorthand and individual properties are mixed
903        foreach ($oldStyles as $attributeName => $attributeValue) {
904            if (!isset($newStyles[$attributeName])) {
905                continue;
906            }
907
908            $newAttributeValue = $newStyles[$attributeName];
909            if (
910                $this->attributeValueIsImportant($attributeValue)
911                && !$this->attributeValueIsImportant($newAttributeValue)
912            ) {
913                unset($newStyles[$attributeName]);
914            } else {
915                unset($oldStyles[$attributeName]);
916            }
917        }
918
919        $combinedStyles = \array_merge($oldStyles, $newStyles);
920
921        $style = '';
922        foreach ($combinedStyles as $attributeName => $attributeValue) {
923            $style .= \strtolower(\trim($attributeName)) . ': ' . \trim($attributeValue) . '; ';
924        }
925        $trimmedStyle = \rtrim($style);
926
927        $this->caches[self::CACHE_KEY_COMBINED_STYLES][$cacheKey] = $trimmedStyle;
928
929        return $trimmedStyle;
930    }
931
932    /**
933     * Generates a CSS style string suitable to be used inline from the $styleDeclarations property => value array.
934     *
935     * @param string[] $styleDeclarations
936     *
937     * @return string
938     */
939    private function generateStyleStringFromSingleDeclarationsArray(array $styleDeclarations)
940    {
941        return $this->generateStyleStringFromDeclarationsArrays([], $styleDeclarations);
942    }
943
944    /**
945     * Checks whether $attributeValue is marked as !important.
946     *
947     * @param string $attributeValue
948     *
949     * @return bool
950     */
951    private function attributeValueIsImportant($attributeValue)
952    {
953        return \strtolower(\substr(\trim($attributeValue), -10)) === '!important';
954    }
955
956    /**
957     * Copies $cssRule into the style attribute of $node.
958     *
959     * Note: This method does not check whether $cssRule matches $node.
960     *
961     * @param \DOMElement $node
962     * @param string[][] $cssRule
963     *
964     * @return void
965     */
966    private function copyInlinableCssToStyleAttribute(\DOMElement $node, array $cssRule)
967    {
968        $newStyleDeclarations = $this->parseCssDeclarationsBlock($cssRule['declarationsBlock']);
969        if ($newStyleDeclarations === []) {
970            return;
971        }
972
973        // if it has a style attribute, get it, process it, and append (overwrite) new stuff
974        if ($node->hasAttribute('style')) {
975            // break it up into an associative array
976            $oldStyleDeclarations = $this->parseCssDeclarationsBlock($node->getAttribute('style'));
977        } else {
978            $oldStyleDeclarations = [];
979        }
980        $node->setAttribute(
981            'style',
982            $this->generateStyleStringFromDeclarationsArrays($oldStyleDeclarations, $newStyleDeclarations)
983        );
984    }
985
986    /**
987     * Applies $cssRules to $this->domDocument, limited to the rules that actually apply to the document, by placing
988     * them as CSS in a `<style>` element.
989     *
990     * @param string[][] $cssRules the "uninlinable" array of CSS rules returned by `parseCssRules`
991     * @param string $cssImportRules This may contain any `@import` rules that should precede the CSS placed in the
992     *        `<style>` element.  If there are no unlinlinable CSS rules to copy there, a `<style>` element will be
993     *        created containing just `$cssImportRules`.  `$cssImportRules` may be an empty string; if it is, and there
994     *        are no unlinlinable CSS rules, an empty `<style>` element will not be created.
995     *
996     * @return void
997     */
998    private function copyUninlinableCssToStyleNode(array $cssRules, $cssImportRules)
999    {
1000        $css = $cssImportRules;
1001
1002        $cssRulesRelevantForDocument = \array_filter($cssRules, [$this, 'existsMatchForSelectorInCssRule']);
1003
1004        // avoid including unneeded class dependency if there are no rules
1005        if ($cssRulesRelevantForDocument !== []) {
1006            // support use without autoload
1007            if (!\class_exists(CssConcatenator::class)) {
1008                require_once __DIR__ . '/Emogrifier/Utilities/CssConcatenator.php';
1009            }
1010
1011            $cssConcatenator = new CssConcatenator();
1012            foreach ($cssRulesRelevantForDocument as $cssRule) {
1013                $cssConcatenator->append([$cssRule['selector']], $cssRule['declarationsBlock'], $cssRule['media']);
1014            }
1015
1016            $css .= $cssConcatenator->getCss();
1017        }
1018
1019        // avoid adding empty style element
1020        if ($css !== '') {
1021            $this->addStyleElementToDocument($css);
1022        }
1023    }
1024
1025    /**
1026     * Checks whether there is at least one matching element for the CSS selector contained in the `selector` element
1027     * of the provided CSS rule.
1028     *
1029     * Any dynamic pseudo-classes will be assumed to apply. If the selector matches a pseudo-element,
1030     * it will test for a match with its originating element.
1031     *
1032     * @param string[] $cssRule
1033     *
1034     * @return bool
1035     *
1036     * @throws \InvalidArgumentException
1037     */
1038    private function existsMatchForSelectorInCssRule(array $cssRule)
1039    {
1040        $selector = $cssRule['selector'];
1041        if ($cssRule['hasUnmatchablePseudo']) {
1042            $selector = $this->removeUnmatchablePseudoComponents($selector);
1043        }
1044        return $this->existsMatchForCssSelector($selector);
1045    }
1046
1047    /**
1048     * Removes pseudo-elements and dynamic pseudo-classes from a CSS selector, replacing them with "*" if necessary.
1049     * If such a pseudo-component is within the argument of `:not`, the entire `:not` component is removed or replaced.
1050     *
1051     * @param string $selector
1052     *
1053     * @return string Selector which will match the relevant DOM elements if the pseudo-classes are assumed to apply,
1054     *                or in the case of pseudo-elements will match their originating element.
1055     */
1056    private function removeUnmatchablePseudoComponents($selector)
1057    {
1058        // The regex allows nested brackets via `(?2)`.
1059        // A space is temporarily prepended because the callback can't determine if the match was at the very start.
1060        $selectorWithoutNots = \ltrim(\preg_replace_callback(
1061            '/(\\s?+):not(\\([^()]*+(?:(?2)[^()]*+)*+\\))/i',
1062            [$this, 'replaceUnmatchableNotComponent'],
1063            ' ' . $selector
1064        ));
1065
1066        $pseudoComponentMatcher = ':(?!' . self::PSEUDO_CLASS_MATCHER . '):?+[\\w\\-]++(?:\\([^\\)]*+\\))?+';
1067        return \preg_replace(
1068            ['/(\\s|^)' . $pseudoComponentMatcher . '/i', '/' . $pseudoComponentMatcher . '/i'],
1069            ['$1*', ''],
1070            $selectorWithoutNots
1071        );
1072    }
1073
1074    /**
1075     * Helps `removeUnmatchablePseudoComponents()` replace or remove a selector `:not(...)` component if its argument
1076     * contains pseudo-elements or dynamic pseudo-classes.
1077     *
1078     * @param string[] $matches array of elements matched by the regular expression
1079     *
1080     * @return string the full match if there were no unmatchable pseudo components within; otherwise, any preceding
1081     *         whitespace followed by "*", or an empty string if there was no preceding whitespace
1082     */
1083    private function replaceUnmatchableNotComponent(array $matches)
1084    {
1085        list($notComponentWithAnyPrecedingWhitespace, $anyPrecedingWhitespace, $notArgumentInBrackets) = $matches;
1086
1087        $hasUnmatchablePseudo = \preg_match(
1088            '/:(?!' . self::PSEUDO_CLASS_MATCHER . ')[\\w\\-:]/i',
1089            $notArgumentInBrackets
1090        );
1091
1092        if ($hasUnmatchablePseudo) {
1093            return $anyPrecedingWhitespace !== '' ? $anyPrecedingWhitespace . '*' : '';
1094        }
1095        return $notComponentWithAnyPrecedingWhitespace;
1096    }
1097
1098    /**
1099     * Checks whether there is at least one matching element for $cssSelector.
1100     * When not in debug mode, it returns true also for invalid selectors (because they may be valid,
1101     * just not implemented/recognized yet by Emogrifier).
1102     *
1103     * @param string $cssSelector
1104     *
1105     * @return bool
1106     *
1107     * @throws \InvalidArgumentException
1108     */
1109    private function existsMatchForCssSelector($cssSelector)
1110    {
1111        try {
1112            $nodesMatchingSelector = $this->xPath->query($this->translateCssToXpath($cssSelector));
1113        } catch (\InvalidArgumentException $e) {
1114            if ($this->debug) {
1115                throw $e;
1116            }
1117            return true;
1118        }
1119
1120        return $nodesMatchingSelector !== false && $nodesMatchingSelector->length !== 0;
1121    }
1122
1123    /**
1124     * Returns CSS content.
1125     *
1126     * @return string
1127     */
1128    private function getCssFromAllStyleNodes()
1129    {
1130        $styleNodes = $this->xPath->query('//style');
1131
1132        if ($styleNodes === false) {
1133            return '';
1134        }
1135
1136        $css = '';
1137        /** @var \DOMNode $styleNode */
1138        foreach ($styleNodes as $styleNode) {
1139            $css .= "\n\n" . $styleNode->nodeValue;
1140            $styleNode->parentNode->removeChild($styleNode);
1141        }
1142
1143        return $css;
1144    }
1145
1146    /**
1147     * Adds a style element with $css to $this->domDocument.
1148     *
1149     * This method is protected to allow overriding.
1150     *
1151     * @see https://github.com/MyIntervals/emogrifier/issues/103
1152     *
1153     * @param string $css
1154     *
1155     * @return void
1156     */
1157    protected function addStyleElementToDocument($css)
1158    {
1159        $styleElement = $this->domDocument->createElement('style', $css);
1160        $styleAttribute = $this->domDocument->createAttribute('type');
1161        $styleAttribute->value = 'text/css';
1162        $styleElement->appendChild($styleAttribute);
1163
1164        $headElement = $this->getHeadElement();
1165        $headElement->appendChild($styleElement);
1166    }
1167
1168    /**
1169     * Checks that $this->domDocument has a BODY element and adds it if it is missing.
1170     *
1171     * @return void
1172     *
1173     * @throws \UnexpectedValueException
1174     */
1175    private function ensureExistenceOfBodyElement()
1176    {
1177        if ($this->domDocument->getElementsByTagName('body')->item(0) !== null) {
1178            return;
1179        }
1180
1181        $htmlElement = $this->domDocument->getElementsByTagName('html')->item(0);
1182        if ($htmlElement === null) {
1183            throw new \UnexpectedValueException('There is no HTML element although there should be one.', 1569930874);
1184        }
1185        $htmlElement->appendChild($this->domDocument->createElement('body'));
1186    }
1187
1188    /**
1189     * Removes comments from the supplied CSS.
1190     *
1191     * @param string $css
1192     *
1193     * @return string CSS with the comments removed
1194     */
1195    private function removeCssComments($css)
1196    {
1197        return \preg_replace('%/\\*[^*]*+(?:\\*(?!/)[^*]*+)*+\\*/%', '', $css);
1198    }
1199
1200    /**
1201     * Extracts `@import` and `@charset` rules from the supplied CSS.  These rules must not be preceded by any other
1202     * rules, or they will be ignored.  (From the CSS 2.1 specification: "CSS 2.1 user agents must ignore any '@import'
1203     * rule that occurs inside a block or after any non-ignored statement other than an @charset or an @import rule."
1204     * Note also that `@charset` is case sensitive whereas `@import` is not.)
1205     *
1206     * @param string $css CSS with comments removed
1207     *
1208     * @return string[] The first element is the CSS with the valid `@import` and `@charset` rules removed.  The second
1209     * element contains a concatenation of the valid `@import` rules, each followed by whatever whitespace followed it
1210     * in the original CSS (so that either unminified or minified formatting is preserved); if there were no `@import`
1211     * rules, it will be an empty string.  The (valid) `@charset` rules are discarded.
1212     */
1213    private function extractImportAndCharsetRules($css)
1214    {
1215        $possiblyModifiedCss = $css;
1216        $importRules = '';
1217
1218        while (
1219            \preg_match(
1220                '/^\\s*+(@((?i)import(?-i)|charset)\\s[^;]++;\\s*+)/',
1221                $possiblyModifiedCss,
1222                $matches
1223            )
1224        ) {
1225            list($fullMatch, $atRuleAndFollowingWhitespace, $atRuleName) = $matches;
1226
1227            if (\strtolower($atRuleName) === 'import') {
1228                $importRules .= $atRuleAndFollowingWhitespace;
1229            }
1230
1231            $possiblyModifiedCss = \substr($possiblyModifiedCss, \strlen($fullMatch));
1232        }
1233
1234        return [$possiblyModifiedCss, $importRules];
1235    }
1236
1237    /**
1238     * Splits input CSS code into an array of parts for different media queries, in order.
1239     * Each part is an array where:
1240     *
1241     * - key "css" will contain clean CSS code (for @media rules this will be the group rule body within "{...}")
1242     * - key "media" will contain "@media " followed by the media query list, for all allowed media queries,
1243     *   or an empty string for CSS not within a media query
1244     *
1245     * Example:
1246     *
1247     * The CSS code
1248     *
1249     *   "@import "file.css"; h1 { color:red; } @media { h1 {}} @media tv { h1 {}}"
1250     *
1251     * will be parsed into the following array:
1252     *
1253     *   0 => [
1254     *     "css" => "h1 { color:red; }",
1255     *     "media" => ""
1256     *   ],
1257     *   1 => [
1258     *     "css" => " h1 {}",
1259     *     "media" => "@media "
1260     *   ]
1261     *
1262     * @param string $css
1263     *
1264     * @return string[][]
1265     */
1266    private function splitCssAndMediaQuery($css)
1267    {
1268        $mediaTypesExpression = '';
1269        if (!empty($this->allowedMediaTypes)) {
1270            $mediaTypesExpression = '|' . \implode('|', \array_keys($this->allowedMediaTypes));
1271        }
1272
1273        $mediaRuleBodyMatcher = '[^{]*+{(?:[^{}]*+{.*})?\\s*+}\\s*+';
1274
1275        $cssSplitForAllowedMediaTypes = \preg_split(
1276            '#(@media\\s++(?:only\\s++)?+(?:(?=[{(])' . $mediaTypesExpression . ')' . $mediaRuleBodyMatcher
1277            . ')#misU',
1278            $css,
1279            -1,
1280            PREG_SPLIT_DELIM_CAPTURE
1281        );
1282
1283        // filter the CSS outside/between allowed @media rules
1284        $cssCleaningMatchers = [
1285            'import/charset directives' => '/\\s*+@(?:import|charset)\\s[^;]++;/i',
1286            'remaining media enclosures' => '/\\s*+@media\\s' . $mediaRuleBodyMatcher . '/isU',
1287        ];
1288
1289        $splitCss = [];
1290        foreach ($cssSplitForAllowedMediaTypes as $index => $cssPart) {
1291            $isMediaRule = $index % 2 !== 0;
1292            if ($isMediaRule) {
1293                \preg_match('/^([^{]*+){(.*)}[^}]*+$/s', $cssPart, $matches);
1294                $splitCss[] = [
1295                    'css' => $matches[2],
1296                    'media' => $matches[1],
1297                ];
1298            } else {
1299                $cleanedCss = \trim(\preg_replace($cssCleaningMatchers, '', $cssPart));
1300                if ($cleanedCss !== '') {
1301                    $splitCss[] = [
1302                        'css' => $cleanedCss,
1303                        'media' => '',
1304                    ];
1305                }
1306            }
1307        }
1308        return $splitCss;
1309    }
1310
1311    /**
1312     * Removes empty unprocessable tags from the DOM document.
1313     *
1314     * @return void
1315     */
1316    private function removeUnprocessableTags()
1317    {
1318        foreach ($this->unprocessableHtmlTags as $tagName) {
1319            // Deleting nodes from a 'live' NodeList invalidates iteration on it, so a copy must be made to iterate.
1320            $nodes = [];
1321            foreach ($this->domDocument->getElementsByTagName($tagName) as $node) {
1322                $nodes[] = $node;
1323            }
1324            /** @var \DOMNode $node */
1325            foreach ($nodes as $node) {
1326                if (!$node->hasChildNodes()) {
1327                    $node->parentNode->removeChild($node);
1328                }
1329            }
1330        }
1331    }
1332
1333    /**
1334     * Makes sure that the passed HTML has a document type.
1335     *
1336     * @param string $html
1337     *
1338     * @return string HTML with document type
1339     */
1340    private function ensureDocumentType($html)
1341    {
1342        $hasDocumentType = \stripos($html, '<!DOCTYPE') !== false;
1343        if ($hasDocumentType) {
1344            return $html;
1345        }
1346
1347        return self::DEFAULT_DOCUMENT_TYPE . $html;
1348    }
1349
1350    /**
1351     * Adds a Content-Type meta tag for the charset.
1352     *
1353     * This method also ensures that there is a HEAD element.
1354     *
1355     * @param string $html
1356     *
1357     * @return string the HTML with the meta tag added
1358     */
1359    private function addContentTypeMetaTag($html)
1360    {
1361        $hasContentTypeMetaTag = \stripos($html, 'Content-Type') !== false;
1362        if ($hasContentTypeMetaTag) {
1363            return $html;
1364        }
1365
1366        // We are trying to insert the meta tag to the right spot in the DOM.
1367        // If we just prepended it to the HTML, we would lose attributes set to the HTML tag.
1368        $hasHeadTag = \stripos($html, '<head') !== false;
1369        $hasHtmlTag = \stripos($html, '<html') !== false;
1370
1371        if ($hasHeadTag) {
1372            $reworkedHtml = \preg_replace('/<head(.*?)>/i', '<head$1>' . self::CONTENT_TYPE_META_TAG, $html);
1373        } elseif ($hasHtmlTag) {
1374            $reworkedHtml = \preg_replace(
1375                '/<html(.*?)>/i',
1376                '<html$1><head>' . self::CONTENT_TYPE_META_TAG . '</head>',
1377                $html
1378            );
1379        } else {
1380            $reworkedHtml = self::CONTENT_TYPE_META_TAG . $html;
1381        }
1382
1383        return $reworkedHtml;
1384    }
1385
1386    /**
1387     * Makes sure that any self-closing tags not recognized as such by PHP's DOMDocument implementation have a
1388     * self-closing slash.
1389     *
1390     * @param string $html
1391     *
1392     * @return string HTML with problematic tags converted.
1393     */
1394    private function ensurePhpUnrecognizedSelfClosingTagsAreXml($html)
1395    {
1396        return \preg_replace(
1397            '%<' . self::PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER . '\\b[^>]*+(?<!/)(?=>)%',
1398            '$0/',
1399            $html
1400        );
1401    }
1402
1403    /**
1404     * @param string[] $a
1405     * @param string[] $b
1406     *
1407     * @return int
1408     */
1409    private function sortBySelectorPrecedence(array $a, array $b)
1410    {
1411        $precedenceA = $this->getCssSelectorPrecedence($a['selector']);
1412        $precedenceB = $this->getCssSelectorPrecedence($b['selector']);
1413
1414        // We want these sorted in ascending order so selectors with lesser precedence get processed first and
1415        // selectors with greater precedence get sorted last.
1416        $precedenceForEquals = ($a['line'] < $b['line'] ? -1 : 1);
1417        $precedenceForNotEquals = ($precedenceA < $precedenceB ? -1 : 1);
1418        return ($precedenceA === $precedenceB) ? $precedenceForEquals : $precedenceForNotEquals;
1419    }
1420
1421    /**
1422     * @param string $selector
1423     *
1424     * @return int
1425     */
1426    private function getCssSelectorPrecedence($selector)
1427    {
1428        $selectorKey = \md5($selector);
1429        if (!isset($this->caches[self::CACHE_KEY_SELECTOR][$selectorKey])) {
1430            $precedence = 0;
1431            foreach ($this->selectorPrecedenceMatchers as $matcher => $value) {
1432                if (\trim($selector) === '') {
1433                    break;
1434                }
1435                $number = 0;
1436                $selector = \preg_replace('/' . $matcher . '\\w+/', '', $selector, -1, $number);
1437                $precedence += ($value * $number);
1438            }
1439            $this->caches[self::CACHE_KEY_SELECTOR][$selectorKey] = $precedence;
1440        }
1441
1442        return $this->caches[self::CACHE_KEY_SELECTOR][$selectorKey];
1443    }
1444
1445    /**
1446     * Maps a CSS selector to an XPath query string.
1447     *
1448     * @see http://plasmasturm.org/log/444/
1449     *
1450     * @param string $cssSelector a CSS selector
1451     *
1452     * @return string the corresponding XPath selector
1453     */
1454    private function translateCssToXpath($cssSelector)
1455    {
1456        $paddedSelector = ' ' . $cssSelector . ' ';
1457        $lowercasePaddedSelector = \preg_replace_callback(
1458            '/\\s+\\w+\\s+/',
1459            static function (array $matches) {
1460                return \strtolower($matches[0]);
1461            },
1462            $paddedSelector
1463        );
1464        $trimmedLowercaseSelector = \trim($lowercasePaddedSelector);
1465        $xPathKey = \md5($trimmedLowercaseSelector);
1466        if (isset($this->caches[self::CACHE_KEY_XPATH][$xPathKey])) {
1467            return $this->caches[self::CACHE_KEY_SELECTOR][$xPathKey];
1468        }
1469
1470        $hasNotSelector = (bool)\preg_match(
1471            '/^([^:]+):not\\(\\s*([[:ascii:]]+)\\s*\\)$/',
1472            $trimmedLowercaseSelector,
1473            $matches
1474        );
1475        if ($hasNotSelector) {
1476            /** @var string[] $matches */
1477            list(, $partBeforeNot, $notContents) = $matches;
1478            $xPath = '//' . $this->translateCssToXpathPass($partBeforeNot) .
1479                '[not(' . $this->translateCssToXpathPassInline($notContents) . ')]';
1480        } else {
1481            $xPath = '//' . $this->translateCssToXpathPass($trimmedLowercaseSelector);
1482        }
1483        $this->caches[self::CACHE_KEY_SELECTOR][$xPathKey] = $xPath;
1484
1485        return $this->caches[self::CACHE_KEY_SELECTOR][$xPathKey];
1486    }
1487
1488    /**
1489     * Flexibly translates the CSS selector $trimmedLowercaseSelector to an xPath selector.
1490     *
1491     * @param string $trimmedLowercaseSelector
1492     *
1493     * @return string
1494     */
1495    private function translateCssToXpathPass($trimmedLowercaseSelector)
1496    {
1497        return $this->translateCssToXpathPassWithMatchClassAttributesCallback(
1498            $trimmedLowercaseSelector,
1499            [$this, 'matchClassAttributes']
1500        );
1501    }
1502
1503    /**
1504     * Flexibly translates the CSS selector $trimmedLowercaseSelector to an xPath selector for inline usage.
1505     *
1506     * @param string $trimmedLowercaseSelector
1507     *
1508     * @return string
1509     */
1510    private function translateCssToXpathPassInline($trimmedLowercaseSelector)
1511    {
1512        return $this->translateCssToXpathPassWithMatchClassAttributesCallback(
1513            $trimmedLowercaseSelector,
1514            [$this, 'matchClassAttributesInline']
1515        );
1516    }
1517
1518    /**
1519     * Flexibly translates the CSS selector $trimmedLowercaseSelector to an xPath selector while using
1520     * $matchClassAttributesCallback as to match the class attributes.
1521     *
1522     * @param string $trimmedLowercaseSelector
1523     * @param callable $matchClassAttributesCallback
1524     *
1525     * @return string
1526     */
1527    private function translateCssToXpathPassWithMatchClassAttributesCallback(
1528        $trimmedLowercaseSelector,
1529        callable $matchClassAttributesCallback
1530    ) {
1531        $roughXpath = \preg_replace(\array_keys($this->xPathRules), $this->xPathRules, $trimmedLowercaseSelector);
1532        $xPathWithIdAttributeMatchers = \preg_replace_callback(
1533            self::ID_ATTRIBUTE_MATCHER,
1534            [$this, 'matchIdAttributes'],
1535            $roughXpath
1536        );
1537        $xPathWithIdAttributeAndClassMatchers = \preg_replace_callback(
1538            self::CLASS_ATTRIBUTE_MATCHER,
1539            $matchClassAttributesCallback,
1540            $xPathWithIdAttributeMatchers
1541        );
1542
1543        // Advanced selectors are going to require a bit more advanced emogrification.
1544        $xPathWithIdAttributeAndClassMatchers = \preg_replace_callback(
1545            '/([^\\/]+):nth-child\\(\\s*(odd|even|[+\\-]?\\d|[+\\-]?\\d?n(\\s*[+\\-]\\s*\\d)?)\\s*\\)/i',
1546            [$this, 'translateNthChild'],
1547            $xPathWithIdAttributeAndClassMatchers
1548        );
1549        $finalXpath = \preg_replace_callback(
1550            '/([^\\/]+):nth-of-type\\(\\s*(odd|even|[+\\-]?\\d|[+\\-]?\\d?n(\\s*[+\\-]\\s*\\d)?)\\s*\\)/i',
1551            [$this, 'translateNthOfType'],
1552            $xPathWithIdAttributeAndClassMatchers
1553        );
1554
1555        return $finalXpath;
1556    }
1557
1558    /**
1559     * @param string[] $match
1560     *
1561     * @return string
1562     */
1563    private function matchIdAttributes(array $match)
1564    {
1565        return ($match[1] !== '' ? $match[1] : '*') . '[@id="' . $match[2] . '"]';
1566    }
1567
1568    /**
1569     * @param string[] $match
1570     *
1571     * @return string xPath class attribute query wrapped in element selector
1572     */
1573    private function matchClassAttributes(array $match)
1574    {
1575        return ($match[1] !== '' ? $match[1] : '*') . '[' . $this->matchClassAttributesInline($match) . ']';
1576    }
1577
1578    /**
1579     * @param string[] $match
1580     *
1581     * @return string xPath class attribute query
1582     */
1583    private function matchClassAttributesInline(array $match)
1584    {
1585        return 'contains(concat(" ",@class," "),concat(" ","' .
1586            \str_replace('.', '"," "))][contains(concat(" ",@class," "),concat(" ","', \substr($match[2], 1)) .
1587            '"," "))';
1588    }
1589
1590    /**
1591     * @param string[] $match
1592     *
1593     * @return string
1594     */
1595    private function translateNthChild(array $match)
1596    {
1597        $parseResult = $this->parseNth($match);
1598
1599        if (isset($parseResult[self::MULTIPLIER])) {
1600            if ($parseResult[self::MULTIPLIER] < 0) {
1601                $parseResult[self::MULTIPLIER] = \abs($parseResult[self::MULTIPLIER]);
1602                $xPathExpression = \sprintf(
1603                    '*[(last() - position()) mod %1%u = %2$u]/self::%3$s',
1604                    $parseResult[self::MULTIPLIER],
1605                    $parseResult[self::INDEX],
1606                    $match[1]
1607                );
1608            } else {
1609                $xPathExpression = \sprintf(
1610                    '*[position() mod %1$u = %2$u]/self::%3$s',
1611                    $parseResult[self::MULTIPLIER],
1612                    $parseResult[self::INDEX],
1613                    $match[1]
1614                );
1615            }
1616        } else {
1617            $xPathExpression = \sprintf('*[%1$u]/self::%2$s', $parseResult[self::INDEX], $match[1]);
1618        }
1619
1620        return $xPathExpression;
1621    }
1622
1623    /**
1624     * @param string[] $match
1625     *
1626     * @return string
1627     */
1628    private function translateNthOfType(array $match)
1629    {
1630        $parseResult = $this->parseNth($match);
1631
1632        if (isset($parseResult[self::MULTIPLIER])) {
1633            if ($parseResult[self::MULTIPLIER] < 0) {
1634                $parseResult[self::MULTIPLIER] = \abs($parseResult[self::MULTIPLIER]);
1635                $xPathExpression = \sprintf(
1636                    '%1$s[(last() - position()) mod %2$u = %3$u]',
1637                    $match[1],
1638                    $parseResult[self::MULTIPLIER],
1639                    $parseResult[self::INDEX]
1640                );
1641            } else {
1642                $xPathExpression = \sprintf(
1643                    '%1$s[position() mod %2$u = %3$u]',
1644                    $match[1],
1645                    $parseResult[self::MULTIPLIER],
1646                    $parseResult[self::INDEX]
1647                );
1648            }
1649        } else {
1650            $xPathExpression = \sprintf('%1$s[%2$u]', $match[1], $parseResult[self::INDEX]);
1651        }
1652
1653        return $xPathExpression;
1654    }
1655
1656    /**
1657     * @param string[] $match
1658     *
1659     * @return int[]
1660     */
1661    private function parseNth(array $match)
1662    {
1663        if (\in_array(\strtolower($match[2]), ['even', 'odd'], true)) {
1664            // we have "even" or "odd"
1665            $index = \strtolower($match[2]) === 'even' ? 0 : 1;
1666            return [self::MULTIPLIER => 2, self::INDEX => $index];
1667        }
1668        if (\stripos($match[2], 'n') === false) {
1669            // if there is a multiplier
1670            $index = (int)\str_replace(' ', '', $match[2]);
1671            return [self::INDEX => $index];
1672        }
1673
1674        if (isset($match[3])) {
1675            $multipleTerm = \str_replace($match[3], '', $match[2]);
1676            $index = (int)\str_replace(' ', '', $match[3]);
1677        } else {
1678            $multipleTerm = $match[2];
1679            $index = 0;
1680        }
1681
1682        $multiplier = \str_ireplace('n', '', $multipleTerm);
1683
1684        if ($multiplier === '') {
1685            $multiplier = 1;
1686        } elseif ($multiplier === '0') {
1687            return [self::INDEX => $index];
1688        } else {
1689            $multiplier = (int)$multiplier;
1690        }
1691
1692        while ($index < 0) {
1693            $index += \abs($multiplier);
1694        }
1695
1696        return [self::MULTIPLIER => $multiplier, self::INDEX => $index];
1697    }
1698
1699    /**
1700     * Parses a CSS declaration block into property name/value pairs.
1701     *
1702     * Example:
1703     *
1704     * The declaration block
1705     *
1706     *   "color: #000; font-weight: bold;"
1707     *
1708     * will be parsed into the following array:
1709     *
1710     *   "color" => "#000"
1711     *   "font-weight" => "bold"
1712     *
1713     * @param string $cssDeclarationsBlock the CSS declarations block without the curly braces, may be empty
1714     *
1715     * @return string[]
1716     *         the CSS declarations with the property names as array keys and the property values as array values
1717     */
1718    private function parseCssDeclarationsBlock($cssDeclarationsBlock)
1719    {
1720        if (isset($this->caches[self::CACHE_KEY_CSS_DECLARATIONS_BLOCK][$cssDeclarationsBlock])) {
1721            return $this->caches[self::CACHE_KEY_CSS_DECLARATIONS_BLOCK][$cssDeclarationsBlock];
1722        }
1723
1724        $properties = [];
1725        foreach (\preg_split('/;(?!base64|charset)/', $cssDeclarationsBlock) as $declaration) {
1726            $matches = [];
1727            if (!\preg_match('/^([A-Za-z\\-]+)\\s*:\\s*(.+)$/s', \trim($declaration), $matches)) {
1728                continue;
1729            }
1730
1731            $propertyName = \strtolower($matches[1]);
1732            $propertyValue = $matches[2];
1733            $properties[$propertyName] = $propertyValue;
1734        }
1735        $this->caches[self::CACHE_KEY_CSS_DECLARATIONS_BLOCK][$cssDeclarationsBlock] = $properties;
1736
1737        return $properties;
1738    }
1739
1740    /**
1741     * Find the nodes that are not to be emogrified.
1742     *
1743     * @return \DOMElement[]
1744     *
1745     * @throws \InvalidArgumentException
1746     */
1747    private function getNodesToExclude()
1748    {
1749        $excludedNodes = [];
1750        foreach (\array_keys($this->excludedSelectors) as $selectorToExclude) {
1751            try {
1752                $matchingNodes = $this->xPath->query($this->translateCssToXpath($selectorToExclude));
1753            } catch (\InvalidArgumentException $e) {
1754                if ($this->debug) {
1755                    throw $e;
1756                }
1757                continue;
1758            }
1759            foreach ($matchingNodes as $node) {
1760                $excludedNodes[] = $node;
1761            }
1762        }
1763
1764        return $excludedNodes;
1765    }
1766
1767    /**
1768     * Handles invalid xPath expression warnings, generated during the process() method,
1769     * during querying \DOMDocument and trigger an \InvalidArgumentException with an invalid selector
1770     * or \RuntimeException, depending on the source of the warning.
1771     *
1772     * @param int $type
1773     * @param string $message
1774     * @param string $file
1775     * @param int $line
1776     * @param array $context
1777     *
1778     * @return bool always false
1779     *
1780     * @throws \InvalidArgumentException
1781     * @throws \RuntimeException
1782     */
1783    public function handleXpathQueryWarnings(// phpcs:ignore Generic.CodeAnalysis.UnusedFunctionParameter
1784        $type,
1785        $message,
1786        $file,
1787        $line,
1788        array $context
1789    ) {
1790        $selector = '';
1791        if (isset($context['cssRule']['selector'])) {
1792            // warnings generated by invalid/unrecognized selectors in method process()
1793            $selector = $context['cssRule']['selector'];
1794        } elseif (isset($context['selectorToExclude'])) {
1795            // warnings generated by invalid/unrecognized selectors in method getNodesToExclude()
1796            $selector = $context['selectorToExclude'];
1797        } elseif (isset($context['cssSelector'])) {
1798            // warnings generated by invalid/unrecognized selectors in method existsMatchForCssSelector()
1799            $selector = $context['cssSelector'];
1800        }
1801
1802        if ($selector !== '') {
1803            throw new \InvalidArgumentException(
1804                \sprintf('%1$s in selector >> %2$s << in %3$s on line %4$u', $message, $selector, $file, $line),
1805                1509279985
1806            );
1807        }
1808
1809        // Catches eventual warnings generated by method getAllNodesWithStyleAttribute()
1810        if (isset($context['xPath'])) {
1811            throw new \RuntimeException(
1812                \sprintf('%1$s in %2$s on line %3$u', $message, $file, $line),
1813                1509280067
1814            );
1815        }
1816
1817        // the normal error handling continues when handler return false
1818        return false;
1819    }
1820
1821    /**
1822     * Sets the debug mode.
1823     *
1824     * @param bool $debug set to true to enable debug mode
1825     *
1826     * @return void
1827     */
1828    public function setDebug($debug)
1829    {
1830        $this->debug = $debug;
1831    }
1832}
1833