1<?php
2
3/**
4 * @see       https://github.com/laminas/laminas-escaper for the canonical source repository
5 * @copyright https://github.com/laminas/laminas-escaper/blob/master/COPYRIGHT.md
6 * @license   https://github.com/laminas/laminas-escaper/blob/master/LICENSE.md New BSD License
7 */
8
9namespace Laminas\Escaper;
10
11/**
12 * Context specific methods for use in secure output escaping
13 */
class Escaper
{
    /**
     * Entity Map mapping Unicode codepoints to any available named HTML entities.
     *
     * While HTML supports far more named entities, the lowest common denominator
     * has become HTML5's XML Serialisation, which is restricted to those named
     * entities that XML supports. Using other HTML entities would result in this error:
     *     XML Parsing Error: undefined entity
     *
     * @var array
     */
    protected static $htmlNamedEntityMap = [
        34 => 'quot',         // quotation mark
        38 => 'amp',          // ampersand
        60 => 'lt',           // less-than sign
        62 => 'gt',           // greater-than sign
    ];

    /**
     * Current encoding for escaping. If not UTF-8, we convert strings from this encoding
     * pre-escaping and back to this encoding post-escaping.
     *
     * @var string
     */
    protected $encoding = 'utf-8';

    /**
     * Holds the value of the special flags passed as second parameter to
     * htmlspecialchars().
     *
     * @var int
     */
    protected $htmlSpecialCharsFlags;

    /**
     * Static Matcher which escapes characters for HTML Attribute contexts
     *
     * @var callable
     */
    protected $htmlAttrMatcher;

    /**
     * Static Matcher which escapes characters for Javascript contexts
     *
     * @var callable
     */
    protected $jsMatcher;

    /**
     * Static Matcher which escapes characters for CSS Attribute contexts
     *
     * @var callable
     */
    protected $cssMatcher;

    /**
     * List of all encodings supported by this class (lower-case names).
     *
     * @var array
     */
    protected $supportedEncodings = [
        'iso-8859-1',   'iso8859-1',    'iso-8859-5',   'iso8859-5',
        'iso-8859-15',  'iso8859-15',   'utf-8',        'cp866',
        'ibm866',       '866',          'cp1251',       'windows-1251',
        'win-1251',     '1251',         'cp1252',       'windows-1252',
        '1252',         'koi8-r',       'koi8-ru',      'koi8r',
        'big5',         '950',          'gb2312',       '936',
        'big5-hkscs',   'shift_jis',    'sjis',         'sjis-win',
        'cp932',        '932',          'euc-jp',       'eucjp',
        'eucjp-win',    'macroman'
    ];

    /**
     * Constructor: Single parameter allows setting of global encoding for use by
     * the current object. When omitted (null), the default of 'utf-8' is kept.
     *
     * The encoding name is lower-cased and must then match one of the entries of
     * $supportedEncodings exactly.
     *
     * @param string $encoding
     * @throws Exception\InvalidArgumentException If the encoding is not a string, is
     *     blank, or is not one of the supported encodings.
     */
    public function __construct($encoding = null)
    {
        if ($encoding !== null) {
            if (! is_string($encoding)) {
                throw new Exception\InvalidArgumentException(
                    get_class($this) . ' constructor parameter must be a string, received ' . gettype($encoding)
                );
            }
            if ($encoding === '') {
                throw new Exception\InvalidArgumentException(
                    get_class($this) . ' constructor parameter does not allow a blank value'
                );
            }

            $encoding = strtolower($encoding);

            // Strict comparison is required: with loose comparison PHP compares numeric
            // strings numerically, so values such as '866.0' or '9.5e2' would be accepted
            // as if they were the code pages '866' or '950'.
            if (! in_array($encoding, $this->supportedEncodings, true)) {
                throw new Exception\InvalidArgumentException(
                    'Value of \'' . $encoding . '\' passed to ' . get_class($this)
                    . ' constructor parameter is invalid. Provide an encoding supported by htmlspecialchars()'
                );
            }

            $this->encoding = $encoding;
        }

        // We take advantage of ENT_SUBSTITUTE flag to correctly deal with invalid UTF-8 sequences.
        $this->htmlSpecialCharsFlags = ENT_QUOTES | ENT_SUBSTITUTE;

        // set matcher callbacks
        $this->htmlAttrMatcher = [$this, 'htmlAttrMatcher'];
        $this->jsMatcher       = [$this, 'jsMatcher'];
        $this->cssMatcher      = [$this, 'cssMatcher'];
    }

    /**
     * Return the encoding that all output/input is expected to be encoded in.
     *
     * @return string
     */
    public function getEncoding()
    {
        return $this->encoding;
    }

    /**
     * Escape a string for the HTML Body context where there are very few characters
     * of special meaning. Internally this will use htmlspecialchars() with the flags
     * and encoding configured by the constructor.
     *
     * @param string $string
     * @return string
     */
    public function escapeHtml($string)
    {
        return htmlspecialchars($string, $this->htmlSpecialCharsFlags, $this->encoding);
    }

    /**
     * Escape a string for the HTML Attribute context. We use an extended set of characters
     * to escape that are not covered by htmlspecialchars() to cover cases where an attribute
     * might be unquoted or quoted illegally (e.g. backticks are valid quotes for IE).
     *
     * Empty strings and strings consisting only of digits are returned without escaping.
     *
     * @param string $string
     * @throws Exception\RuntimeException If the input is not valid in the configured encoding.
     * @return string
     */
    public function escapeHtmlAttr($string)
    {
        $string = $this->toUtf8($string);
        if ($string === '' || ctype_digit($string)) {
            return $string;
        }

        // Everything except ASCII alphanumerics and , . - _ is handed to the matcher.
        $result = preg_replace_callback('/[^a-z0-9,\.\-_]/iSu', $this->htmlAttrMatcher, $string);
        return $this->fromUtf8($result);
    }

    /**
     * Escape a string for the Javascript context. This does not use json_encode(). An extended
     * set of characters are escaped beyond ECMAScript's rules for Javascript literal string
     * escaping in order to prevent misinterpretation of Javascript as HTML leading to the
     * injection of special characters and entities. The escaping used should be tolerant
     * of cases where HTML escaping was not applied on top of Javascript escaping correctly.
     * Backslash escaping is not used as it still leaves the escaped character as-is and so
     * is not useful in a HTML context.
     *
     * Empty strings and strings consisting only of digits are returned without escaping.
     *
     * @param string $string
     * @throws Exception\RuntimeException If the input is not valid in the configured encoding.
     * @return string
     */
    public function escapeJs($string)
    {
        $string = $this->toUtf8($string);
        if ($string === '' || ctype_digit($string)) {
            return $string;
        }

        // Everything except ASCII alphanumerics and , . _ is handed to the matcher.
        $result = preg_replace_callback('/[^a-z0-9,\._]/iSu', $this->jsMatcher, $string);
        return $this->fromUtf8($result);
    }

    /**
     * Escape a string for the URI or Parameter contexts. This should not be used to escape
     * an entire URI - only a subcomponent being inserted. The function is a simple proxy
     * to rawurlencode(), which has fully implemented RFC 3986 since PHP 5.3.
     *
     * @param string $string
     * @return string
     */
    public function escapeUrl($string)
    {
        return rawurlencode($string);
    }

    /**
     * Escape a string for the CSS context. CSS escaping can be applied to any string being
     * inserted into CSS and escapes everything except alphanumerics.
     *
     * Empty strings and strings consisting only of digits are returned without escaping.
     *
     * @param string $string
     * @throws Exception\RuntimeException If the input is not valid in the configured encoding.
     * @return string
     */
    public function escapeCss($string)
    {
        $string = $this->toUtf8($string);
        if ($string === '' || ctype_digit($string)) {
            return $string;
        }

        $result = preg_replace_callback('/[^a-z0-9]/iSu', $this->cssMatcher, $string);
        return $this->fromUtf8($result);
    }

    /**
     * Callback function for preg_replace_callback that applies HTML Attribute
     * escaping to all matches.
     *
     * Each match is a single UTF-8 character. It is replaced by '&#xFFFD;' when it is
     * undefined in HTML, by a named entity when one exists in $htmlNamedEntityMap, and
     * by an upper-case hexadecimal entity otherwise.
     *
     * @param array $matches
     * @return string
     */
    protected function htmlAttrMatcher($matches)
    {
        $chr = $matches[0];

        /**
         * Resolve the full Unicode code point before any range check. Looking only
         * at the first byte of a multi-byte UTF-8 sequence would make the C1 control
         * range (U+0080 - U+009F, encoded as two bytes) impossible to detect.
         */
        if (strlen($chr) === 1) {
            $ord = ord($chr);
        } else {
            $ord = hexdec(bin2hex($this->convertEncoding($chr, 'UTF-32BE', 'UTF-8')));
        }

        /**
         * The following replaces characters undefined in HTML with the
         * hex entity for the Unicode replacement character.
         */
        if (($ord <= 0x1f && $chr !== "\t" && $chr !== "\n" && $chr !== "\r")
            || ($ord >= 0x7f && $ord <= 0x9f)
        ) {
            return '&#xFFFD;';
        }

        /**
         * Check if the current character to escape has a named entity we should
         * replace it with.
         */
        if (isset(static::$htmlNamedEntityMap[$ord])) {
            return '&' . static::$htmlNamedEntityMap[$ord] . ';';
        }

        /**
         * Per OWASP recommendations, we'll use upper hex entities
         * for any other characters where a named entity does not exist.
         */
        if ($ord > 255) {
            return sprintf('&#x%04X;', $ord);
        }
        return sprintf('&#x%02X;', $ord);
    }

    /**
     * Callback function for preg_replace_callback that applies Javascript
     * escaping to all matches.
     *
     * Single-byte characters become \xHH; multi-byte characters are converted to
     * UTF-16BE and emitted as one \uHHHH unit, or as two units (a surrogate pair)
     * when the conversion yields more than one 16-bit unit.
     *
     * @param array $matches
     * @return string
     */
    protected function jsMatcher($matches)
    {
        $chr = $matches[0];
        if (strlen($chr) === 1) {
            return sprintf('\\x%02X', ord($chr));
        }

        $hex = strtoupper(bin2hex($this->convertEncoding($chr, 'UTF-16BE', 'UTF-8')));
        if (strlen($hex) <= 4) {
            return sprintf('\\u%04s', $hex);
        }

        // More than four hex digits: split into high and low 16-bit units.
        $highSurrogate = substr($hex, 0, 4);
        $lowSurrogate  = substr($hex, 4, 4);
        return sprintf('\\u%04s\\u%04s', $highSurrogate, $lowSurrogate);
    }

    /**
     * Callback function for preg_replace_callback that applies CSS
     * escaping to all matches.
     *
     * Each match is replaced by a backslash, the upper-case hexadecimal code point
     * and a single trailing space that terminates the escape sequence.
     *
     * @param array $matches
     * @return string
     */
    protected function cssMatcher($matches)
    {
        $chr = $matches[0];
        if (strlen($chr) === 1) {
            $ord = ord($chr);
        } else {
            $chr = $this->convertEncoding($chr, 'UTF-32BE', 'UTF-8');
            $ord = hexdec(bin2hex($chr));
        }
        return sprintf('\\%X ', $ord);
    }

    /**
     * Converts a string to UTF-8 from the base encoding. The base encoding is set via this
     * class' constructor.
     *
     * @param string $string
     * @throws Exception\RuntimeException If the (converted) string is not valid UTF-8.
     * @return string
     */
    protected function toUtf8($string)
    {
        if ($this->getEncoding() === 'utf-8') {
            $result = $string;
        } else {
            $result = $this->convertEncoding($string, 'UTF-8', $this->getEncoding());
        }

        if (! $this->isUtf8($result)) {
            throw new Exception\RuntimeException(
                sprintf('String to be escaped was not valid UTF-8 or could not be converted: %s', $result)
            );
        }

        return $result;
    }

    /**
     * Converts a string from UTF-8 to the base encoding. The base encoding is set via this
     * class' constructor.
     *
     * @param string $string
     * @return string
     */
    protected function fromUtf8($string)
    {
        if ($this->getEncoding() === 'utf-8') {
            return $string;
        }

        return $this->convertEncoding($string, $this->getEncoding(), 'UTF-8');
    }

    /**
     * Checks if a given string appears to be valid UTF-8 or not.
     *
     * An empty string is considered valid. Otherwise the check relies on a
     * preg_match() call using the "u" (UTF-8) pattern modifier.
     *
     * @param string $string
     * @return bool
     */
    protected function isUtf8($string)
    {
        return ($string === '' || preg_match('/^./su', $string));
    }

    /**
     * Encoding conversion helper which wraps iconv and mbstring where they exist or throws
     * an exception where neither is available. iconv is preferred when both are present.
     *
     * @param string $string
     * @param string $to
     * @param array|string $from
     * @throws Exception\RuntimeException If neither iconv nor mbstring is available.
     * @return string The converted string, or an empty string if the conversion failed.
     */
    protected function convertEncoding($string, $to, $from)
    {
        if (function_exists('iconv')) {
            $result = iconv($from, $to, $string);
        } elseif (function_exists('mb_convert_encoding')) {
            $result = mb_convert_encoding($string, $to, $from);
        } else {
            throw new Exception\RuntimeException(
                get_class($this)
                . ' requires either the iconv or mbstring extension to be installed'
                . ' when escaping for non UTF-8 strings.'
            );
        }

        if ($result === false) {
            return ''; // return non-fatal blank string on encoding errors from users
        }
        return $result;
    }
}
392