1<?php
2/**
3 * Zend Framework (http://framework.zend.com/)
4 *
5 * @link      http://github.com/zendframework/zf2 for the canonical source repository
6 * @copyright Copyright (c) 2005-2015 Zend Technologies USA Inc. (http://www.zend.com)
7 * @license   http://framework.zend.com/license/new-bsd New BSD License
8 */
9
10namespace Zend\Escaper;
11
/**
 * Context specific methods for use in secure output escaping.
 *
 * Each public escape*() method targets exactly one output context (HTML body,
 * HTML attribute, Javascript, CSS, URL). Pick the method matching the context
 * the value is written into.
 */
class Escaper
{
    /**
     * Entity Map mapping Unicode codepoints to any available named HTML entities.
     *
     * While HTML supports far more named entities, the lowest common denominator
     * has become HTML5's XML Serialisation which is restricted to those named
     * entities that XML supports. Using HTML entities would result in this error:
     *     XML Parsing Error: undefined entity
     *
     * @var array
     */
    protected static $htmlNamedEntityMap = array(
        34 => 'quot',         // quotation mark
        38 => 'amp',          // ampersand
        60 => 'lt',           // less-than sign
        62 => 'gt',           // greater-than sign
    );

    /**
     * Current encoding for escaping. If not UTF-8, we convert strings from this encoding
     * pre-escaping and back to this encoding post-escaping.
     *
     * @var string
     */
    protected $encoding = 'utf-8';

    /**
     * Holds the value of the special flags passed as second parameter to
     * htmlspecialchars(). The constructor adds ENT_SUBSTITUTE when that
     * constant is defined (PHP 5.4+) so that invalid UTF-8 sequences are
     * substituted instead of producing an empty string.
     *
     * @var int
     */
    protected $htmlSpecialCharsFlags = ENT_QUOTES;

    /**
     * Callback (bound to this instance) which escapes characters for HTML Attribute contexts
     *
     * @var callable
     */
    protected $htmlAttrMatcher;

    /**
     * Callback (bound to this instance) which escapes characters for Javascript contexts
     *
     * @var callable
     */
    protected $jsMatcher;

    /**
     * Callback (bound to this instance) which escapes characters for CSS contexts
     *
     * @var callable
     */
    protected $cssMatcher;

    /**
     * List of all encodings supported by this class (lower-case names).
     *
     * @var array
     */
    protected $supportedEncodings = array(
        'iso-8859-1',   'iso8859-1',    'iso-8859-5',   'iso8859-5',
        'iso-8859-15',  'iso8859-15',   'utf-8',        'cp866',
        'ibm866',       '866',          'cp1251',       'windows-1251',
        'win-1251',     '1251',         'cp1252',       'windows-1252',
        '1252',         'koi8-r',       'koi8-ru',      'koi8r',
        'big5',         '950',          'gb2312',       '936',
        'big5-hkscs',   'shift_jis',    'sjis',         'sjis-win',
        'cp932',        '932',          'euc-jp',       'eucjp',
        'eucjp-win',    'macroman'
    );

    /**
     * Constructor: Single parameter allows setting of global encoding for use by
     * the current object. If ENT_SUBSTITUTE is available (PHP 5.4+), that flag
     * is additionally set for htmlspecialchars() calls.
     *
     * @param string|null $encoding Case-insensitive encoding name; null keeps the default 'utf-8'
     * @throws Exception\InvalidArgumentException If the encoding is blank or not supported
     */
    public function __construct($encoding = null)
    {
        if ($encoding !== null) {
            $encoding = (string) $encoding;
            if ($encoding === '') {
                throw new Exception\InvalidArgumentException(
                    get_class($this) . ' constructor parameter does not allow a blank value'
                );
            }

            // The supported list is stored lower-case, so normalise before the lookup.
            $encoding = strtolower($encoding);
            if (!in_array($encoding, $this->supportedEncodings, true)) {
                throw new Exception\InvalidArgumentException(
                    'Value of \'' . $encoding . '\' passed to ' . get_class($this)
                    . ' constructor parameter is invalid. Provide an encoding supported by htmlspecialchars()'
                );
            }

            $this->encoding = $encoding;
        }

        if (defined('ENT_SUBSTITUTE')) {
            $this->htmlSpecialCharsFlags |= ENT_SUBSTITUTE;
        }

        // set matcher callbacks
        $this->htmlAttrMatcher = array($this, 'htmlAttrMatcher');
        $this->jsMatcher       = array($this, 'jsMatcher');
        $this->cssMatcher      = array($this, 'cssMatcher');
    }

    /**
     * Return the encoding that all output/input is expected to be encoded in.
     *
     * @return string
     */
    public function getEncoding()
    {
        return $this->encoding;
    }

    /**
     * Escape a string for the HTML Body context where there are very few characters
     * of special meaning. Internally this will use htmlspecialchars().
     *
     * @param string $string
     * @return string
     */
    public function escapeHtml($string)
    {
        return htmlspecialchars($string, $this->htmlSpecialCharsFlags, $this->encoding);
    }

    /**
     * Escape a string for the HTML Attribute context. We use an extended set of characters
     * to escape that are not covered by htmlspecialchars() to cover cases where an attribute
     * might be unquoted or quoted illegally (e.g. backticks are valid quotes for IE).
     *
     * Everything except ASCII alphanumerics and the characters , . - _ is escaped.
     *
     * @param string $string
     * @throws Exception\RuntimeException If the input is not valid in the configured encoding
     * @return string
     */
    public function escapeHtmlAttr($string)
    {
        $string = $this->toUtf8($string);
        // Empty and all-digit strings contain nothing that needs escaping.
        if ($string === '' || ctype_digit($string)) {
            return $string;
        }

        $result = preg_replace_callback('/[^a-z0-9,\.\-_]/iSu', $this->htmlAttrMatcher, $string);
        return $this->fromUtf8($result);
    }

    /**
     * Escape a string for the Javascript context. This does not use json_encode(). An extended
     * set of characters are escaped beyond ECMAScript's rules for Javascript literal string
     * escaping in order to prevent misinterpretation of Javascript as HTML leading to the
     * injection of special characters and entities. The escaping used should be tolerant
     * of cases where HTML escaping was not applied on top of Javascript escaping correctly.
     * Backslash escaping is not used as it still leaves the escaped character as-is and so
     * is not useful in a HTML context.
     *
     * Everything except ASCII alphanumerics and the characters , . _ is escaped.
     *
     * @param string $string
     * @throws Exception\RuntimeException If the input is not valid in the configured encoding
     * @return string
     */
    public function escapeJs($string)
    {
        $string = $this->toUtf8($string);
        // Empty and all-digit strings contain nothing that needs escaping.
        if ($string === '' || ctype_digit($string)) {
            return $string;
        }

        $result = preg_replace_callback('/[^a-z0-9,\._]/iSu', $this->jsMatcher, $string);
        return $this->fromUtf8($result);
    }

    /**
     * Escape a string for the URI or Parameter contexts. This should not be used to escape
     * an entire URI - only a subcomponent being inserted. The function is a simple proxy
     * to rawurlencode() which now implements RFC 3986 since PHP 5.3 completely.
     *
     * @param string $string
     * @return string
     */
    public function escapeUrl($string)
    {
        return rawurlencode($string);
    }

    /**
     * Escape a string for the CSS context. CSS escaping can be applied to any string being
     * inserted into CSS and escapes everything except alphanumerics.
     *
     * @param string $string
     * @throws Exception\RuntimeException If the input is not valid in the configured encoding
     * @return string
     */
    public function escapeCss($string)
    {
        $string = $this->toUtf8($string);
        // Empty and all-digit strings contain nothing that needs escaping.
        if ($string === '' || ctype_digit($string)) {
            return $string;
        }

        $result = preg_replace_callback('/[^a-z0-9]/iSu', $this->cssMatcher, $string);
        return $this->fromUtf8($result);
    }

    /**
     * Callback function for preg_replace_callback that applies HTML Attribute
     * escaping to all matches.
     *
     * @param array $matches Match data; index 0 holds one UTF-8 encoded character
     * @return string Named entity, upper-case hex entity, or the U+FFFD entity
     */
    protected function htmlAttrMatcher($matches)
    {
        $chr = $matches[0];
        // ord() only looks at the first byte, so for multi-byte characters this
        // is the UTF-8 lead byte, not the code point.
        $ord = ord($chr);

        /**
         * The following replaces characters undefined in HTML with the
         * hex entity for the Unicode replacement character.
         */
        if (($ord <= 0x1f && $chr != "\t" && $chr != "\n" && $chr != "\r")
            || ($ord >= 0x7f && $ord <= 0x9f)
        ) {
            return '&#xFFFD;';
        }

        /**
         * Check if the current character to escape has a name entity we should
         * replace it with while grabbing the integer value of the character.
         *
         * Multi-byte characters are converted to UTF-32BE so that the resulting
         * bytes are the code point itself. UTF-16BE would yield a surrogate pair
         * for characters above U+FFFF and therefore an invalid entity value.
         */
        if (strlen($chr) > 1) {
            $chr = $this->convertEncoding($chr, 'UTF-32BE', 'UTF-8');
        }

        $hex = bin2hex($chr);
        $ord = hexdec($hex);
        if (isset(static::$htmlNamedEntityMap[$ord])) {
            return '&' . static::$htmlNamedEntityMap[$ord] . ';';
        }

        /**
         * Per OWASP recommendations, we'll use upper hex entities
         * for any other characters where a named entity does not exist.
         */
        if ($ord > 255) {
            return sprintf('&#x%04X;', $ord);
        }
        return sprintf('&#x%02X;', $ord);
    }

    /**
     * Callback function for preg_replace_callback that applies Javascript
     * escaping to all matches.
     *
     * Single-byte characters become \xHH. Multi-byte characters become \uHHHH;
     * characters above U+FFFF become two \uHHHH escapes (high and low surrogate)
     * because a \u escape carries exactly four hex digits.
     *
     * @param array $matches Match data; index 0 holds one UTF-8 encoded character
     * @return string
     */
    protected function jsMatcher($matches)
    {
        $chr = $matches[0];
        if (strlen($chr) == 1) {
            return sprintf('\\x%02X', ord($chr));
        }

        $chr = $this->convertEncoding($chr, 'UTF-16BE', 'UTF-8');
        $hex = strtoupper(bin2hex($chr));
        if (strlen($hex) <= 4) {
            return sprintf('\\u%04s', $hex);
        }

        // Four UTF-16 bytes: a surrogate pair, emitted as two separate escapes.
        $highSurrogate = substr($hex, 0, 4);
        $lowSurrogate  = substr($hex, 4, 4);
        return sprintf('\\u%04s\\u%04s', $highSurrogate, $lowSurrogate);
    }

    /**
     * Callback function for preg_replace_callback that applies CSS
     * escaping to all matches.
     *
     * @param array $matches Match data; index 0 holds one UTF-8 encoded character
     * @return string Backslash, upper-case hex code point and a terminating space
     */
    protected function cssMatcher($matches)
    {
        $chr = $matches[0];
        if (strlen($chr) == 1) {
            $ord = ord($chr);
        } else {
            // UTF-32BE bytes are the code point itself, also above U+FFFF.
            $chr = $this->convertEncoding($chr, 'UTF-32BE', 'UTF-8');
            $ord = hexdec(bin2hex($chr));
        }
        return sprintf('\\%X ', $ord);
    }

    /**
     * Converts a string to UTF-8 from the base encoding. The base encoding is set via this
     * class' constructor.
     *
     * @param string $string
     * @throws Exception\RuntimeException If the (converted) string is not valid UTF-8
     * @return string
     */
    protected function toUtf8($string)
    {
        if ($this->getEncoding() === 'utf-8') {
            $result = $string;
        } else {
            $result = $this->convertEncoding($string, 'UTF-8', $this->getEncoding());
        }

        if (!$this->isUtf8($result)) {
            throw new Exception\RuntimeException(
                sprintf('String to be escaped was not valid UTF-8 or could not be converted: %s', $result)
            );
        }

        return $result;
    }

    /**
     * Converts a string from UTF-8 to the base encoding. The base encoding is set via this
     * class' constructor.
     *
     * @param string $string
     * @return string
     */
    protected function fromUtf8($string)
    {
        if ($this->getEncoding() === 'utf-8') {
            return $string;
        }

        return $this->convertEncoding($string, $this->getEncoding(), 'UTF-8');
    }

    /**
     * Checks if a given string appears to be valid UTF-8 or not.
     *
     * Relies on the "u" modifier: preg_match() does not return a match when
     * the subject is not valid UTF-8.
     *
     * @param string $string
     * @return bool
     */
    protected function isUtf8($string)
    {
        return ($string === '' || preg_match('/^./su', $string));
    }

    /**
     * Encoding conversion helper which wraps iconv and mbstring where they exist or throws
     * an exception where neither is available.
     *
     * @param string $string
     * @param string $to
     * @param array|string $from
     * @throws Exception\RuntimeException If neither iconv nor mbstring is available
     * @return string Converted string, or '' when the conversion fails
     */
    protected function convertEncoding($string, $to, $from)
    {
        if (function_exists('iconv')) {
            $result = iconv($from, $to, $string);
        } elseif (function_exists('mb_convert_encoding')) {
            $result = mb_convert_encoding($string, $to, $from);
        } else {
            throw new Exception\RuntimeException(
                get_class($this)
                . ' requires either the iconv or mbstring extension to be installed'
                . ' when escaping for non UTF-8 strings.'
            );
        }

        if ($result === false) {
            return ''; // return non-fatal blank string on encoding errors from users
        }
        return $result;
    }
}
387