<?php
/**
 * Zend Framework (http://framework.zend.com/)
 *
 * @link      http://github.com/zendframework/zf2 for the canonical source repository
 * @copyright Copyright (c) 2005-2015 Zend Technologies USA Inc. (http://www.zend.com)
 * @license   http://framework.zend.com/license/new-bsd New BSD License
 */

namespace Zend\Escaper;

/**
 * Context specific methods for use in secure output escaping
 */
class Escaper
{
    /**
     * Entity Map mapping Unicode codepoints to any available named HTML entities.
     *
     * While HTML supports far more named entities, the lowest common denominator
     * has become HTML5's XML Serialisation which is restricted to those named
     * entities that XML supports. Using HTML entities would result in this error:
     *     XML Parsing Error: undefined entity
     *
     * @var array
     */
    protected static $htmlNamedEntityMap = array(
        34 => 'quot',         // quotation mark
        38 => 'amp',          // ampersand
        60 => 'lt',           // less-than sign
        62 => 'gt',           // greater-than sign
    );

    /**
     * Current encoding for escaping. If not UTF-8, we convert strings from this encoding
     * pre-escaping and back to this encoding post-escaping.
     *
     * @var string
     */
    protected $encoding = 'utf-8';

    /**
     * Holds the value of the special flags passed as second parameter to
     * htmlspecialchars(). We modify these for PHP 5.4 to take advantage
     * of the new ENT_SUBSTITUTE flag for correctly dealing with invalid
     * UTF-8 sequences.
     *
     * @var int
     */
    protected $htmlSpecialCharsFlags = ENT_QUOTES;

    /**
     * Static Matcher which escapes characters for HTML Attribute contexts
     *
     * @var callable
     */
    protected $htmlAttrMatcher;

    /**
     * Static Matcher which escapes characters for Javascript contexts
     *
     * @var callable
     */
    protected $jsMatcher;

    /**
     * Static Matcher which escapes characters for CSS Attribute contexts
     *
     * @var callable
     */
    protected $cssMatcher;

    /**
     * List of all encodings supported by this class
     *
     * @var array
     */
    protected $supportedEncodings = array(
        'iso-8859-1',   'iso8859-1',    'iso-8859-5',   'iso8859-5',
        'iso-8859-15',  'iso8859-15',   'utf-8',        'cp866',
        'ibm866',       '866',          'cp1251',       'windows-1251',
        'win-1251',     '1251',         'cp1252',       'windows-1252',
        '1252',         'koi8-r',       'koi8-ru',      'koi8r',
        'big5',         '950',          'gb2312',       '936',
        'big5-hkscs',   'shift_jis',    'sjis',         'sjis-win',
        'cp932',        '932',          'euc-jp',       'eucjp',
        'eucjp-win',    'macroman'
    );

    /**
     * Constructor: Single parameter allows setting of global encoding for use by
     * the current object. If PHP 5.4 is detected, additional ENT_SUBSTITUTE flag
     * is set for htmlspecialchars() calls.
     *
     * @param string $encoding
     * @throws Exception\InvalidArgumentException
     */
    public function __construct($encoding = null)
    {
        if ($encoding !== null) {
            $encoding = (string) $encoding;
            if ($encoding === '') {
                throw new Exception\InvalidArgumentException(
                    get_class($this) . ' constructor parameter does not allow a blank value'
                );
            }

            $encoding = strtolower($encoding);
            // Strict comparison: a loose in_array() compares numeric strings
            // numerically, so values such as '866.0' or '+1252' would wrongly
            // be accepted as supported encodings.
            if (!in_array($encoding, $this->supportedEncodings, true)) {
                throw new Exception\InvalidArgumentException(
                    'Value of \'' . $encoding . '\' passed to ' . get_class($this)
                    . ' constructor parameter is invalid. Provide an encoding supported by htmlspecialchars()'
                );
            }

            $this->encoding = $encoding;
        }

        if (defined('ENT_SUBSTITUTE')) {
            $this->htmlSpecialCharsFlags |= ENT_SUBSTITUTE;
        }

        // set matcher callbacks
        $this->htmlAttrMatcher = array($this, 'htmlAttrMatcher');
        $this->jsMatcher       = array($this, 'jsMatcher');
        $this->cssMatcher      = array($this, 'cssMatcher');
    }

    /**
     * Return the encoding that all output/input is expected to be encoded in.
     *
     * @return string
     */
    public function getEncoding()
    {
        return $this->encoding;
    }

    /**
     * Escape a string for the HTML Body context where there are very few characters
     * of special meaning. Internally this will use htmlspecialchars().
     *
     * @param string $string
     * @return string
     */
    public function escapeHtml($string)
    {
        return htmlspecialchars($string, $this->htmlSpecialCharsFlags, $this->encoding);
    }

    /**
     * Escape a string for the HTML Attribute context. We use an extended set of characters
     * to escape that are not covered by htmlspecialchars() to cover cases where an attribute
     * might be unquoted or quoted illegally (e.g. backticks are valid quotes for IE).
     *
     * @param string $string
     * @throws Exception\RuntimeException if the input is not valid in the configured encoding
     * @return string
     */
    public function escapeHtmlAttr($string)
    {
        $string = $this->toUtf8($string);
        // Empty and purely numeric strings contain nothing that needs escaping.
        if ($string === '' || ctype_digit($string)) {
            return $string;
        }

        $result = preg_replace_callback('/[^a-z0-9,\.\-_]/iSu', $this->htmlAttrMatcher, $string);
        return $this->fromUtf8($result);
    }

    /**
     * Escape a string for the Javascript context. This does not use json_encode(). An extended
     * set of characters are escaped beyond ECMAScript's rules for Javascript literal string
     * escaping in order to prevent misinterpretation of Javascript as HTML leading to the
     * injection of special characters and entities. The escaping used should be tolerant
     * of cases where HTML escaping was not applied on top of Javascript escaping correctly.
     * Backslash escaping is not used as it still leaves the escaped character as-is and so
     * is not useful in a HTML context.
     *
     * @param string $string
     * @throws Exception\RuntimeException if the input is not valid in the configured encoding
     * @return string
     */
    public function escapeJs($string)
    {
        $string = $this->toUtf8($string);
        // Empty and purely numeric strings contain nothing that needs escaping.
        if ($string === '' || ctype_digit($string)) {
            return $string;
        }

        $result = preg_replace_callback('/[^a-z0-9,\._]/iSu', $this->jsMatcher, $string);
        return $this->fromUtf8($result);
    }

    /**
     * Escape a string for the URI or Parameter contexts. This should not be used to escape
     * an entire URI - only a subcomponent being inserted. The function is a simple proxy
     * to rawurlencode() which now implements RFC 3986 since PHP 5.3 completely.
     *
     * @param string $string
     * @return string
     */
    public function escapeUrl($string)
    {
        return rawurlencode($string);
    }

    /**
     * Escape a string for the CSS context. CSS escaping can be applied to any string being
     * inserted into CSS and escapes everything except alphanumerics.
     *
     * @param string $string
     * @throws Exception\RuntimeException if the input is not valid in the configured encoding
     * @return string
     */
    public function escapeCss($string)
    {
        $string = $this->toUtf8($string);
        // Empty and purely numeric strings contain nothing that needs escaping.
        if ($string === '' || ctype_digit($string)) {
            return $string;
        }

        $result = preg_replace_callback('/[^a-z0-9]/iSu', $this->cssMatcher, $string);
        return $this->fromUtf8($result);
    }

    /**
     * Callback function for preg_replace_callback that applies HTML Attribute
     * escaping to all matches.
     *
     * @param array $matches
     * @return string
     */
    protected function htmlAttrMatcher($matches)
    {
        $chr = $matches[0];
        // Work on the real Unicode code point rather than the first UTF-8 byte,
        // so that multi-byte characters are classified correctly below.
        $ord = $this->codePoint($chr);

        /**
         * The following replaces characters undefined in HTML with the
         * hex entity for the Unicode replacement character. The entity form
         * (rather than the raw character) is used so the result survives
         * conversion back to non-UTF-8 encodings.
         */
        if (($ord <= 0x1f && $chr !== "\t" && $chr !== "\n" && $chr !== "\r")
            || ($ord >= 0x7f && $ord <= 0x9f)
        ) {
            return '&#xFFFD;';
        }

        /**
         * Check if the current character to escape has a named entity we should
         * replace it with.
         */
        if (isset(static::$htmlNamedEntityMap[$ord])) {
            return '&' . static::$htmlNamedEntityMap[$ord] . ';';
        }

        /**
         * Per OWASP recommendations, we'll use upper hex entities
         * for any other characters where a named entity does not exist.
         */
        if ($ord > 255) {
            return sprintf('&#x%04X;', $ord);
        }
        return sprintf('&#x%02X;', $ord);
    }

    /**
     * Callback function for preg_replace_callback that applies Javascript
     * escaping to all matches.
     *
     * Single-byte characters become \xHH. Multi-byte characters become \uHHHH,
     * and characters outside the Basic Multilingual Plane are written as a
     * UTF-16 surrogate pair (\uHHHH\uHHHH), which is the only form a
     * Javascript string literal understands for them.
     *
     * @param array $matches
     * @return string
     */
    protected function jsMatcher($matches)
    {
        $chr = $matches[0];
        if (strlen($chr) === 1) {
            return sprintf('\\x%02X', ord($chr));
        }

        $hex = strtoupper(bin2hex($this->convertEncoding($chr, 'UTF-16BE', 'UTF-8')));
        if (strlen($hex) <= 4) {
            return sprintf('\\u%04s', $hex);
        }

        // Eight hex digits: a high/low surrogate pair, emitted as two escapes.
        return sprintf('\\u%04s\\u%04s', substr($hex, 0, 4), substr($hex, 4, 4));
    }

    /**
     * Callback function for preg_replace_callback that applies CSS
     * escaping to all matches.
     *
     * @param array $matches
     * @return string
     */
    protected function cssMatcher($matches)
    {
        // The trailing space terminates the CSS hex escape.
        return sprintf('\\%X ', $this->codePoint($matches[0]));
    }

    /**
     * Returns the Unicode code point of a single UTF-8 encoded character.
     *
     * Single-byte characters are read directly. Multi-byte characters are
     * converted to UTF-32BE, whose four bytes are the code point itself; this
     * keeps characters above U+FFFF intact, unlike UTF-16 where they become a
     * surrogate pair.
     *
     * @param string $chr One UTF-8 character
     * @throws Exception\RuntimeException if no conversion extension is available
     * @return int
     */
    private function codePoint($chr)
    {
        if (strlen($chr) === 1) {
            return ord($chr);
        }

        return (int) hexdec(bin2hex($this->convertEncoding($chr, 'UTF-32BE', 'UTF-8')));
    }

    /**
     * Converts a string to UTF-8 from the base encoding. The base encoding is set via this
     * class' constructor.
     *
     * @param string $string
     * @throws Exception\RuntimeException
     * @return string
     */
    protected function toUtf8($string)
    {
        if ($this->getEncoding() === 'utf-8') {
            $result = $string;
        } else {
            $result = $this->convertEncoding($string, 'UTF-8', $this->getEncoding());
        }

        if (!$this->isUtf8($result)) {
            throw new Exception\RuntimeException(
                sprintf('String to be escaped was not valid UTF-8 or could not be converted: %s', $result)
            );
        }

        return $result;
    }

    /**
     * Converts a string from UTF-8 to the base encoding. The base encoding is set via this
     * class' constructor.
     *
     * @param string $string
     * @return string
     */
    protected function fromUtf8($string)
    {
        if ($this->getEncoding() === 'utf-8') {
            return $string;
        }

        return $this->convertEncoding($string, $this->getEncoding(), 'UTF-8');
    }

    /**
     * Checks if a given string appears to be valid UTF-8 or not.
     *
     * @param string $string
     * @return bool
     */
    protected function isUtf8($string)
    {
        // With the /u modifier preg_match() fails (returns false) on invalid
        // UTF-8; compare against 1 so the documented bool is actually returned.
        return $string === '' || preg_match('/^./su', $string) === 1;
    }

    /**
     * Encoding conversion helper which wraps iconv and mbstring where they exist or throws
     * an exception where neither is available.
     *
     * @param string $string
     * @param string $to
     * @param array|string $from
     * @throws Exception\RuntimeException
     * @return string
     */
    protected function convertEncoding($string, $to, $from)
    {
        if (function_exists('iconv')) {
            $result = iconv($from, $to, $string);
        } elseif (function_exists('mb_convert_encoding')) {
            $result = mb_convert_encoding($string, $to, $from);
        } else {
            throw new Exception\RuntimeException(
                get_class($this)
                . ' requires either the iconv or mbstring extension to be installed'
                . ' when escaping for non UTF-8 strings.'
            );
        }

        if ($result === false) {
            return ''; // return non-fatal blank string on encoding errors from users
        }
        return $result;
    }
}