<?php

/**
 * @see       https://github.com/laminas/laminas-escaper for the canonical source repository
 * @copyright https://github.com/laminas/laminas-escaper/blob/master/COPYRIGHT.md
 * @license   https://github.com/laminas/laminas-escaper/blob/master/LICENSE.md New BSD License
 */

namespace Laminas\Escaper;

/**
 * Context specific methods for use in secure output escaping
 */
class Escaper
{
    /**
     * Entity Map mapping Unicode codepoints to any available named HTML entities.
     *
     * While HTML supports far more named entities, the lowest common denominator
     * has become HTML5's XML Serialisation which is restricted to those named
     * entities that XML supports. Using HTML entities would result in this error:
     *     XML Parsing Error: undefined entity
     *
     * @var array
     */
    protected static $htmlNamedEntityMap = [
        34 => 'quot', // quotation mark
        38 => 'amp',  // ampersand
        60 => 'lt',   // less-than sign
        62 => 'gt',   // greater-than sign
    ];

    /**
     * Current encoding for escaping. If not UTF-8, we convert strings from this encoding
     * pre-escaping and back to this encoding post-escaping.
     *
     * @var string
     */
    protected $encoding = 'utf-8';

    /**
     * Holds the value of the special flags passed as second parameter to
     * htmlspecialchars().
     *
     * @var int
     */
    protected $htmlSpecialCharsFlags;

    /**
     * Static Matcher which escapes characters for HTML Attribute contexts
     *
     * @var callable
     */
    protected $htmlAttrMatcher;

    /**
     * Static Matcher which escapes characters for Javascript contexts
     *
     * @var callable
     */
    protected $jsMatcher;

    /**
     * Static Matcher which escapes characters for CSS Attribute contexts
     *
     * @var callable
     */
    protected $cssMatcher;

    /**
     * List of all encodings supported by this class
     *
     * @var array
     */
    protected $supportedEncodings = [
        'iso-8859-1',   'iso8859-1',    'iso-8859-5',   'iso8859-5',
        'iso-8859-15',  'iso8859-15',   'utf-8',        'cp866',
        'ibm866',       '866',          'cp1251',       'windows-1251',
        'win-1251',     '1251',         'cp1252',       'windows-1252',
        '1252',         'koi8-r',       'koi8-ru',      'koi8r',
        'big5',         '950',          'gb2312',       '936',
        'big5-hkscs',   'shift_jis',    'sjis',         'sjis-win',
        'cp932',        '932',          'euc-jp',       'eucjp',
        'eucjp-win',    'macroman',
    ];

    /**
     * Constructor: Single parameter allows setting of global encoding for use by
     * the current object.
     *
     * The encoding is lower-cased and must be one of $supportedEncodings; when
     * omitted (null) the default 'utf-8' is kept.
     *
     * @param string|null $encoding
     * @throws Exception\InvalidArgumentException If the encoding is not a string,
     *     is blank, or is not in the supported list.
     */
    public function __construct($encoding = null)
    {
        if ($encoding !== null) {
            if (! is_string($encoding)) {
                throw new Exception\InvalidArgumentException(
                    get_class($this) . ' constructor parameter must be a string, received ' . gettype($encoding)
                );
            }
            if ($encoding === '') {
                throw new Exception\InvalidArgumentException(
                    get_class($this) . ' constructor parameter does not allow a blank value'
                );
            }

            $encoding = strtolower($encoding);
            // Strict comparison is required: several supported names are numeric
            // strings ('866', '950', ...), and a loose in_array() would accept
            // any numerically-equal spelling such as '866.0' or '9.5e2'.
            if (! in_array($encoding, $this->supportedEncodings, true)) {
                throw new Exception\InvalidArgumentException(
                    'Value of \'' . $encoding . '\' passed to ' . get_class($this)
                    . ' constructor parameter is invalid. Provide an encoding supported by htmlspecialchars()'
                );
            }

            $this->encoding = $encoding;
        }

        // We take advantage of ENT_SUBSTITUTE flag to correctly deal with invalid UTF-8 sequences.
        $this->htmlSpecialCharsFlags = ENT_QUOTES | ENT_SUBSTITUTE;

        // set matcher callbacks
        $this->htmlAttrMatcher = [$this, 'htmlAttrMatcher'];
        $this->jsMatcher       = [$this, 'jsMatcher'];
        $this->cssMatcher      = [$this, 'cssMatcher'];
    }

    /**
     * Return the encoding that all output/input is expected to be encoded in.
     *
     * @return string
     */
    public function getEncoding()
    {
        return $this->encoding;
    }

    /**
     * Escape a string for the HTML Body context where there are very few characters
     * of special meaning. Internally this will use htmlspecialchars().
     *
     * @param string $string
     * @return string
     */
    public function escapeHtml($string)
    {
        return htmlspecialchars($string, $this->htmlSpecialCharsFlags, $this->encoding);
    }

    /**
     * Escape a string for the HTML Attribute context. We use an extended set of characters
     * to escape that are not covered by htmlspecialchars() to cover cases where an attribute
     * might be unquoted or quoted illegally (e.g. backticks are valid quotes for IE).
     *
     * @param string $string
     * @throws Exception\RuntimeException If the input is not valid in the configured encoding.
     * @return string
     */
    public function escapeHtmlAttr($string)
    {
        $string = $this->toUtf8($string);
        // Empty and all-digit strings contain nothing to escape.
        if ($string === '' || ctype_digit($string)) {
            return $string;
        }

        $result = preg_replace_callback('/[^a-z0-9,\.\-_]/iSu', $this->htmlAttrMatcher, $string);
        return $this->fromUtf8($result);
    }

    /**
     * Escape a string for the Javascript context. This does not use json_encode(). An extended
     * set of characters are escaped beyond ECMAScript's rules for Javascript literal string
     * escaping in order to prevent misinterpretation of Javascript as HTML leading to the
     * injection of special characters and entities. The escaping used should be tolerant
     * of cases where HTML escaping was not applied on top of Javascript escaping correctly.
     * Backslash escaping is not used as it still leaves the escaped character as-is and so
     * is not useful in a HTML context.
     *
     * @param string $string
     * @throws Exception\RuntimeException If the input is not valid in the configured encoding.
     * @return string
     */
    public function escapeJs($string)
    {
        $string = $this->toUtf8($string);
        // Empty and all-digit strings contain nothing to escape.
        if ($string === '' || ctype_digit($string)) {
            return $string;
        }

        $result = preg_replace_callback('/[^a-z0-9,\._]/iSu', $this->jsMatcher, $string);
        return $this->fromUtf8($result);
    }

    /**
     * Escape a string for the URI or Parameter contexts. This should not be used to escape
     * an entire URI - only a subcomponent being inserted. The function is a simple proxy
     * to rawurlencode() which now implements RFC 3986 since PHP 5.3 completely.
     *
     * @param string $string
     * @return string
     */
    public function escapeUrl($string)
    {
        return rawurlencode($string);
    }

    /**
     * Escape a string for the CSS context. CSS escaping can be applied to any string being
     * inserted into CSS and escapes everything except alphanumerics.
     *
     * @param string $string
     * @throws Exception\RuntimeException If the input is not valid in the configured encoding.
     * @return string
     */
    public function escapeCss($string)
    {
        $string = $this->toUtf8($string);
        // Empty and all-digit strings contain nothing to escape.
        if ($string === '' || ctype_digit($string)) {
            return $string;
        }

        $result = preg_replace_callback('/[^a-z0-9]/iSu', $this->cssMatcher, $string);
        return $this->fromUtf8($result);
    }

    /**
     * Callback function for preg_replace_callback that applies HTML Attribute
     * escaping to all matches.
     *
     * @param array $matches Match array; element 0 is the single UTF-8 character to escape.
     * @return string
     */
    protected function htmlAttrMatcher($matches)
    {
        $chr = $matches[0];
        $ord = ord($chr); // first byte only

        /**
         * The following replaces characters undefined in HTML with the
         * hex entity for the Unicode replacement character.
         *
         * The entity (rather than a literal U+FFFD character) is returned so
         * the output stays pure ASCII and survives conversion back to any
         * configured target encoding in fromUtf8().
         *
         * NOTE(review): ord() only sees the lead byte, so for multi-byte
         * matches the 0x7f-0x9f test cannot fire; code points U+0080-U+009F
         * fall through to the numeric-entity path below. Confirm whether
         * that is intended.
         */
        if (
            ($ord <= 0x1f && $chr !== "\t" && $chr !== "\n" && $chr !== "\r")
            || ($ord >= 0x7f && $ord <= 0x9f)
        ) {
            return '&#xFFFD;';
        }

        /**
         * Check if the current character to escape has a name entity we should
         * replace it with while grabbing the integer value of the character.
         */
        if (strlen($chr) > 1) {
            // UTF-32BE bytes read as one big-endian integer are the code point.
            $chr = $this->convertEncoding($chr, 'UTF-32BE', 'UTF-8');
        }

        $hex = bin2hex($chr);
        $ord = hexdec($hex);
        if (isset(static::$htmlNamedEntityMap[$ord])) {
            return '&' . static::$htmlNamedEntityMap[$ord] . ';';
        }

        /**
         * Per OWASP recommendations, we'll use upper hex entities
         * for any other characters where a named entity does not exist.
         */
        if ($ord > 255) {
            return sprintf('&#x%04X;', $ord);
        }
        return sprintf('&#x%02X;', $ord);
    }

    /**
     * Callback function for preg_replace_callback that applies Javascript
     * escaping to all matches.
     *
     * Single-byte characters become \xHH; everything else is converted to
     * UTF-16BE and emitted as one \uHHHH unit, or two when the conversion
     * yields more than four hex digits (a surrogate pair).
     *
     * @param array $matches Match array; element 0 is the single UTF-8 character to escape.
     * @return string
     */
    protected function jsMatcher($matches)
    {
        $chr = $matches[0];
        if (strlen($chr) === 1) {
            return sprintf('\\x%02X', ord($chr));
        }

        $chr = $this->convertEncoding($chr, 'UTF-16BE', 'UTF-8');
        $hex = strtoupper(bin2hex($chr));
        if (strlen($hex) <= 4) {
            return sprintf('\\u%04s', $hex);
        }

        $highSurrogate = substr($hex, 0, 4);
        $lowSurrogate  = substr($hex, 4, 4);
        return sprintf('\\u%04s\\u%04s', $highSurrogate, $lowSurrogate);
    }

    /**
     * Callback function for preg_replace_callback that applies CSS
     * escaping to all matches.
     *
     * The result is a backslash, the upper-case hex code point, and a
     * trailing space.
     *
     * @param array $matches Match array; element 0 is the single UTF-8 character to escape.
     * @return string
     */
    protected function cssMatcher($matches)
    {
        $chr = $matches[0];
        if (strlen($chr) === 1) {
            $ord = ord($chr);
        } else {
            // UTF-32BE bytes read as one big-endian integer are the code point.
            $chr = $this->convertEncoding($chr, 'UTF-32BE', 'UTF-8');
            $ord = hexdec(bin2hex($chr));
        }
        return sprintf('\\%X ', $ord);
    }

    /**
     * Converts a string to UTF-8 from the base encoding. The base encoding is set via this
     * class' constructor.
     *
     * @param string $string
     * @throws Exception\RuntimeException If the (converted) string is not valid UTF-8.
     * @return string
     */
    protected function toUtf8($string)
    {
        if ($this->getEncoding() === 'utf-8') {
            $result = $string;
        } else {
            $result = $this->convertEncoding($string, 'UTF-8', $this->getEncoding());
        }

        if (! $this->isUtf8($result)) {
            throw new Exception\RuntimeException(
                sprintf('String to be escaped was not valid UTF-8 or could not be converted: %s', $result)
            );
        }

        return $result;
    }

    /**
     * Converts a string from UTF-8 to the base encoding. The base encoding is set via this
     * class' constructor.
     *
     * @param string $string
     * @return string
     */
    protected function fromUtf8($string)
    {
        if ($this->getEncoding() === 'utf-8') {
            return $string;
        }

        return $this->convertEncoding($string, $this->getEncoding(), 'UTF-8');
    }

    /**
     * Checks if a given string appears to be valid UTF-8 or not.
     *
     * @param string $string
     * @return bool
     */
    protected function isUtf8($string)
    {
        // preg_match() returns 1 on a match, 0 on no match and false on error;
        // compare against 1 so the documented bool is what is returned.
        return $string === '' || preg_match('/^./su', $string) === 1;
    }

    /**
     * Encoding conversion helper which wraps iconv and mbstring where they exist or throws
     * an exception where neither is available.
     *
     * @param string $string
     * @param string $to
     * @param array|string $from
     * @throws Exception\RuntimeException If neither iconv nor mbstring is available.
     * @return string The converted string, or '' when the conversion itself fails.
     */
    protected function convertEncoding($string, $to, $from)
    {
        if (function_exists('iconv')) {
            $result = iconv($from, $to, $string);
        } elseif (function_exists('mb_convert_encoding')) {
            $result = mb_convert_encoding($string, $to, $from);
        } else {
            throw new Exception\RuntimeException(
                get_class($this)
                . ' requires either the iconv or mbstring extension to be installed'
                . ' when escaping for non UTF-8 strings.'
            );
        }

        if ($result === false) {
            return ''; // return non-fatal blank string on encoding errors from users
        }
        return $result;
    }
}