1<?php 2/** 3 * HTML sanitizer for %MediaWiki. 4 * 5 * Copyright © 2002-2005 Brion Vibber <brion@pobox.com> et al 6 * https://www.mediawiki.org/ 7 * 8 * This program is free software; you can redistribute it and/or modify 9 * it under the terms of the GNU General Public License as published by 10 * the Free Software Foundation; either version 2 of the License, or 11 * (at your option) any later version. 12 * 13 * This program is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 * GNU General Public License for more details. 17 * 18 * You should have received a copy of the GNU General Public License along 19 * with this program; if not, write to the Free Software Foundation, Inc., 20 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 21 * http://www.gnu.org/copyleft/gpl.html 22 * 23 * @file 24 * @ingroup Parser 25 */ 26 27use MediaWiki\MediaWikiServices; 28 29/** 30 * HTML sanitizer for MediaWiki 31 * @ingroup Parser 32 */ 33class Sanitizer { 34 /** 35 * Regular expression to match various types of character references in 36 * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences 37 */ 38 private const CHAR_REFS_REGEX = 39 '/&([A-Za-z0-9\x80-\xff]+); 40 |&\#([0-9]+); 41 |&\#[xX]([0-9A-Fa-f]+); 42 |(&)/x'; 43 44 /** 45 * Acceptable tag name charset from HTML5 parsing spec 46 * https://www.w3.org/TR/html5/syntax.html#tag-open-state 47 */ 48 private const ELEMENT_BITS_REGEX = '!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!'; 49 50 /** 51 * Pattern matching evil uris like javascript: 52 * WARNING: DO NOT use this in any place that actually requires denying 53 * certain URIs for security reasons. 
There are NUMEROUS[1] ways to bypass 54 * pattern-based deny lists; the only way to be secure from javascript: 55 * uri based xss vectors is to allow only things that you know are safe 56 * and deny everything else. 57 * [1]: http://ha.ckers.org/xss.html 58 */ 59 private const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i'; 60 private const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/"; 61 62 /** 63 * Tells escapeUrlForHtml() to encode the ID using the wiki's primary encoding. 64 * 65 * @since 1.30 66 */ 67 public const ID_PRIMARY = 0; 68 69 /** 70 * Tells escapeUrlForHtml() to encode the ID using the fallback encoding, or return false 71 * if no fallback is configured. 72 * 73 * @since 1.30 74 */ 75 public const ID_FALLBACK = 1; 76 77 /** 78 * List of all named character entities defined in HTML 4.01 79 * https://www.w3.org/TR/html4/sgml/entities.html 80 * As well as ' which is only defined starting in XHTML1. 81 */ 82 private const HTML_ENTITIES = [ 83 'Aacute' => 193, 84 'aacute' => 225, 85 'Acirc' => 194, 86 'acirc' => 226, 87 'acute' => 180, 88 'AElig' => 198, 89 'aelig' => 230, 90 'Agrave' => 192, 91 'agrave' => 224, 92 'alefsym' => 8501, 93 'Alpha' => 913, 94 'alpha' => 945, 95 'amp' => 38, 96 'and' => 8743, 97 'ang' => 8736, 98 'apos' => 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE. 
99 'Aring' => 197, 100 'aring' => 229, 101 'asymp' => 8776, 102 'Atilde' => 195, 103 'atilde' => 227, 104 'Auml' => 196, 105 'auml' => 228, 106 'bdquo' => 8222, 107 'Beta' => 914, 108 'beta' => 946, 109 'brvbar' => 166, 110 'bull' => 8226, 111 'cap' => 8745, 112 'Ccedil' => 199, 113 'ccedil' => 231, 114 'cedil' => 184, 115 'cent' => 162, 116 'Chi' => 935, 117 'chi' => 967, 118 'circ' => 710, 119 'clubs' => 9827, 120 'cong' => 8773, 121 'copy' => 169, 122 'crarr' => 8629, 123 'cup' => 8746, 124 'curren' => 164, 125 'dagger' => 8224, 126 'Dagger' => 8225, 127 'darr' => 8595, 128 'dArr' => 8659, 129 'deg' => 176, 130 'Delta' => 916, 131 'delta' => 948, 132 'diams' => 9830, 133 'divide' => 247, 134 'Eacute' => 201, 135 'eacute' => 233, 136 'Ecirc' => 202, 137 'ecirc' => 234, 138 'Egrave' => 200, 139 'egrave' => 232, 140 'empty' => 8709, 141 'emsp' => 8195, 142 'ensp' => 8194, 143 'Epsilon' => 917, 144 'epsilon' => 949, 145 'equiv' => 8801, 146 'Eta' => 919, 147 'eta' => 951, 148 'ETH' => 208, 149 'eth' => 240, 150 'Euml' => 203, 151 'euml' => 235, 152 'euro' => 8364, 153 'exist' => 8707, 154 'fnof' => 402, 155 'forall' => 8704, 156 'frac12' => 189, 157 'frac14' => 188, 158 'frac34' => 190, 159 'frasl' => 8260, 160 'Gamma' => 915, 161 'gamma' => 947, 162 'ge' => 8805, 163 'gt' => 62, 164 'harr' => 8596, 165 'hArr' => 8660, 166 'hearts' => 9829, 167 'hellip' => 8230, 168 'Iacute' => 205, 169 'iacute' => 237, 170 'Icirc' => 206, 171 'icirc' => 238, 172 'iexcl' => 161, 173 'Igrave' => 204, 174 'igrave' => 236, 175 'image' => 8465, 176 'infin' => 8734, 177 'int' => 8747, 178 'Iota' => 921, 179 'iota' => 953, 180 'iquest' => 191, 181 'isin' => 8712, 182 'Iuml' => 207, 183 'iuml' => 239, 184 'Kappa' => 922, 185 'kappa' => 954, 186 'Lambda' => 923, 187 'lambda' => 955, 188 'lang' => 9001, 189 'laquo' => 171, 190 'larr' => 8592, 191 'lArr' => 8656, 192 'lceil' => 8968, 193 'ldquo' => 8220, 194 'le' => 8804, 195 'lfloor' => 8970, 196 'lowast' => 8727, 197 'loz' => 9674, 198 
'lrm' => 8206, 199 'lsaquo' => 8249, 200 'lsquo' => 8216, 201 'lt' => 60, 202 'macr' => 175, 203 'mdash' => 8212, 204 'micro' => 181, 205 'middot' => 183, 206 'minus' => 8722, 207 'Mu' => 924, 208 'mu' => 956, 209 'nabla' => 8711, 210 'nbsp' => 160, 211 'ndash' => 8211, 212 'ne' => 8800, 213 'ni' => 8715, 214 'not' => 172, 215 'notin' => 8713, 216 'nsub' => 8836, 217 'Ntilde' => 209, 218 'ntilde' => 241, 219 'Nu' => 925, 220 'nu' => 957, 221 'Oacute' => 211, 222 'oacute' => 243, 223 'Ocirc' => 212, 224 'ocirc' => 244, 225 'OElig' => 338, 226 'oelig' => 339, 227 'Ograve' => 210, 228 'ograve' => 242, 229 'oline' => 8254, 230 'Omega' => 937, 231 'omega' => 969, 232 'Omicron' => 927, 233 'omicron' => 959, 234 'oplus' => 8853, 235 'or' => 8744, 236 'ordf' => 170, 237 'ordm' => 186, 238 'Oslash' => 216, 239 'oslash' => 248, 240 'Otilde' => 213, 241 'otilde' => 245, 242 'otimes' => 8855, 243 'Ouml' => 214, 244 'ouml' => 246, 245 'para' => 182, 246 'part' => 8706, 247 'permil' => 8240, 248 'perp' => 8869, 249 'Phi' => 934, 250 'phi' => 966, 251 'Pi' => 928, 252 'pi' => 960, 253 'piv' => 982, 254 'plusmn' => 177, 255 'pound' => 163, 256 'prime' => 8242, 257 'Prime' => 8243, 258 'prod' => 8719, 259 'prop' => 8733, 260 'Psi' => 936, 261 'psi' => 968, 262 'quot' => 34, 263 'radic' => 8730, 264 'rang' => 9002, 265 'raquo' => 187, 266 'rarr' => 8594, 267 'rArr' => 8658, 268 'rceil' => 8969, 269 'rdquo' => 8221, 270 'real' => 8476, 271 'reg' => 174, 272 'rfloor' => 8971, 273 'Rho' => 929, 274 'rho' => 961, 275 'rlm' => 8207, 276 'rsaquo' => 8250, 277 'rsquo' => 8217, 278 'sbquo' => 8218, 279 'Scaron' => 352, 280 'scaron' => 353, 281 'sdot' => 8901, 282 'sect' => 167, 283 'shy' => 173, 284 'Sigma' => 931, 285 'sigma' => 963, 286 'sigmaf' => 962, 287 'sim' => 8764, 288 'spades' => 9824, 289 'sub' => 8834, 290 'sube' => 8838, 291 'sum' => 8721, 292 'sup' => 8835, 293 'sup1' => 185, 294 'sup2' => 178, 295 'sup3' => 179, 296 'supe' => 8839, 297 'szlig' => 223, 298 'Tau' => 932, 299 
'tau' => 964,
		'there4' => 8756,
		'Theta' => 920,
		'theta' => 952,
		'thetasym' => 977,
		'thinsp' => 8201,
		'THORN' => 222,
		'thorn' => 254,
		'tilde' => 732,
		'times' => 215,
		'trade' => 8482,
		'Uacute' => 218,
		'uacute' => 250,
		'uarr' => 8593,
		'uArr' => 8657,
		'Ucirc' => 219,
		'ucirc' => 251,
		'Ugrave' => 217,
		'ugrave' => 249,
		'uml' => 168,
		'upsih' => 978,
		'Upsilon' => 933,
		'upsilon' => 965,
		'Uuml' => 220,
		'uuml' => 252,
		'weierp' => 8472,
		'Xi' => 926,
		'xi' => 958,
		'Yacute' => 221,
		'yacute' => 253,
		'yen' => 165,
		'Yuml' => 376,
		'yuml' => 255,
		'Zeta' => 918,
		'zeta' => 950,
		'zwj' => 8205,
		'zwnj' => 8204
	];

	/**
	 * Character entity aliases accepted by MediaWiki
	 */
	private const HTML_ENTITY_ALIASES = [
		'רלמ' => 'rlm',
		'رلم' => 'rlm',
	];

	/**
	 * Lazy-initialised attributes regex, see getAttribsRegex()
	 */
	private static $attribsRegex;

	/**
	 * Regular expression to match HTML/XML attribute pairs within a tag.
	 * Based on https://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
	 * Used in Sanitizer::decodeTagAttributes
	 *
	 * The pattern is built once and memoised in self::$attribsRegex.
	 *
	 * @return string
	 */
	private static function getAttribsRegex() {
		if ( self::$attribsRegex === null ) {
			$spaceChars = '\x09\x0a\x0c\x0d\x20';
			$space = "[{$spaceChars}]";
			$attrib = "[^{$spaceChars}\/>=]";
			// A name may start with "=", but may not contain one afterwards
			$attribFirst = "(?:{$attrib}|=)";
			// NB: the pattern uses the x modifier, so the "#" comment below
			// must stay on a line of its own.
			self::$attribsRegex =
				"/({$attribFirst}{$attrib}*)
					($space*=$space*
					(?:
						# The attribute value: quoted or alone
						\"([^\"]*)(?:\"|\$)
						| '([^']*)(?:'|\$)
						| (((?!$space|>).)*)
					)
				)?/sxu";
		}
		return self::$attribsRegex;
	}

	/**
	 * Lazy-initialised attribute name regex, see getAttribNameRegex()
	 */
	private static $attribNameRegex;

	/**
	 * Used in Sanitizer::decodeTagAttributes to filter attributes.
	 *
	 * The pattern is built once and memoised in self::$attribNameRegex.
	 *
	 * @return string
	 */
	private static function getAttribNameRegex() {
		if ( self::$attribNameRegex === null ) {
			$attribFirst = "[:_\p{L}\p{N}]";
			$attrib = "[:_\.\-\p{L}\p{N}]";
			self::$attribNameRegex = "/^({$attribFirst}{$attrib}*)$/sxu";
		}
		return self::$attribNameRegex;
	}

	/**
	 * Return the various lists of recognized tags
	 *
	 * Every list in the result is a lookup table keyed by tag name (built with
	 * array_flip()), so membership is tested with isset().
	 *
	 * @param array $extratags For any extra tags to include
	 * @param array $removetags For any tags (default or extra) to exclude
	 * @return array Map with the keys 'htmlpairs', 'htmlsingle', 'htmlsingleonly',
	 *   'htmlnest', 'tabletags', 'htmllist', 'listtags', 'htmlsingleallowed'
	 *   and 'htmlelements'
	 */
	public static function getRecognizedTagData( $extratags = [], $removetags = [] ) {
		global $wgAllowImageTag;

		static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
			$htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;

		// The cached tables depend on $wgAllowImageTag, so the cache is keyed on
		// its boolean state and rebuilt if the global changes (e.g. in tests).
		// $staticInitialised is null until the first build; comparing strictly
		// keeps the cache valid when the setting is false, which a plain
		// truthiness check on the stored value would not.
		$globalContext = (bool)$wgAllowImageTag;
		if ( $staticInitialised !== $globalContext ) {
			$htmlpairsStatic = [ # Tags that must be closed
				'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
				'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
				'strike', 'strong', 'tt', 'var', 'div', 'center',
				'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
				'ruby', 'rb', 'rp', 'rt', 'rtc', 'p', 'span', 'abbr', 'dfn',
				'kbd', 'samp', 'data', 'time', 'mark'
			];
			$htmlsingle = [
				'br', 'wbr', 'hr', 'li', 'dt', 'dd', 'meta', 'link'
			];

			# Elements that cannot have close tags. This is (not coincidentally)
			# also the list of tags for which the HTML 5 parsing algorithm
			# requires you to "acknowledge the token's self-closing flag", i.e.
			# a self-closing tag like <br/> is not an HTML 5 parse error only
			# for this list.
			$htmlsingleonly = [
				'br', 'wbr', 'hr', 'meta', 'link'
			];

			$htmlnest = [ # Tags that can be nested--??
				'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
				'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
				'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
			];
			$tabletags = [ # Can only appear inside table, we will close them
				'td', 'th', 'tr',
			];
			$htmllist = [ # Tags used by list
				'ul', 'ol',
			];
			$listtags = [ # Tags that can appear in a list
				'li',
			];

			if ( $globalContext ) {
				wfDeprecatedMsg( 'Setting $wgAllowImageTag to true ' .
					'is deprecated since MediaWiki 1.35', '1.35', false, false );
				$htmlsingle[] = 'img';
				$htmlsingleonly[] = 'img';
			}

			$htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
			$htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );

			# Convert them all to hashtables for faster lookup
			$htmlpairsStatic = array_flip( $htmlpairsStatic );
			$htmlsingle = array_flip( $htmlsingle );
			$htmlsingleonly = array_flip( $htmlsingleonly );
			$htmlnest = array_flip( $htmlnest );
			$tabletags = array_flip( $tabletags );
			$htmllist = array_flip( $htmllist );
			$listtags = array_flip( $listtags );
			$htmlsingleallowed = array_flip( $htmlsingleallowed );
			$htmlelementsStatic = array_flip( $htmlelementsStatic );

			$staticInitialised = $globalContext;
		}

		# Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
		$extratags = array_flip( $extratags );
		$removetags = array_flip( $removetags );
		$htmlpairs = array_merge( $extratags, $htmlpairsStatic );
		$htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );

		return [
			'htmlpairs' => $htmlpairs,
			'htmlsingle' => $htmlsingle,
			'htmlsingleonly' => $htmlsingleonly,
			'htmlnest' => $htmlnest,
			'tabletags' => $tabletags,
			'htmllist' => $htmllist,
			'listtags' => $listtags,
			'htmlsingleallowed' => $htmlsingleallowed,
			'htmlelements' => $htmlelements,
		];
	}

	/**
	 *
Cleans up HTML, removes dangerous tags and attributes, and
	 * removes HTML comments
	 *
	 * The text is split on '<'. A piece that parses as a tag whose name is in
	 * the recognized-element list is re-emitted with sanitized attributes;
	 * any other '<' is emitted as &lt;, and every literal '>' in text content
	 * is emitted as &gt;.
	 *
	 * @param string $text
	 * @param callable|null $processCallback Callback to do any variable or parameter
	 *   replacements in HTML attribute values
	 * @param array|bool $args Arguments for the processing callback
	 * @param array $extratags For any extra tags to include
	 * @param array $removetags For any tags (default or extra) to exclude
	 * @return string
	 */
	public static function removeHTMLtags( $text, $processCallback = null,
		$args = [], $extratags = [], $removetags = []
	) {
		$tagData = self::getRecognizedTagData( $extratags, $removetags );
		$htmlsingle = $tagData['htmlsingle'];
		$htmlsingleonly = $tagData['htmlsingleonly'];
		$htmlelements = $tagData['htmlelements'];

		# Remove HTML comments
		$text = self::removeHTMLcomments( $text );
		$bits = explode( '<', $text );
		// Everything before the first '<' is plain text
		$text = str_replace( '>', '&gt;', array_shift( $bits ) );

		# this might be possible using remex tidy itself
		foreach ( $bits as $x ) {
			if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
				list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;

				$t = strtolower( $t );
				if ( isset( $htmlelements[$t] ) ) {
					if ( is_callable( $processCallback ) ) {
						// $params is passed by reference so the callback can rewrite it
						call_user_func_array( $processCallback, [ &$params, $args ] );
					}

					if ( $brace === '/>' && !( isset( $htmlsingle[$t] ) || isset( $htmlsingleonly[$t] ) ) ) {
						// Remove the self-closing slash, to be consistent
						// with HTML5 semantics. T134423
						$brace = '>';
					}

					if ( self::validateTag( $params, $t ) ) {
						$newparams = self::fixTagAttributes( $params, $t );
						if ( $brace === '/>' && !isset( $htmlsingleonly[$t] ) ) {
							# Interpret self-closing tags as empty tags even when
							# HTML 5 would interpret them as start tags. Such input
							# is commonly seen on Wikimedia wikis with this intention.
							$brace = "></$t>";
						}

						$rest = str_replace( '>', '&gt;', $rest );
						$text .= "<$slash$t$newparams$brace$rest";
						continue;
					}
				}
			}
			// Not a tag we let through: neutralise the whole piece
			$text .= '&lt;' . str_replace( '>', '&gt;', $x );
		}
		return $text;
	}

	/**
	 * Remove '<!--', '-->', and everything between.
	 * To avoid leaving blank lines, when a comment is both preceded
	 * and followed by a newline (ignoring spaces), trim leading and
	 * trailing spaces and one of the newlines.
	 *
	 * An unterminated comment, and everything after it, is left in place.
	 *
	 * @param string $text
	 * @return string
	 */
	public static function removeHTMLcomments( $text ) {
		while ( ( $start = strpos( $text, '<!--' ) ) !== false ) {
			$end = strpos( $text, '-->', $start + 4 );
			if ( $end === false ) {
				# Unterminated comment; bail out
				break;
			}

			// Step past the closing '-->'
			$end += 3;

			# Trim space and newline if the comment is both
			# preceded and followed by a newline
			$spaceStart = max( $start - 1, 0 );
			$spaceLen = $end - $spaceStart;
			// Widen the span leftwards over spaces...
			while ( substr( $text, $spaceStart, 1 ) === ' ' && $spaceStart > 0 ) {
				$spaceStart--;
				$spaceLen++;
			}
			// ...and rightwards over spaces
			while ( substr( $text, $spaceStart + $spaceLen, 1 ) === ' ' ) {
				$spaceLen++;
			}
			if ( substr( $text, $spaceStart, 1 ) === "\n"
				&& substr( $text, $spaceStart + $spaceLen, 1 ) === "\n" ) {
				# Remove the comment, leading and trailing
				# spaces, and leave only one newline.
				$text = substr_replace( $text, "\n", $spaceStart, $spaceLen + 1 );
			} else {
				# Remove just the comment.
				$text = substr_replace( $text, '', $start, $end - $start );
			}
		}
		return $text;
	}

	/**
	 * Takes attribute names and values for a tag and the tag name and
	 * validates that the tag is allowed to be present.
	 * This DOES NOT validate the attributes, nor does it validate the
	 * tags themselves. This method only handles the special circumstances
	 * where we may want to allow a tag within content but ONLY when it has
	 * specific attributes set.
	 *
	 * @param string $params
	 * @param string $element
	 * @return bool
	 */
	private static function validateTag( $params, $element ) {
		$params = self::decodeTagAttributes( $params );

		if ( $element === 'meta' || $element === 'link' ) {
			if ( !isset( $params['itemprop'] ) ) {
				// <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content
				return false;
			}
			if ( $element === 'meta' && !isset( $params['content'] ) ) {
				// <meta> must have a content="" for the itemprop
				return false;
			}
			if ( $element === 'link' && !isset( $params['href'] ) ) {
				// <link> must have an associated href=""
				return false;
			}
		}

		return true;
	}

	/**
	 * Take an array of attribute names and values and normalize or discard
	 * illegal values for the given element type.
	 *
	 * - Discards attributes not allowed for the given element
	 * - Unsafe style attributes are discarded
	 * - Invalid id attributes are re-encoded
	 *
	 * @param array $attribs
	 * @param string $element
	 * @return array
	 *
	 * @todo Check for legal values where the DTD limits things.
* @todo Check for unique id attribute :P
	 */
	public static function validateTagAttributes( $attribs, $element ) {
		$allowedForElement = self::attributesAllowedInternal( $element );

		return self::validateAttributes( $attribs, $allowedForElement );
	}

	/**
	 * Filter a name => value attribute map, keeping only acceptable entries
	 * and normalizing the values of some of them.
	 *
	 * - Attributes that are neither on the given list nor data-* are dropped
	 * - Style values are passed through checkCss()
	 * - Id values and id reference lists are re-escaped
	 * - href/src/poster values must start with an allowed protocol
	 *
	 * @param array $attribs
	 * @param array $allowed List of allowed attribute names,
	 *   as an associative array where keys give valid attribute names
	 *   (since 1.34). Before 1.35, passing a sequential array of
	 *   valid attribute names was permitted but that is now deprecated.
	 * @return array
	 *
	 * @todo Check for legal values where the DTD limits things.
	 * @todo Check for unique id attribute :P
	 */
	public static function validateAttributes( $attribs, $allowed ) {
		if ( isset( $allowed[0] ) ) {
			// A sequential list of names was passed; this form is
			// deprecated, so warn and convert it to the keyed form.
			wfDeprecated( __METHOD__ . ' with sequential array', '1.35' );
			$allowed = array_flip( $allowed );
		}
		$hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';

		// Attributes whose value is a list of element ids
		$idListAttribs = [ 'aria-describedby', 'aria-flowto', 'aria-labelledby', 'aria-owns' ];
		// rel/rev, RDFa and HTML5 microdata attributes
		$linkedDataAttribs = [
			'rel', 'rev',
			'about', 'property', 'resource', 'datatype', 'typeof',
			'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype',
		];
		// Attributes whose value is a URL
		$urlAttribs = [ 'href', 'src', 'poster' ];

		$out = [];
		foreach ( $attribs as $attribute => $value ) {
			# XML namespace declarations are let through for RDFa,
			# unless the value looks like a script URI
			if ( preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
				if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
					$out[$attribute] = $value;
				}
				continue;
			}

			# Anything starting with "data-" is acceptable as long as it has
			# no colon (i.e. is not namespaced); everything else has to be
			# on the allowed list.
			$isPlainDataAttrib = preg_match( '/^data-[^:]*$/i', $attribute );
			if ( !$isPlainDataAttrib && !array_key_exists( $attribute, $allowed ) ) {
				continue;
			}
			# Data attributes used by MediaWiki code are never accepted
			if ( self::isReservedDataAttribute( $attribute ) ) {
				continue;
			}

			# Strip javascript "expression" from stylesheets.
			# https://msdn.microsoft.com/en-us/library/ms537634.aspx
			if ( $attribute == 'style' ) {
				$value = self::checkCss( $value );
			}

			# Escape HTML id attributes
			if ( $attribute === 'id' ) {
				$value = self::escapeIdForAttribute( $value, self::ID_PRIMARY );
			}

			# Escape HTML id reference lists
			if ( in_array( $attribute, $idListAttribs, true ) ) {
				$value = self::escapeIdReferenceList( $value );
			}

			// RDFa and microdata properties allow URLs, URIs and/or CURIs.
			// Paranoia: allow "simple" values but suppress javascript
			if ( in_array( $attribute, $linkedDataAttribs, true )
				&& preg_match( self::EVIL_URI_PATTERN, $value )
			) {
				continue;
			}

			# NOTE: even though elements using href/src are not allowed directly, supply
			# validation code that can be used by tag hook handlers, etc
			if ( in_array( $attribute, $urlAttribs, true ) && !preg_match( $hrefExp, $value ) ) {
				// Not using an allowed protocol.
				// NOTE: this also drops all relative URLs
				continue;
			}

			// Only allow tabindex of 0, which is useful for accessibility.
			if ( $attribute === 'tabindex' && $value !== '0' ) {
				continue;
			}

			// A later occurrence of the same name replaces an earlier one,
			// so the output has at most one attribute of each name.
			$out[$attribute] = $value;
		}

		# itemtype, itemid, itemref don't make sense without itemscope
		if ( !array_key_exists( 'itemscope', $out ) ) {
			unset( $out['itemtype'], $out['itemid'], $out['itemref'] );
		}
		# TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.

		return $out;
	}

	/**
	 * Tell whether an attribute name is a reserved data attribute, i.e. one
	 * starting with data-ooui, data-mw or data-parsoid (case-insensitive).
	 * Such names are unavailable to user-generated HTML so MediaWiki core and
	 * extension code can safely use them to communicate with frontend code.
	 *
	 * @param string $attr Attribute name.
	 * @return bool
	 */
	public static function isReservedDataAttribute( $attr ) {
		// data-ooui is reserved for ooui.
		// data-mw and data-parsoid are reserved for parsoid.
		// data-mw-<name here> is reserved for extensions (or core) if
		// they need to communicate some data to the client and want to be
		// sure that it isn't coming from an untrusted user.
		// We ignore the possibility of namespaces since user-generated HTML
		// can't use them anymore.
		$matched = preg_match( '/^data-(ooui|mw|parsoid)/i', $attr );

		return $matched === 1;
	}

	/**
	 * Merge two sets of HTML attributes. Conflicting items in the second set
	 * will override those in the first, except for 'class' attributes which
	 * will be combined (if they're both strings).
*
	 * @todo implement merging for other attributes such as style
	 * @param array $a
	 * @param array $b
	 * @return array
	 */
	public static function mergeAttributes( $a, $b ) {
		$merged = array_merge( $a, $b );

		// Classes are only combined when both sides carry one
		if ( !isset( $a['class'] ) || !isset( $b['class'] ) ) {
			return $merged;
		}
		// ...and only when both are strings that actually differ
		if ( !is_string( $a['class'] ) || !is_string( $b['class'] )
			|| $a['class'] === $b['class']
		) {
			return $merged;
		}

		$names = preg_split( '/\s+/', $a['class'] . ' ' . $b['class'],
			-1, PREG_SPLIT_NO_EMPTY );
		$merged['class'] = implode( ' ', array_unique( $names ) );

		return $merged;
	}

	/**
	 * Bring CSS into a form that is easy to scan for hostile input:
	 * - character references are decoded
	 * - escape sequences are decoded
	 * - comments are removed, unless the entire value is one single comment
	 *
	 * @param string $value the css string
	 * @return string normalized css
	 */
	public static function normalizeCss( $value ) {
		// Decode character references like &#123;
		$value = self::decodeCharReferences( $value );

		// Decode escape sequences and line continuation
		// See the grammar in the CSS 2 spec, appendix D.
		// This has to be done AFTER decoding character references.
		// This means it isn't possible for this function to return
		// unsanitized escape sequences. It is possible to manufacture
		// input that contains character references that decode to
		// escape sequences that decode to character references, but
		// it's OK for the return value to contain character references
		// because the caller is supposed to escape those anyway.
		static $escapeRegex;
		if ( !$escapeRegex ) {
			$ws = '[\\x20\\t\\r\\n\\f]';
			$newline = '(?:\\n|\\r\\n|\\r|\\f)';
			$bs = '\\\\';
			// x modifier: each "#" comment has to end at a line break
			$escapeRegex = "/ {$bs}
				(?:
					({$newline}) |  # 1. Line continuation
					([0-9A-Fa-f]{1,6}){$ws}? |  # 2. character number
					(.) |  # 3. backslash cancelling special meaning
					() |  # 4. backslash at end of string
				)/xu";
		}
		$value = preg_replace_callback( $escapeRegex,
			[ __CLASS__, 'cssDecodeCallback' ], $value );

		// Let the value through if it's nothing but a single comment, to
		// allow other functions which may reject it to pass some error
		// message through.
		if ( preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
			return $value;
		}

		// Remove any comments; IE gets token splitting wrong
		// This must be done AFTER decoding character references and
		// escape sequences, because those steps can introduce comments
		// This step cannot introduce character references or escape
		// sequences, because it replaces comments with spaces rather
		// than removing them completely.
		$value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );

		// Remove anything after a comment-start token, to guard against
		// incorrect client implementations.
		$unclosedAt = strpos( $value, '/*' );

		return $unclosedAt === false ? $value : substr( $value, 0, $unclosedAt );
	}

	/**
	 * Pick apart some CSS and check it for forbidden or unsafe structures.
	 * Returns a sanitized string. This sanitized string will have
	 * character references and escape sequences decoded and comments
	 * stripped (unless it is itself one valid comment, in which case the value
	 * will be passed through). If the input is just too evil, only a comment
	 * complaining about evilness will be returned.
	 *
	 * Currently URL references, 'expression', 'tps' are forbidden.
	 *
	 * NOTE: Despite the fact that character references are decoded, the
	 * returned string may contain character references given certain
	 * clever input strings. These character references must
	 * be escaped before the return value is embedded in HTML.
*
	 * @param string $value
	 * @return string
	 */
	public static function checkCss( $value ) {
		$css = self::normalizeCss( $value );

		// Reject control characters and the UTF-8 replacement character
		$hasBadChar = preg_match( '/[\000-\010\013\016-\037\177]/', $css )
			|| strpos( $css, UtfNormal\Constants::UTF8_REPLACEMENT ) !== false;
		if ( $hasBadChar ) {
			return '/* invalid control char */';
		}

		// Reject problematic keywords and functions
		$forbidden = '! expression
				| filter\s*:
				| accelerator\s*:
				| -o-link\s*:
				| -o-link-source\s*:
				| -o-replace\s*:
				| url\s*\(
				| image\s*\(
				| image-set\s*\(
				| attr\s*\([^)]+[\s,]+url
				| var\s*\(
			!ix';
		if ( preg_match( $forbidden, $css ) ) {
			return '/* insecure input */';
		}

		return $css;
	}

	/**
	 * Replacement callback for the escape-sequence regex built in
	 * normalizeCss(): turns one matched CSS escape into its replacement text.
	 *
	 * @param array $matches Match groups: 1 = line continuation, 2 = hex
	 *   character number, 3 = single escaped character; all empty means a
	 *   backslash at the end of the string
	 * @return string
	 */
	private static function cssDecodeCallback( $matches ) {
		if ( $matches[1] !== '' ) {
			// Line continuation
			return '';
		}

		if ( $matches[2] !== '' ) {
			$char = UtfNormal\Utils::codepointToUtf8( hexdec( $matches[2] ) );
		} elseif ( $matches[3] !== '' ) {
			$char = $matches[3];
		} else {
			$char = '\\';
		}

		// These characters need to be escaped in strings
		$needsEscape = $char == "\n" || $char == '"' || $char == "'" || $char == '\\';

		// Either clean up the escape sequence to avoid parsing errors by
		// clients, or decode the unnecessary escape
		return $needsEscape
			? '\\' . dechex( ord( $char ) ) . ' '
			: $char;
	}

	/**
	 * Take a tag soup fragment listing an HTML element's attributes
	 * and normalize it to well-formed XML, discarding unwanted attributes.
	 * Output is safe for further wikitext processing, with escaping of
	 * values that could trigger problems.
*
	 * - Normalizes attribute names to lowercase
	 * - Discards attributes not allowed for the given element
	 * - Turns broken or invalid entities into plaintext
	 * - Double-quotes all attribute values
	 * - Attributes without values are given the name as attribute
	 * - Double attributes are discarded
	 * - Unsafe style attributes are discarded
	 * - Prepends space if there are attributes.
	 * - (Optionally) Sorts attributes by name.
	 *
	 * @param string $text
	 * @param string $element
	 * @param bool $sorted Whether to sort the attributes (default: false)
	 * @return string
	 */
	public static function fixTagAttributes( $text, $element, $sorted = false ) {
		if ( trim( $text ) === '' ) {
			return '';
		}

		$decoded = self::decodeTagAttributes( $text );
		$stripped = self::validateTagAttributes( $decoded, $element );

		if ( $sorted ) {
			ksort( $stripped );
		}

		return self::safeEncodeTagAttributes( $stripped );
	}

	/**
	 * Encode an attribute value for HTML output.
	 *
	 * Besides the htmlspecialchars() escaping (including both quote styles),
	 * newline, carriage return and tab are emitted as numeric character
	 * references.
	 *
	 * @param string $text
	 * @return string HTML-encoded text fragment
	 */
	public static function encodeAttribute( $text ) {
		$encValue = htmlspecialchars( $text, ENT_QUOTES );

		// Whitespace is normalized during attribute decoding,
		// so if we've been passed non-spaces we must encode them
		// ahead of time or they won't be preserved.
		$encValue = strtr( $encValue, [
			"\n" => '&#10;',
			"\r" => '&#13;',
			"\t" => '&#9;',
		] );

		return $encValue;
	}

	/**
	 * Armor French spaces with a replacement character
	 *
	 * @since 1.32
	 * @param string $text Text to armor
	 * @param string $space Space character for the French spaces, defaults to '&#160;'
	 * @return string Armored text
	 */
	public static function armorFrenchSpaces( $text, $space = '&#160;' ) {
		// Replace $ with \$ and \ with \\ so that $space is taken literally
		// when used as a preg_replace() replacement below
		$space = preg_replace( '#(?<!\\\\)(\\$|\\\\)#', '\\\\$1', $space );
		$fixtags = [
			# French spaces, last one Guillemet-left
			# only if there is something before the space
			# and a non-word character after the punctuation.
			'/(?<=\S) (?=[?:;!%»›](?!\w))/u' => "$space",
			# French spaces, Guillemet-right
			'/([«‹]) /u' => "\\1$space",
		];
		return preg_replace( array_keys( $fixtags ), array_values( $fixtags ), $text );
	}

	/**
	 * Encode an attribute value for HTML tags, with extra armoring
	 * against further wiki processing.
	 *
	 * On top of encodeAttribute(), characters and keywords that are
	 * significant in wikitext are replaced by character references.
	 *
	 * @param string $text
	 * @return string HTML-encoded text fragment
	 */
	public static function safeEncodeAttribute( $text ) {
		$encValue = self::encodeAttribute( $text );

		# Templates and links may be expanded in later parsing,
		# creating invalid or dangerous output. Suppress this.
		$encValue = strtr( $encValue, [
			'<' => '&lt;', // This should never happen,
			'>' => '&gt;', // we've received invalid input
			'"' => '&quot;', // which should have been escaped.
			'{' => '&#123;',
			'}' => '&#125;', // prevent unpaired language conversion syntax
			'[' => '&#91;',
			']' => '&#93;',
			"''" => '&#39;&#39;',
			'ISBN' => '&#73;SBN',
			'RFC' => '&#82;FC',
			'PMID' => '&#80;MID',
			'|' => '&#124;',
			'__' => '&#95;_',
		] );

		# Armor against French spaces detection (T5158)
		$encValue = self::armorFrenchSpaces( $encValue, '&#32;' );

		# Encode the colon of anything matching a URL protocol, presumably so
		# that later parsing does not pick the value up as a bare URL
		$encValue = preg_replace_callback(
			'/((?i)' . wfUrlProtocols() . ')/',
			static function ( $matches ) {
				return str_replace( ':', '&#58;', $matches[1] );
			},
			$encValue );
		return $encValue;
	}

	/**
	 * Given a value, escape it so that it can be used in an id attribute and
	 * return it. This will use HTML5 validation, allowing anything but ASCII
	 * whitespace.
	 *
	 * To ensure we don't have to bother escaping anything, we also strip ', ".
	 * TODO: Is this the best tactic?
	 *
	 * We also strip # because it upsets IE, and % because it could be
	 * ambiguous if it's part of something that looks like a percent escape
	 * (which don't work reliably in fragments cross-browser).
	 *
	 * @deprecated since 1.30, use one of this class' escapeIdFor*() functions
	 *
	 * @see https://www.w3.org/TR/html401/types.html#type-name Valid characters
	 *   in the id and name attributes
	 * @see https://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with
	 *   the id attribute
	 * @see https://www.w3.org/TR/html5/dom.html#the-id-attribute
	 *   HTML5 definition of id attribute
	 *
	 * @param string $id Id to escape
	 * @param string|array $options String or array of strings (default is []):
	 *   'noninitial': This is a non-initial fragment of an id, not a full id,
	 *       so don't pay attention if the first character isn't valid at the
	 *       beginning of an id.
1083 * @return string 1084 */ 1085 public static function escapeId( $id, $options = [] ) { 1086 wfDeprecated( __METHOD__, '1.30' ); 1087 $options = (array)$options; 1088 1089 // HTML4-style escaping 1090 static $replace = [ 1091 '%3A' => ':', 1092 '%' => '.' 1093 ]; 1094 1095 $id = urlencode( strtr( $id, ' ', '_' ) ); 1096 $id = strtr( $id, $replace ); 1097 1098 if ( !preg_match( '/^[a-zA-Z]/', $id ) && !in_array( 'noninitial', $options ) ) { 1099 // Initial character must be a letter! 1100 $id = "x$id"; 1101 } 1102 return $id; 1103 } 1104 1105 /** 1106 * Given a section name or other user-generated or otherwise unsafe string, escapes it to be 1107 * a valid HTML id attribute. 1108 * 1109 * WARNING: unlike escapeId(), the output of this function is not guaranteed to be HTML safe, 1110 * be sure to use proper escaping. 1111 * 1112 * @param string $id String to escape 1113 * @param int $mode One of ID_* constants, specifying whether the primary or fallback encoding 1114 * should be used. 1115 * @return string|bool Escaped ID or false if fallback encoding is requested but it's not 1116 * configured. 1117 * 1118 * @since 1.30 1119 */ 1120 public static function escapeIdForAttribute( $id, $mode = self::ID_PRIMARY ) { 1121 global $wgFragmentMode; 1122 1123 if ( !isset( $wgFragmentMode[$mode] ) ) { 1124 if ( $mode === self::ID_PRIMARY ) { 1125 throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' ); 1126 } 1127 return false; 1128 } 1129 1130 $internalMode = $wgFragmentMode[$mode]; 1131 1132 return self::escapeIdInternal( $id, $internalMode ); 1133 } 1134 1135 /** 1136 * Given a section name or other user-generated or otherwise unsafe string, escapes it to be 1137 * a valid URL fragment. 1138 * 1139 * WARNING: unlike escapeId(), the output of this function is not guaranteed to be HTML safe, 1140 * be sure to use proper escaping. 
1141 * 1142 * @param string $id String to escape 1143 * @return string Escaped ID 1144 * 1145 * @since 1.30 1146 */ 1147 public static function escapeIdForLink( $id ) { 1148 global $wgFragmentMode; 1149 1150 if ( !isset( $wgFragmentMode[self::ID_PRIMARY] ) ) { 1151 throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' ); 1152 } 1153 1154 $mode = $wgFragmentMode[self::ID_PRIMARY]; 1155 1156 $id = self::escapeIdInternalUrl( $id, $mode ); 1157 1158 return $id; 1159 } 1160 1161 /** 1162 * Given a section name or other user-generated or otherwise unsafe string, escapes it to be 1163 * a valid URL fragment for external interwikis. 1164 * 1165 * @param string $id String to escape 1166 * @return string Escaped ID 1167 * 1168 * @since 1.30 1169 */ 1170 public static function escapeIdForExternalInterwiki( $id ) { 1171 global $wgExternalInterwikiFragmentMode; 1172 1173 $id = self::escapeIdInternalUrl( $id, $wgExternalInterwikiFragmentMode ); 1174 1175 return $id; 1176 } 1177 1178 /** 1179 * Do percent encoding of percent signs for href (but not id) attributes 1180 * 1181 * @since 1.35 1182 * @see https://phabricator.wikimedia.org/T238385 1183 * @param string $id String to escape 1184 * @param string $mode One of modes from $wgFragmentMode 1185 * @return string 1186 */ 1187 private static function escapeIdInternalUrl( $id, $mode ) { 1188 $id = self::escapeIdInternal( $id, $mode ); 1189 if ( $mode === 'html5' ) { 1190 $id = preg_replace( '/%([a-fA-F0-9]{2})/', '%25$1', $id ); 1191 } 1192 return $id; 1193 } 1194 1195 /** 1196 * Helper for escapeIdFor*() functions. Performs most of the actual escaping. 1197 * 1198 * @param string $id String to escape 1199 * @param string $mode One of modes from $wgFragmentMode 1200 * @return string 1201 */ 1202 private static function escapeIdInternal( $id, $mode ) { 1203 // Truncate overly-long IDs. This isn't an HTML limit, it's just 1204 // griefer protection. 
[T251506] 1205 $id = mb_substr( $id, 0, 1024 ); 1206 1207 switch ( $mode ) { 1208 case 'html5': 1209 // html5 spec says ids must not have any of the following: 1210 // U+0009 TAB, U+000A LF, U+000C FF, U+000D CR, or U+0020 SPACE 1211 // In practice, in wikitext, only tab, LF, CR (and SPACE) are 1212 // possible using either Lua or html entities. 1213 $id = str_replace( [ "\t", "\n", "\f", "\r", " " ], '_', $id ); 1214 break; 1215 case 'legacy': 1216 // This corresponds to 'noninitial' mode of the old escapeId() 1217 static $replace = [ 1218 '%3A' => ':', 1219 '%' => '.' 1220 ]; 1221 1222 $id = urlencode( str_replace( ' ', '_', $id ) ); 1223 $id = strtr( $id, $replace ); 1224 break; 1225 default: 1226 throw new InvalidArgumentException( "Invalid mode '$mode' passed to '" . __METHOD__ ); 1227 } 1228 1229 return $id; 1230 } 1231 1232 /** 1233 * Given a string containing a space delimited list of ids, escape each id 1234 * to match ids escaped by the escapeIdForAttribute() function. 1235 * 1236 * @since 1.27 1237 * 1238 * @param string $referenceString Space delimited list of ids 1239 * @return string 1240 */ 1241 public static function escapeIdReferenceList( $referenceString ) { 1242 # Explode the space delimited list string into an array of tokens 1243 $references = preg_split( '/\s+/', "{$referenceString}", -1, PREG_SPLIT_NO_EMPTY ); 1244 1245 # Escape each token as an id 1246 foreach ( $references as &$ref ) { 1247 $ref = self::escapeIdForAttribute( $ref ); 1248 } 1249 1250 # Merge the array back to a space delimited list string 1251 # If the array is empty, the result will be an empty string ('') 1252 $referenceString = implode( ' ', $references ); 1253 1254 return $referenceString; 1255 } 1256 1257 /** 1258 * Given a value, escape it so that it can be used as a CSS class and 1259 * return it. 1260 * 1261 * @todo For extra validity, input should be validated UTF-8. 
1262 * 1263 * @see https://www.w3.org/TR/CSS21/syndata.html Valid characters/format 1264 * 1265 * @param string $class 1266 * @return string 1267 */ 1268 public static function escapeClass( $class ) { 1269 // Convert ugly stuff to underscores and kill underscores in ugly places 1270 return rtrim( preg_replace( 1271 [ '/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/', '/_+/' ], 1272 '_', 1273 $class ), '_' ); 1274 } 1275 1276 /** 1277 * Given HTML input, escape with htmlspecialchars but un-escape entities. 1278 * This allows (generally harmless) entities like   to survive. 1279 * 1280 * @param string $html HTML to escape 1281 * @return string Escaped input 1282 */ 1283 public static function escapeHtmlAllowEntities( $html ) { 1284 $html = self::decodeCharReferences( $html ); 1285 # It seems wise to escape ' as well as ", as a matter of course. Can't 1286 # hurt. Use ENT_SUBSTITUTE so that incorrectly truncated multibyte characters 1287 # don't cause the entire string to disappear. 1288 $html = htmlspecialchars( $html, ENT_QUOTES | ENT_SUBSTITUTE ); 1289 return $html; 1290 } 1291 1292 /** 1293 * Return an associative array of attribute names and values from 1294 * a partial tag string. Attribute names are forced to lowercase, 1295 * character references are decoded to UTF-8 text. 
1296 * 1297 * @param string $text 1298 * @return array 1299 */ 1300 public static function decodeTagAttributes( $text ) { 1301 if ( trim( $text ) == '' ) { 1302 return []; 1303 } 1304 1305 $pairs = []; 1306 if ( !preg_match_all( 1307 self::getAttribsRegex(), 1308 $text, 1309 $pairs, 1310 PREG_SET_ORDER ) ) { 1311 return []; 1312 } 1313 1314 $attribs = []; 1315 foreach ( $pairs as $set ) { 1316 $attribute = strtolower( $set[1] ); 1317 1318 // Filter attribute names with unacceptable characters 1319 if ( !preg_match( self::getAttribNameRegex(), $attribute ) ) { 1320 continue; 1321 } 1322 1323 $value = self::getTagAttributeCallback( $set ); 1324 1325 // Normalize whitespace 1326 $value = preg_replace( '/[\t\r\n ]+/', ' ', $value ); 1327 $value = trim( $value ); 1328 1329 // Decode character references 1330 $attribs[$attribute] = self::decodeCharReferences( $value ); 1331 } 1332 return $attribs; 1333 } 1334 1335 /** 1336 * Build a partial tag string from an associative array of attribute 1337 * names and values as returned by decodeTagAttributes. 1338 * 1339 * @param array $assoc_array 1340 * @return string 1341 */ 1342 public static function safeEncodeTagAttributes( $assoc_array ) { 1343 $attribs = []; 1344 foreach ( $assoc_array as $attribute => $value ) { 1345 $encAttribute = htmlspecialchars( $attribute ); 1346 $encValue = self::safeEncodeAttribute( $value ); 1347 1348 $attribs[] = "$encAttribute=\"$encValue\""; 1349 } 1350 return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : ''; 1351 } 1352 1353 /** 1354 * Pick the appropriate attribute value from a match set from the 1355 * attribs regex matches. 1356 * 1357 * @param array $set 1358 * @throws MWException When tag conditions are not met. 1359 * @return string 1360 */ 1361 private static function getTagAttributeCallback( $set ) { 1362 if ( isset( $set[5] ) ) { 1363 # No quotes. 
1364 return $set[5]; 1365 } elseif ( isset( $set[4] ) ) { 1366 # Single-quoted 1367 return $set[4]; 1368 } elseif ( isset( $set[3] ) ) { 1369 # Double-quoted 1370 return $set[3]; 1371 } elseif ( !isset( $set[2] ) ) { 1372 # In XHTML, attributes must have a value so return an empty string. 1373 # See "Empty attribute syntax", 1374 # https://www.w3.org/TR/html5/syntax.html#syntax-attribute-name 1375 return ""; 1376 } else { 1377 throw new MWException( "Tag conditions not met. This should never happen and is a bug." ); 1378 } 1379 } 1380 1381 /** 1382 * @param string $text 1383 * @return string 1384 */ 1385 private static function normalizeWhitespace( $text ) { 1386 return trim( preg_replace( 1387 '/(?:\r\n|[\x20\x0d\x0a\x09])+/', 1388 ' ', 1389 $text ) ); 1390 } 1391 1392 /** 1393 * Normalizes whitespace in a section name, such as might be returned 1394 * by Parser::stripSectionName(), for use in the id's that are used for 1395 * section links. 1396 * 1397 * @param string $section 1398 * @return string 1399 */ 1400 public static function normalizeSectionNameWhitespace( $section ) { 1401 return trim( preg_replace( '/[ _]+/', ' ', $section ) ); 1402 } 1403 1404 /** 1405 * Ensure that any entities and character references are legal 1406 * for XML and XHTML specifically. Any stray bits will be 1407 * &-escaped to result in a valid text fragment. 1408 * 1409 * a. named char refs can only be < > & ", others are 1410 * numericized (this way we're well-formed even without a DTD) 1411 * b. any numeric char refs must be legal chars, not invalid or forbidden 1412 * c. use lower cased "&#x", not "&#X" 1413 * d. 
fix or reject non-valid attributes 1414 * 1415 * @param string $text 1416 * @return string 1417 * @internal 1418 */ 1419 public static function normalizeCharReferences( $text ) { 1420 return preg_replace_callback( 1421 self::CHAR_REFS_REGEX, 1422 [ self::class, 'normalizeCharReferencesCallback' ], 1423 $text ); 1424 } 1425 1426 /** 1427 * @param string $matches 1428 * @return string 1429 */ 1430 private static function normalizeCharReferencesCallback( $matches ) { 1431 $ret = null; 1432 if ( $matches[1] != '' ) { 1433 $ret = self::normalizeEntity( $matches[1] ); 1434 } elseif ( $matches[2] != '' ) { 1435 $ret = self::decCharReference( $matches[2] ); 1436 } elseif ( $matches[3] != '' ) { 1437 $ret = self::hexCharReference( $matches[3] ); 1438 } 1439 if ( $ret === null ) { 1440 return htmlspecialchars( $matches[0] ); 1441 } else { 1442 return $ret; 1443 } 1444 } 1445 1446 /** 1447 * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, 1448 * return the equivalent numeric entity reference (except for the core < 1449 * > & "). If the entity is a MediaWiki-specific alias, returns 1450 * the HTML equivalent. Otherwise, returns HTML-escaped text of 1451 * pseudo-entity source (eg &foo;) 1452 * 1453 * @param string $name 1454 * @return string 1455 */ 1456 private static function normalizeEntity( $name ) { 1457 if ( isset( self::HTML_ENTITY_ALIASES[$name] ) ) { 1458 return '&' . self::HTML_ENTITY_ALIASES[$name] . ';'; 1459 } elseif ( in_array( $name, [ 'lt', 'gt', 'amp', 'quot' ] ) ) { 1460 return "&$name;"; 1461 } elseif ( isset( self::HTML_ENTITIES[$name] ) ) { 1462 return '&#' . self::HTML_ENTITIES[$name] . 
';'; 1463 } else { 1464 return "&$name;"; 1465 } 1466 } 1467 1468 /** 1469 * @param int $codepoint 1470 * @return null|string 1471 */ 1472 private static function decCharReference( $codepoint ) { 1473 $point = intval( $codepoint ); 1474 if ( self::validateCodepoint( $point ) ) { 1475 return sprintf( '&#%d;', $point ); 1476 } else { 1477 return null; 1478 } 1479 } 1480 1481 /** 1482 * @param int $codepoint 1483 * @return null|string 1484 */ 1485 private static function hexCharReference( $codepoint ) { 1486 $point = hexdec( $codepoint ); 1487 if ( self::validateCodepoint( $point ) ) { 1488 return sprintf( '&#x%x;', $point ); 1489 } else { 1490 return null; 1491 } 1492 } 1493 1494 /** 1495 * Returns true if a given Unicode codepoint is a valid character in 1496 * both HTML5 and XML. 1497 * @param int $codepoint 1498 * @return bool 1499 */ 1500 private static function validateCodepoint( $codepoint ) { 1501 # U+000C is valid in HTML5 but not allowed in XML. 1502 # U+000D is valid in XML but not allowed in HTML5. 1503 # U+007F - U+009F are disallowed in HTML5 (control characters). 1504 return $codepoint == 0x09 1505 || $codepoint == 0x0a 1506 || ( $codepoint >= 0x20 && $codepoint <= 0x7e ) 1507 || ( $codepoint >= 0xa0 && $codepoint <= 0xd7ff ) 1508 || ( $codepoint >= 0xe000 && $codepoint <= 0xfffd ) 1509 || ( $codepoint >= 0x10000 && $codepoint <= 0x10ffff ); 1510 } 1511 1512 /** 1513 * Decode any character references, numeric or named entities, 1514 * in the text and return a UTF-8 string. 1515 * 1516 * @param string $text 1517 * @return string 1518 */ 1519 public static function decodeCharReferences( $text ) { 1520 return preg_replace_callback( 1521 self::CHAR_REFS_REGEX, 1522 [ self::class, 'decodeCharReferencesCallback' ], 1523 $text ); 1524 } 1525 1526 /** 1527 * Decode any character references, numeric or named entities, 1528 * in the next and normalize the resulting string. 
(T16952) 1529 * 1530 * This is useful for page titles, not for text to be displayed, 1531 * MediaWiki allows HTML entities to escape normalization as a feature. 1532 * 1533 * @param string $text Already normalized, containing entities 1534 * @return string Still normalized, without entities 1535 */ 1536 public static function decodeCharReferencesAndNormalize( $text ) { 1537 $text = preg_replace_callback( 1538 self::CHAR_REFS_REGEX, 1539 [ self::class, 'decodeCharReferencesCallback' ], 1540 $text, 1541 -1, // limit 1542 $count 1543 ); 1544 1545 if ( $count ) { 1546 return MediaWikiServices::getInstance()->getContentLanguage()->normalize( $text ); 1547 } else { 1548 return $text; 1549 } 1550 } 1551 1552 /** 1553 * @param string $matches 1554 * @return string 1555 */ 1556 private static function decodeCharReferencesCallback( $matches ) { 1557 if ( $matches[1] != '' ) { 1558 return self::decodeEntity( $matches[1] ); 1559 } elseif ( $matches[2] != '' ) { 1560 return self::decodeChar( intval( $matches[2] ) ); 1561 } elseif ( $matches[3] != '' ) { 1562 return self::decodeChar( hexdec( $matches[3] ) ); 1563 } 1564 # Last case should be an ampersand by itself 1565 return $matches[0]; 1566 } 1567 1568 /** 1569 * Return UTF-8 string for a codepoint if that is a valid 1570 * character reference, otherwise U+FFFD REPLACEMENT CHARACTER. 1571 * @param int $codepoint 1572 * @return string 1573 * @internal 1574 */ 1575 private static function decodeChar( $codepoint ) { 1576 if ( self::validateCodepoint( $codepoint ) ) { 1577 return UtfNormal\Utils::codepointToUtf8( $codepoint ); 1578 } else { 1579 return UtfNormal\Constants::UTF8_REPLACEMENT; 1580 } 1581 } 1582 1583 /** 1584 * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, 1585 * return the UTF-8 encoding of that character. 
	 * Otherwise, returns
	 * pseudo-entity source (eg "&foo;")
	 *
	 * @param string $name Entity name without the leading '&' and trailing ';'
	 * @return string
	 */
	private static function decodeEntity( $name ) {
		// Resolve MediaWiki-specific aliases to their HTML name first
		if ( isset( self::HTML_ENTITY_ALIASES[$name] ) ) {
			$name = self::HTML_ENTITY_ALIASES[$name];
		}
		if ( isset( self::HTML_ENTITIES[$name] ) ) {
			return UtfNormal\Utils::codepointToUtf8( self::HTML_ENTITIES[$name] );
		} else {
			return "&$name;";
		}
	}

	/**
	 * Fetch the list of acceptable attributes for a given element name.
	 *
	 * @param string $element
	 * @return array An associative array where keys are acceptable attribute
	 *   names; empty if the element is not in the list
	 */
	private static function attributesAllowedInternal( $element ) {
		$list = self::setupAttributesAllowedInternal();
		return $list[$element] ?? [];
	}

	/**
	 * Foreach array key (an allowed HTML element), return an array
	 * of allowed attributes.
	 *
	 * The table is built on the first call and kept in a static variable.
	 *
	 * @return array An associative array: keys are HTML element names;
	 *   values are associative arrays where the keys are allowed attribute
	 *   names.
	 */
	private static function setupAttributesAllowedInternal() {
		static $allowed;

		if ( $allowed !== null ) {
			return $allowed;
		}

		// For lookup efficiency flip each attributes array so the keys are
		// the valid attributes.
		// $a is an already-flipped set; $b and $c are plain lists of names.
		$merge = function ( $a, $b, $c = [] ) {
			return array_merge( $a, array_flip( $b ), array_flip( $c ) );
		};
		$common = $merge( [], [
			# HTML
			'id',
			'class',
			'style',
			'lang',
			'dir',
			'title',
			'tabindex',

			# WAI-ARIA
			'aria-describedby',
			'aria-flowto',
			'aria-hidden',
			'aria-label',
			'aria-labelledby',
			'aria-owns',
			'role',

			# RDFa
			# These attributes are specified in section 9 of
			# https://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
			'about',
			'property',
			'resource',
			'datatype',
			'typeof',

			# Microdata. These are specified by
			# https://html.spec.whatwg.org/multipage/microdata.html#the-microdata-model
			'itemid',
			'itemprop',
			'itemref',
			'itemscope',
			'itemtype',
		] );

		// $common plus 'align'
		$block = $merge( $common, [ 'align' ] );

		// Plain (unflipped) lists, merged into the table rows/cells below
		$tablealign = [ 'align', 'valign' ];
		$tablecell = [
			'abbr',
			'axis',
			'headers',
			'scope',
			'rowspan',
			'colspan',
			'nowrap', # deprecated
			'width', # deprecated
			'height', # deprecated
			'bgcolor', # deprecated
		];

		# Numbers refer to sections in HTML 4.01 standard describing the element.
		# See: https://www.w3.org/TR/html4/
		$allowed = [
			# 7.5.4
			'div' => $block,
			'center' => $common, # deprecated
			'span' => $common,

			# 7.5.5
			'h1' => $block,
			'h2' => $block,
			'h3' => $block,
			'h4' => $block,
			'h5' => $block,
			'h6' => $block,

			# 7.5.6
			# address

			# 8.2.4
			'bdo' => $common,

			# 9.2.1
			'em' => $common,
			'strong' => $common,
			'cite' => $common,
			'dfn' => $common,
			'code' => $common,
			'samp' => $common,
			'kbd' => $common,
			'var' => $common,
			'abbr' => $common,
			# acronym

			# 9.2.2
			'blockquote' => $merge( $common, [ 'cite' ] ),
			'q' => $merge( $common, [ 'cite' ] ),

			# 9.2.3
			'sub' => $common,
			'sup' => $common,

			# 9.3.1
			'p' => $block,

			# 9.3.2
			'br' => $merge( $common, [ 'clear' ] ),

			# https://www.w3.org/TR/html5/text-level-semantics.html#the-wbr-element
			'wbr' => $common,

			# 9.3.4
			'pre' => $merge( $common, [ 'width' ] ),

			# 9.4
			'ins' => $merge( $common, [ 'cite', 'datetime' ] ),
			'del' => $merge( $common, [ 'cite', 'datetime' ] ),

			# 10.2
			'ul' => $merge( $common, [ 'type' ] ),
			'ol' => $merge( $common, [ 'type', 'start', 'reversed' ] ),
			'li' => $merge( $common, [ 'type', 'value' ] ),

			# 10.3
			'dl' => $common,
			'dd' => $common,
			'dt' => $common,

			# 11.2.1
			'table' => $merge( $common,
				[ 'summary', 'width', 'border', 'frame',
					'rules', 'cellspacing', 'cellpadding',
					'align', 'bgcolor',
				] ),

			# 11.2.2
			'caption' => $block,

			# 11.2.3
			'thead' => $common,
			'tfoot' => $common,
			'tbody' => $common,

			# 11.2.4
			'colgroup' => $merge( $common, [ 'span' ] ),
			'col' => $merge( $common, [ 'span' ] ),

			# 11.2.5
			'tr' => $merge( $common, [ 'bgcolor' ], $tablealign ),

			# 11.2.6
			'td' => $merge( $common, $tablecell, $tablealign ),
			'th' => $merge( $common, $tablecell, $tablealign ),

			# 12.2
			# NOTE: <a> is not allowed directly, but this list of allowed
			# attributes is used from the Parser object
			'a' => $merge( $common, [ 'href', 'rel', 'rev' ] ), # rel/rev esp. for RDFa

			# 13.2
			# Not usually allowed, but may be used for extension-style hooks
			# such as <math> when it is rasterized, or if $wgAllowImageTag is
			# true
			'img' => $merge( $common, [ 'alt', 'src', 'width', 'height', 'srcset' ] ),
			# Attributes for A/V tags added in T163583 / T133673
			'audio' => $merge( $common, [ 'controls', 'preload', 'width', 'height' ] ),
			'video' => $merge( $common, [ 'poster', 'controls', 'preload', 'width', 'height' ] ),
			'source' => $merge( $common, [ 'type', 'src' ] ),
			'track' => $merge( $common, [ 'type', 'src', 'srclang', 'kind', 'label' ] ),

			# 15.2.1
			'tt' => $common,
			'b' => $common,
			'i' => $common,
			'big' => $common,
			'small' => $common,
			'strike' => $common,
			's' => $common,
			'u' => $common,

			# 15.2.2
			'font' => $merge( $common, [ 'size', 'color', 'face' ] ),
			# basefont

			# 15.3
			'hr' => $merge( $common, [ 'width' ] ),

			# HTML Ruby annotation text module, simple ruby only.
			# https://www.w3.org/TR/html5/text-level-semantics.html#the-ruby-element
			'ruby' => $common,
			# rbc
			'rb' => $common,
			'rp' => $common,
			'rt' => $common, # $merge( $common, [ 'rbspan' ] ),
			'rtc' => $common,

			# MathML root element, where used for extensions
			# 'title' may not be 100% valid here; it's XHTML
			# https://www.w3.org/TR/REC-MathML/
			'math' => $merge( [], [ 'class', 'style', 'id', 'title' ] ),

			// HTML 5 section 4.5
			'figure' => $common,
			'figure-inline' => $common, # T118520
			'figcaption' => $common,

			# HTML 5 section 4.6
			'bdi' => $common,

			# HTML5 elements, defined by:
			# https://html.spec.whatwg.org/multipage/semantics.html#the-data-element
			'data' => $merge( $common, [ 'value' ] ),
			'time' => $merge( $common, [ 'datetime' ] ),
			'mark' => $common,

			// meta and link are only permitted by removeHTMLtags when Microdata
			// is enabled so we don't bother adding a conditional to hide these
			// Also meta and link are only valid in WikiText as Microdata elements
			// (ie: validateTag rejects tags missing the attributes needed for Microdata)
			// So we don't bother including $common attributes that have no purpose.
			'meta' => $merge( [], [ 'itemprop', 'content' ] ),
			'link' => $merge( [], [ 'itemprop', 'href', 'title' ] ),
		];

		return $allowed;
	}

	/**
	 * Take a fragment of (potentially invalid) HTML and return
	 * a version with any tags removed, encoded as plain text.
	 *
	 * Warning: this return value must be further escaped for literal
	 * inclusion in HTML output as of 1.10!
1859 * 1860 * @param string $html HTML fragment 1861 * @return string 1862 * @return-taint tainted 1863 */ 1864 public static function stripAllTags( $html ) { 1865 // Use RemexHtml to tokenize $html and extract the text 1866 $handler = new RemexStripTagHandler; 1867 $tokenizer = new RemexHtml\Tokenizer\Tokenizer( $handler, $html, [ 1868 'ignoreErrors' => true, 1869 // don't ignore char refs, we want them to be decoded 1870 'ignoreNulls' => true, 1871 'skipPreprocess' => true, 1872 ] ); 1873 $tokenizer->execute(); 1874 $text = $handler->getResult(); 1875 1876 $text = self::normalizeWhitespace( $text ); 1877 return $text; 1878 } 1879 1880 /** 1881 * Hack up a private DOCTYPE with HTML's standard entity declarations. 1882 * PHP 4 seemed to know these if you gave it an HTML doctype, but 1883 * PHP 5.1 doesn't. 1884 * 1885 * Use for passing XHTML fragments to PHP's XML parsing functions 1886 * 1887 * @return string 1888 */ 1889 public static function hackDocType() { 1890 $out = "<!DOCTYPE html [\n"; 1891 foreach ( self::HTML_ENTITIES as $entity => $codepoint ) { 1892 $out .= "<!ENTITY $entity \"&#$codepoint;\">"; 1893 } 1894 $out .= "]>\n"; 1895 return $out; 1896 } 1897 1898 /** 1899 * @param string $url 1900 * @return mixed|string 1901 */ 1902 public static function cleanUrl( $url ) { 1903 # Normalize any HTML entities in input. They will be 1904 # re-escaped by makeExternalLink(). 1905 $url = self::decodeCharReferences( $url ); 1906 1907 # Escape any control characters introduced by the above step 1908 $url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/', 1909 [ __CLASS__, 'cleanUrlCallback' ], $url ); 1910 1911 # Validate hostname portion 1912 $matches = []; 1913 if ( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) { 1914 list( /* $whole */, $protocol, $host, $rest ) = $matches; 1915 1916 // Characters that will be ignored in IDNs. 
1917 // https://tools.ietf.org/html/rfc3454#section-3.1 1918 // Strip them before further processing so deny lists and such work. 1919 $strip = "/ 1920 \\s| # general whitespace 1921 \xc2\xad| # 00ad SOFT HYPHEN 1922 \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN 1923 \xe2\x80\x8b| # 200b ZERO WIDTH SPACE 1924 \xe2\x81\xa0| # 2060 WORD JOINER 1925 \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE 1926 \xcd\x8f| # 034f COMBINING GRAPHEME JOINER 1927 \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE 1928 \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO 1929 \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE 1930 \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER 1931 \xe2\x80\x8d| # 200d ZERO WIDTH JOINER 1932 [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16 1933 /xuD"; 1934 1935 $host = preg_replace( $strip, '', $host ); 1936 1937 // IPv6 host names are bracketed with []. Url-decode these. 1938 if ( substr_compare( "//%5B", $host, 0, 5 ) === 0 && 1939 preg_match( '!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host, $matches ) 1940 ) { 1941 $host = '//[' . $matches[1] . ']' . $matches[2]; 1942 } 1943 1944 // @todo FIXME: Validate hostnames here 1945 1946 return $protocol . $host . $rest; 1947 } else { 1948 return $url; 1949 } 1950 } 1951 1952 /** 1953 * @param array $matches 1954 * @return string 1955 */ 1956 private static function cleanUrlCallback( $matches ) { 1957 return urlencode( $matches[0] ); 1958 } 1959 1960 /** 1961 * Does a string look like an e-mail address? 1962 * 1963 * This validates an email address using an HTML5 specification found at: 1964 * http://www.whatwg.org/html/states-of-the-type-attribute.html#valid-e-mail-address 1965 * Which as of 2011-01-24 says: 1966 * 1967 * A valid e-mail address is a string that matches the ABNF production 1968 * 1*( atext / "." ) "@" ldh-str *( "." ldh-str ) where atext is defined 1969 * in RFC 5322 section 3.2.3, and ldh-str is defined in RFC 1034 section 1970 * 3.5. 
1971 * 1972 * This function is an implementation of the specification as requested in 1973 * T24449. 1974 * 1975 * Client-side forms will use the same standard validation rules via JS or 1976 * HTML 5 validation; additional restrictions can be enforced server-side 1977 * by extensions via the 'isValidEmailAddr' hook. 1978 * 1979 * Note that this validation doesn't 100% match RFC 2822, but is believed 1980 * to be liberal enough for wide use. Some invalid addresses will still 1981 * pass validation here. 1982 * 1983 * @since 1.18 1984 * 1985 * @param string $addr E-mail address 1986 * @return bool 1987 */ 1988 public static function validateEmail( $addr ) { 1989 $result = null; 1990 if ( !Hooks::runner()->onIsValidEmailAddr( $addr, $result ) ) { 1991 return $result; 1992 } 1993 1994 // Please note strings below are enclosed in brackets [], this make the 1995 // hyphen "-" a range indicator. Hence it is double backslashed below. 1996 // See T28948 1997 $rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~"; 1998 $rfc1034_ldh_str = "a-z0-9\\-"; 1999 2000 $html5_email_regexp = "/ 2001 ^ # start of string 2002 [$rfc5322_atext\\.]+ # user part which is liberal :p 2003 @ # 'apostrophe' 2004 [$rfc1034_ldh_str]+ # First domain part 2005 (\\.[$rfc1034_ldh_str]+)* # Following part prefixed with a dot 2006 $ # End of string 2007 /ix"; // case Insensitive, eXtended 2008 2009 return (bool)preg_match( $html5_email_regexp, $addr ); 2010 } 2011} 2012