1<?php 2 3/** 4 * @file 5 * This file was auto-generated by generate-includes.php and includes all of 6 * the core files required by HTML Purifier. Use this if performance is a 7 * primary concern and you are using an opcode cache. PLEASE DO NOT EDIT THIS 8 * FILE, changes will be overwritten the next time the script is run. 9 * 10 * @version 4.10.0 11 * 12 * @warning 13 * You must *not* include any other HTML Purifier files before this file, 14 * because 'require' not 'require_once' is used. 15 * 16 * @warning 17 * This file requires that the include path contains the HTML Purifier 18 * library directory; this is not auto-set. 19 */ 20 21 22 23/*! @mainpage 24 * 25 * HTML Purifier is an HTML filter that will take an arbitrary snippet of 26 * HTML and rigorously test, validate and filter it into a version that 27 * is safe for output onto webpages. It achieves this by: 28 * 29 * -# Lexing (parsing into tokens) the document, 30 * -# Executing various strategies on the tokens: 31 * -# Removing all elements not in the whitelist, 32 * -# Making the tokens well-formed, 33 * -# Fixing the nesting of the nodes, and 34 * -# Validating attributes of the nodes; and 35 * -# Generating HTML from the purified tokens. 36 * 37 * However, most users will only need to interface with the HTMLPurifier 38 * and HTMLPurifier_Config. 39 */ 40 41/* 42 HTML Purifier 4.10.0 - Standards Compliant HTML Filtering 43 Copyright (C) 2006-2008 Edward Z. Yang 44 45 This library is free software; you can redistribute it and/or 46 modify it under the terms of the GNU Lesser General Public 47 License as published by the Free Software Foundation; either 48 version 2.1 of the License, or (at your option) any later version. 49 50 This library is distributed in the hope that it will be useful, 51 but WITHOUT ANY WARRANTY; without even the implied warranty of 52 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 53 Lesser General Public License for more details. 
54 55 You should have received a copy of the GNU Lesser General Public 56 License along with this library; if not, write to the Free Software 57 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 58 */ 59 60/** 61 * Facade that coordinates HTML Purifier's subsystems in order to purify HTML. 62 * 63 * @note There are several points in which configuration can be specified 64 * for HTML Purifier. The precedence of these (from lowest to 65 * highest) is as follows: 66 * -# Instance: new HTMLPurifier($config) 67 * -# Invocation: purify($html, $config) 68 * These configurations are entirely independent of each other and 69 * are *not* merged (this behavior may change in the future). 70 * 71 * @todo We need an easier way to inject strategies using the configuration 72 * object. 73 */ 74class HTMLPurifier 75{ 76 77 /** 78 * Version of HTML Purifier. 79 * @type string 80 */ 81 public $version = '4.10.0'; 82 83 /** 84 * Constant with version of HTML Purifier. 85 */ 86 const VERSION = '4.10.0'; 87 88 /** 89 * Global configuration object. 90 * @type HTMLPurifier_Config 91 */ 92 public $config; 93 94 /** 95 * Array of extra filter objects to run on HTML, 96 * for backwards compatibility. 97 * @type HTMLPurifier_Filter[] 98 */ 99 private $filters = array(); 100 101 /** 102 * Single instance of HTML Purifier. 103 * @type HTMLPurifier 104 */ 105 private static $instance; 106 107 /** 108 * @type HTMLPurifier_Strategy_Core 109 */ 110 protected $strategy; 111 112 /** 113 * @type HTMLPurifier_Generator 114 */ 115 protected $generator; 116 117 /** 118 * Resultant context of last run purification. 119 * Is an array of contexts if the last called method was purifyArray(). 120 * @type HTMLPurifier_Context 121 */ 122 public $context; 123 124 /** 125 * Initializes the purifier. 
126 * 127 * @param HTMLPurifier_Config|mixed $config Optional HTMLPurifier_Config object 128 * for all instances of the purifier, if omitted, a default 129 * configuration is supplied (which can be overridden on a 130 * per-use basis). 131 * The parameter can also be any type that 132 * HTMLPurifier_Config::create() supports. 133 */ 134 public function __construct($config = null) 135 { 136 $this->config = HTMLPurifier_Config::create($config); 137 $this->strategy = new HTMLPurifier_Strategy_Core(); 138 } 139 140 /** 141 * Adds a filter to process the output. First come first serve 142 * 143 * @param HTMLPurifier_Filter $filter HTMLPurifier_Filter object 144 */ 145 public function addFilter($filter) 146 { 147 trigger_error( 148 'HTMLPurifier->addFilter() is deprecated, use configuration directives' . 149 ' in the Filter namespace or Filter.Custom', 150 E_USER_WARNING 151 ); 152 $this->filters[] = $filter; 153 } 154 155 /** 156 * Filters an HTML snippet/document to be XSS-free and standards-compliant. 157 * 158 * @param string $html String of HTML to purify 159 * @param HTMLPurifier_Config $config Config object for this operation, 160 * if omitted, defaults to the config object specified during this 161 * object's construction. The parameter can also be any type 162 * that HTMLPurifier_Config::create() supports. 163 * 164 * @return string Purified HTML 165 */ 166 public function purify($html, $config = null) 167 { 168 // :TODO: make the config merge in, instead of replace 169 $config = $config ? 
HTMLPurifier_Config::create($config) : $this->config; 170 171 // implementation is partially environment dependant, partially 172 // configuration dependant 173 $lexer = HTMLPurifier_Lexer::create($config); 174 175 $context = new HTMLPurifier_Context(); 176 177 // setup HTML generator 178 $this->generator = new HTMLPurifier_Generator($config, $context); 179 $context->register('Generator', $this->generator); 180 181 // set up global context variables 182 if ($config->get('Core.CollectErrors')) { 183 // may get moved out if other facilities use it 184 $language_factory = HTMLPurifier_LanguageFactory::instance(); 185 $language = $language_factory->create($config, $context); 186 $context->register('Locale', $language); 187 188 $error_collector = new HTMLPurifier_ErrorCollector($context); 189 $context->register('ErrorCollector', $error_collector); 190 } 191 192 // setup id_accumulator context, necessary due to the fact that 193 // AttrValidator can be called from many places 194 $id_accumulator = HTMLPurifier_IDAccumulator::build($config, $context); 195 $context->register('IDAccumulator', $id_accumulator); 196 197 $html = HTMLPurifier_Encoder::convertToUTF8($html, $config, $context); 198 199 // setup filters 200 $filter_flags = $config->getBatch('Filter'); 201 $custom_filters = $filter_flags['Custom']; 202 unset($filter_flags['Custom']); 203 $filters = array(); 204 foreach ($filter_flags as $filter => $flag) { 205 if (!$flag) { 206 continue; 207 } 208 if (strpos($filter, '.') !== false) { 209 continue; 210 } 211 $class = "HTMLPurifier_Filter_$filter"; 212 $filters[] = new $class; 213 } 214 foreach ($custom_filters as $filter) { 215 // maybe "HTMLPurifier_Filter_$filter", but be consistent with AutoFormat 216 $filters[] = $filter; 217 } 218 $filters = array_merge($filters, $this->filters); 219 // maybe prepare(), but later 220 221 for ($i = 0, $filter_size = count($filters); $i < $filter_size; $i++) { 222 $html = $filters[$i]->preFilter($html, $config, $context); 223 } 
224 225 // purified HTML 226 $html = 227 $this->generator->generateFromTokens( 228 // list of tokens 229 $this->strategy->execute( 230 // list of un-purified tokens 231 $lexer->tokenizeHTML( 232 // un-purified HTML 233 $html, 234 $config, 235 $context 236 ), 237 $config, 238 $context 239 ) 240 ); 241 242 for ($i = $filter_size - 1; $i >= 0; $i--) { 243 $html = $filters[$i]->postFilter($html, $config, $context); 244 } 245 246 $html = HTMLPurifier_Encoder::convertFromUTF8($html, $config, $context); 247 $this->context =& $context; 248 return $html; 249 } 250 251 /** 252 * Filters an array of HTML snippets 253 * 254 * @param string[] $array_of_html Array of html snippets 255 * @param HTMLPurifier_Config $config Optional config object for this operation. 256 * See HTMLPurifier::purify() for more details. 257 * 258 * @return string[] Array of purified HTML 259 */ 260 public function purifyArray($array_of_html, $config = null) 261 { 262 $context_array = array(); 263 foreach ($array_of_html as $key => $html) { 264 $array_of_html[$key] = $this->purify($html, $config); 265 $context_array[$key] = $this->context; 266 } 267 $this->context = $context_array; 268 return $array_of_html; 269 } 270 271 /** 272 * Singleton for enforcing just one HTML Purifier in your system 273 * 274 * @param HTMLPurifier|HTMLPurifier_Config $prototype Optional prototype 275 * HTMLPurifier instance to overload singleton with, 276 * or HTMLPurifier_Config instance to configure the 277 * generated version with. 
278 * 279 * @return HTMLPurifier 280 */ 281 public static function instance($prototype = null) 282 { 283 if (!self::$instance || $prototype) { 284 if ($prototype instanceof HTMLPurifier) { 285 self::$instance = $prototype; 286 } elseif ($prototype) { 287 self::$instance = new HTMLPurifier($prototype); 288 } else { 289 self::$instance = new HTMLPurifier(); 290 } 291 } 292 return self::$instance; 293 } 294 295 /** 296 * Singleton for enforcing just one HTML Purifier in your system 297 * 298 * @param HTMLPurifier|HTMLPurifier_Config $prototype Optional prototype 299 * HTMLPurifier instance to overload singleton with, 300 * or HTMLPurifier_Config instance to configure the 301 * generated version with. 302 * 303 * @return HTMLPurifier 304 * @note Backwards compatibility, see instance() 305 */ 306 public static function getInstance($prototype = null) 307 { 308 return HTMLPurifier::instance($prototype); 309 } 310} 311 312 313 314 315 316/** 317 * Converts a stream of HTMLPurifier_Token into an HTMLPurifier_Node, 318 * and back again. 319 * 320 * @note This transformation is not an equivalence. We mutate the input 321 * token stream to make it so; see all [MUT] markers in code. 
322 */ 323class HTMLPurifier_Arborize 324{ 325 public static function arborize($tokens, $config, $context) { 326 $definition = $config->getHTMLDefinition(); 327 $parent = new HTMLPurifier_Token_Start($definition->info_parent); 328 $stack = array($parent->toNode()); 329 foreach ($tokens as $token) { 330 $token->skip = null; // [MUT] 331 $token->carryover = null; // [MUT] 332 if ($token instanceof HTMLPurifier_Token_End) { 333 $token->start = null; // [MUT] 334 $r = array_pop($stack); 335 //assert($r->name === $token->name); 336 //assert(empty($token->attr)); 337 $r->endCol = $token->col; 338 $r->endLine = $token->line; 339 $r->endArmor = $token->armor; 340 continue; 341 } 342 $node = $token->toNode(); 343 $stack[count($stack)-1]->children[] = $node; 344 if ($token instanceof HTMLPurifier_Token_Start) { 345 $stack[] = $node; 346 } 347 } 348 //assert(count($stack) == 1); 349 return $stack[0]; 350 } 351 352 public static function flatten($node, $config, $context) { 353 $level = 0; 354 $nodes = array($level => new HTMLPurifier_Queue(array($node))); 355 $closingTokens = array(); 356 $tokens = array(); 357 do { 358 while (!$nodes[$level]->isEmpty()) { 359 $node = $nodes[$level]->shift(); // FIFO 360 list($start, $end) = $node->toTokenPair(); 361 if ($level > 0) { 362 $tokens[] = $start; 363 } 364 if ($end !== NULL) { 365 $closingTokens[$level][] = $end; 366 } 367 if ($node instanceof HTMLPurifier_Node_Element) { 368 $level++; 369 $nodes[$level] = new HTMLPurifier_Queue(); 370 foreach ($node->children as $childNode) { 371 $nodes[$level]->push($childNode); 372 } 373 } 374 } 375 $level--; 376 if ($level && isset($closingTokens[$level])) { 377 while ($token = array_pop($closingTokens[$level])) { 378 $tokens[] = $token; 379 } 380 } 381 } while ($level > 0); 382 return $tokens; 383 } 384} 385 386 387 388/** 389 * Defines common attribute collections that modules reference 390 */ 391 392class HTMLPurifier_AttrCollections 393{ 394 395 /** 396 * Associative array of attribute 
collections, indexed by name. 397 * @type array 398 */ 399 public $info = array(); 400 401 /** 402 * Performs all expansions on internal data for use by other inclusions 403 * It also collects all attribute collection extensions from 404 * modules 405 * @param HTMLPurifier_AttrTypes $attr_types HTMLPurifier_AttrTypes instance 406 * @param HTMLPurifier_HTMLModule[] $modules Hash array of HTMLPurifier_HTMLModule members 407 */ 408 public function __construct($attr_types, $modules) 409 { 410 $this->doConstruct($attr_types, $modules); 411 } 412 413 public function doConstruct($attr_types, $modules) 414 { 415 // load extensions from the modules 416 foreach ($modules as $module) { 417 foreach ($module->attr_collections as $coll_i => $coll) { 418 if (!isset($this->info[$coll_i])) { 419 $this->info[$coll_i] = array(); 420 } 421 foreach ($coll as $attr_i => $attr) { 422 if ($attr_i === 0 && isset($this->info[$coll_i][$attr_i])) { 423 // merge in includes 424 $this->info[$coll_i][$attr_i] = array_merge( 425 $this->info[$coll_i][$attr_i], 426 $attr 427 ); 428 continue; 429 } 430 $this->info[$coll_i][$attr_i] = $attr; 431 } 432 } 433 } 434 // perform internal expansions and inclusions 435 foreach ($this->info as $name => $attr) { 436 // merge attribute collections that include others 437 $this->performInclusions($this->info[$name]); 438 // replace string identifiers with actual attribute objects 439 $this->expandIdentifiers($this->info[$name], $attr_types); 440 } 441 } 442 443 /** 444 * Takes a reference to an attribute associative array and performs 445 * all inclusions specified by the zero index. 
446 * @param array &$attr Reference to attribute array 447 */ 448 public function performInclusions(&$attr) 449 { 450 if (!isset($attr[0])) { 451 return; 452 } 453 $merge = $attr[0]; 454 $seen = array(); // recursion guard 455 // loop through all the inclusions 456 for ($i = 0; isset($merge[$i]); $i++) { 457 if (isset($seen[$merge[$i]])) { 458 continue; 459 } 460 $seen[$merge[$i]] = true; 461 // foreach attribute of the inclusion, copy it over 462 if (!isset($this->info[$merge[$i]])) { 463 continue; 464 } 465 foreach ($this->info[$merge[$i]] as $key => $value) { 466 if (isset($attr[$key])) { 467 continue; 468 } // also catches more inclusions 469 $attr[$key] = $value; 470 } 471 if (isset($this->info[$merge[$i]][0])) { 472 // recursion 473 $merge = array_merge($merge, $this->info[$merge[$i]][0]); 474 } 475 } 476 unset($attr[0]); 477 } 478 479 /** 480 * Expands all string identifiers in an attribute array by replacing 481 * them with the appropriate values inside HTMLPurifier_AttrTypes 482 * @param array &$attr Reference to attribute array 483 * @param HTMLPurifier_AttrTypes $attr_types HTMLPurifier_AttrTypes instance 484 */ 485 public function expandIdentifiers(&$attr, $attr_types) 486 { 487 // because foreach will process new elements we add, make sure we 488 // skip duplicates 489 $processed = array(); 490 491 foreach ($attr as $def_i => $def) { 492 // skip inclusions 493 if ($def_i === 0) { 494 continue; 495 } 496 497 if (isset($processed[$def_i])) { 498 continue; 499 } 500 501 // determine whether or not attribute is required 502 if ($required = (strpos($def_i, '*') !== false)) { 503 // rename the definition 504 unset($attr[$def_i]); 505 $def_i = trim($def_i, '*'); 506 $attr[$def_i] = $def; 507 } 508 509 $processed[$def_i] = true; 510 511 // if we've already got a literal object, move on 512 if (is_object($def)) { 513 // preserve previous required 514 $attr[$def_i]->required = ($required || $attr[$def_i]->required); 515 continue; 516 } 517 518 if ($def === 
false) { 519 unset($attr[$def_i]); 520 continue; 521 } 522 523 if ($t = $attr_types->get($def)) { 524 $attr[$def_i] = $t; 525 $attr[$def_i]->required = $required; 526 } else { 527 unset($attr[$def_i]); 528 } 529 } 530 } 531} 532 533 534 535 536 537/** 538 * Base class for all validating attribute definitions. 539 * 540 * This family of classes forms the core for not only HTML attribute validation, 541 * but also any sort of string that needs to be validated or cleaned (which 542 * means CSS properties and composite definitions are defined here too). 543 * Besides defining (through code) what precisely makes the string valid, 544 * subclasses are also responsible for cleaning the code if possible. 545 */ 546 547abstract class HTMLPurifier_AttrDef 548{ 549 550 /** 551 * Tells us whether or not an HTML attribute is minimized. 552 * Has no meaning in other contexts. 553 * @type bool 554 */ 555 public $minimized = false; 556 557 /** 558 * Tells us whether or not an HTML attribute is required. 559 * Has no meaning in other contexts 560 * @type bool 561 */ 562 public $required = false; 563 564 /** 565 * Validates and cleans passed string according to a definition. 566 * 567 * @param string $string String to be validated and cleaned. 568 * @param HTMLPurifier_Config $config Mandatory HTMLPurifier_Config object. 569 * @param HTMLPurifier_Context $context Mandatory HTMLPurifier_Context object. 570 */ 571 abstract public function validate($string, $config, $context); 572 573 /** 574 * Convenience method that parses a string as if it were CDATA. 575 * 576 * This method process a string in the manner specified at 577 * <http://www.w3.org/TR/html4/types.html#h-6.2> by removing 578 * leading and trailing whitespace, ignoring line feeds, and replacing 579 * carriage returns and tabs with spaces. While most useful for HTML 580 * attributes specified as CDATA, it can also be applied to most CSS 581 * values. 
582 * 583 * @note This method is not entirely standards compliant, as trim() removes 584 * more types of whitespace than specified in the spec. In practice, 585 * this is rarely a problem, as those extra characters usually have 586 * already been removed by HTMLPurifier_Encoder. 587 * 588 * @warning This processing is inconsistent with XML's whitespace handling 589 * as specified by section 3.3.3 and referenced XHTML 1.0 section 590 * 4.7. However, note that we are NOT necessarily 591 * parsing XML, thus, this behavior may still be correct. We 592 * assume that newlines have been normalized. 593 */ 594 public function parseCDATA($string) 595 { 596 $string = trim($string); 597 $string = str_replace(array("\n", "\t", "\r"), ' ', $string); 598 return $string; 599 } 600 601 /** 602 * Factory method for creating this class from a string. 603 * @param string $string String construction info 604 * @return HTMLPurifier_AttrDef Created AttrDef object corresponding to $string 605 */ 606 public function make($string) 607 { 608 // default implementation, return a flyweight of this object. 609 // If $string has an effect on the returned object (i.e. you 610 // need to overload this method), it is best 611 // to clone or instantiate new copies. (Instantiation is safer.) 612 return $this; 613 } 614 615 /** 616 * Removes spaces from rgb(0, 0, 0) so that shorthand CSS properties work 617 * properly. THIS IS A HACK! 618 * @param string $string a CSS colour definition 619 * @return string 620 */ 621 protected function mungeRgb($string) 622 { 623 $p = '\s*(\d+(\.\d+)?([%]?))\s*'; 624 625 if (preg_match('/(rgba|hsla)\(/', $string)) { 626 return preg_replace('/(rgba|hsla)\('.$p.','.$p.','.$p.','.$p.'\)/', '\1(\2,\5,\8,\11)', $string); 627 } 628 629 return preg_replace('/(rgb|hsl)\('.$p.','.$p.','.$p.'\)/', '\1(\2,\5,\8)', $string); 630 } 631 632 /** 633 * Parses a possibly escaped CSS string and returns the "pure" 634 * version of it. 
635 */ 636 protected function expandCSSEscape($string) 637 { 638 // flexibly parse it 639 $ret = ''; 640 for ($i = 0, $c = strlen($string); $i < $c; $i++) { 641 if ($string[$i] === '\\') { 642 $i++; 643 if ($i >= $c) { 644 $ret .= '\\'; 645 break; 646 } 647 if (ctype_xdigit($string[$i])) { 648 $code = $string[$i]; 649 for ($a = 1, $i++; $i < $c && $a < 6; $i++, $a++) { 650 if (!ctype_xdigit($string[$i])) { 651 break; 652 } 653 $code .= $string[$i]; 654 } 655 // We have to be extremely careful when adding 656 // new characters, to make sure we're not breaking 657 // the encoding. 658 $char = HTMLPurifier_Encoder::unichr(hexdec($code)); 659 if (HTMLPurifier_Encoder::cleanUTF8($char) === '') { 660 continue; 661 } 662 $ret .= $char; 663 if ($i < $c && trim($string[$i]) !== '') { 664 $i--; 665 } 666 continue; 667 } 668 if ($string[$i] === "\n") { 669 continue; 670 } 671 } 672 $ret .= $string[$i]; 673 } 674 return $ret; 675 } 676} 677 678 679 680 681 682/** 683 * Processes an entire attribute array for corrections needing multiple values. 684 * 685 * Occasionally, a certain attribute will need to be removed and popped onto 686 * another value. Instead of creating a complex return syntax for 687 * HTMLPurifier_AttrDef, we just pass the whole attribute array to a 688 * specialized object and have that do the special work. That is the 689 * family of HTMLPurifier_AttrTransform. 690 * 691 * An attribute transformation can be assigned to run before or after 692 * HTMLPurifier_AttrDef validation. See HTMLPurifier_HTMLDefinition for 693 * more details. 694 */ 695 696abstract class HTMLPurifier_AttrTransform 697{ 698 699 /** 700 * Abstract: makes changes to the attributes dependent on multiple values. 701 * 702 * @param array $attr Assoc array of attributes, usually from 703 * HTMLPurifier_Token_Tag::$attr 704 * @param HTMLPurifier_Config $config Mandatory HTMLPurifier_Config object. 
705 * @param HTMLPurifier_Context $context Mandatory HTMLPurifier_Context object 706 * @return array Processed attribute array. 707 */ 708 abstract public function transform($attr, $config, $context); 709 710 /** 711 * Prepends CSS properties to the style attribute, creating the 712 * attribute if it doesn't exist. 713 * @param array &$attr Attribute array to process (passed by reference) 714 * @param string $css CSS to prepend 715 */ 716 public function prependCSS(&$attr, $css) 717 { 718 $attr['style'] = isset($attr['style']) ? $attr['style'] : ''; 719 $attr['style'] = $css . $attr['style']; 720 } 721 722 /** 723 * Retrieves and removes an attribute 724 * @param array &$attr Attribute array to process (passed by reference) 725 * @param mixed $key Key of attribute to confiscate 726 * @return mixed 727 */ 728 public function confiscateAttr(&$attr, $key) 729 { 730 if (!isset($attr[$key])) { 731 return null; 732 } 733 $value = $attr[$key]; 734 unset($attr[$key]); 735 return $value; 736 } 737} 738 739 740 741 742 743/** 744 * Provides lookup array of attribute types to HTMLPurifier_AttrDef objects 745 */ 746class HTMLPurifier_AttrTypes 747{ 748 /** 749 * Lookup array of attribute string identifiers to concrete implementations. 750 * @type HTMLPurifier_AttrDef[] 751 */ 752 protected $info = array(); 753 754 /** 755 * Constructs the info array, supplying default implementations for attribute 756 * types. 757 */ 758 public function __construct() 759 { 760 // XXX This is kind of poor, since we don't actually /clone/ 761 // instances; instead, we use the supplied make() attribute. So, 762 // the underlying class must know how to deal with arguments. 763 // With the old implementation of Enum, that ignored its 764 // arguments when handling a make dispatch, the IAlign 765 // definition wouldn't work. 
766 767 // pseudo-types, must be instantiated via shorthand 768 $this->info['Enum'] = new HTMLPurifier_AttrDef_Enum(); 769 $this->info['Bool'] = new HTMLPurifier_AttrDef_HTML_Bool(); 770 771 $this->info['CDATA'] = new HTMLPurifier_AttrDef_Text(); 772 $this->info['ID'] = new HTMLPurifier_AttrDef_HTML_ID(); 773 $this->info['Length'] = new HTMLPurifier_AttrDef_HTML_Length(); 774 $this->info['MultiLength'] = new HTMLPurifier_AttrDef_HTML_MultiLength(); 775 $this->info['NMTOKENS'] = new HTMLPurifier_AttrDef_HTML_Nmtokens(); 776 $this->info['Pixels'] = new HTMLPurifier_AttrDef_HTML_Pixels(); 777 $this->info['Text'] = new HTMLPurifier_AttrDef_Text(); 778 $this->info['URI'] = new HTMLPurifier_AttrDef_URI(); 779 $this->info['LanguageCode'] = new HTMLPurifier_AttrDef_Lang(); 780 $this->info['Color'] = new HTMLPurifier_AttrDef_HTML_Color(); 781 $this->info['IAlign'] = self::makeEnum('top,middle,bottom,left,right'); 782 $this->info['LAlign'] = self::makeEnum('top,bottom,left,right'); 783 $this->info['FrameTarget'] = new HTMLPurifier_AttrDef_HTML_FrameTarget(); 784 785 // unimplemented aliases 786 $this->info['ContentType'] = new HTMLPurifier_AttrDef_Text(); 787 $this->info['ContentTypes'] = new HTMLPurifier_AttrDef_Text(); 788 $this->info['Charsets'] = new HTMLPurifier_AttrDef_Text(); 789 $this->info['Character'] = new HTMLPurifier_AttrDef_Text(); 790 791 // "proprietary" types 792 $this->info['Class'] = new HTMLPurifier_AttrDef_HTML_Class(); 793 794 // number is really a positive integer (one or more digits) 795 // FIXME: ^^ not always, see start and value of list items 796 $this->info['Number'] = new HTMLPurifier_AttrDef_Integer(false, false, true); 797 } 798 799 private static function makeEnum($in) 800 { 801 return new HTMLPurifier_AttrDef_Clone(new HTMLPurifier_AttrDef_Enum(explode(',', $in))); 802 } 803 804 /** 805 * Retrieves a type 806 * @param string $type String type name 807 * @return HTMLPurifier_AttrDef Object AttrDef for type 808 */ 809 public function get($type) 
810 { 811 // determine if there is any extra info tacked on 812 if (strpos($type, '#') !== false) { 813 list($type, $string) = explode('#', $type, 2); 814 } else { 815 $string = ''; 816 } 817 818 if (!isset($this->info[$type])) { 819 trigger_error('Cannot retrieve undefined attribute type ' . $type, E_USER_ERROR); 820 return; 821 } 822 return $this->info[$type]->make($string); 823 } 824 825 /** 826 * Sets a new implementation for a type 827 * @param string $type String type name 828 * @param HTMLPurifier_AttrDef $impl Object AttrDef for type 829 */ 830 public function set($type, $impl) 831 { 832 $this->info[$type] = $impl; 833 } 834} 835 836 837 838 839 840/** 841 * Validates the attributes of a token. Doesn't manage required attributes 842 * very well. The only reason we factored this out was because RemoveForeignElements 843 * also needed it besides ValidateAttributes. 844 */ 845class HTMLPurifier_AttrValidator 846{ 847 848 /** 849 * Validates the attributes of a token, mutating it as necessary. 850 * that has valid tokens 851 * @param HTMLPurifier_Token $token Token to validate. 
852 * @param HTMLPurifier_Config $config Instance of HTMLPurifier_Config 853 * @param HTMLPurifier_Context $context Instance of HTMLPurifier_Context 854 */ 855 public function validateToken($token, $config, $context) 856 { 857 $definition = $config->getHTMLDefinition(); 858 $e =& $context->get('ErrorCollector', true); 859 860 // initialize IDAccumulator if necessary 861 $ok =& $context->get('IDAccumulator', true); 862 if (!$ok) { 863 $id_accumulator = HTMLPurifier_IDAccumulator::build($config, $context); 864 $context->register('IDAccumulator', $id_accumulator); 865 } 866 867 // initialize CurrentToken if necessary 868 $current_token =& $context->get('CurrentToken', true); 869 if (!$current_token) { 870 $context->register('CurrentToken', $token); 871 } 872 873 if (!$token instanceof HTMLPurifier_Token_Start && 874 !$token instanceof HTMLPurifier_Token_Empty 875 ) { 876 return; 877 } 878 879 // create alias to global definition array, see also $defs 880 // DEFINITION CALL 881 $d_defs = $definition->info_global_attr; 882 883 // don't update token until the very end, to ensure an atomic update 884 $attr = $token->attr; 885 886 // do global transformations (pre) 887 // nothing currently utilizes this 888 foreach ($definition->info_attr_transform_pre as $transform) { 889 $attr = $transform->transform($o = $attr, $config, $context); 890 if ($e) { 891 if ($attr != $o) { 892 $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $o, $attr); 893 } 894 } 895 } 896 897 // do local transformations only applicable to this element (pre) 898 // ex. 
<p align="right"> to <p style="text-align:right;"> 899 foreach ($definition->info[$token->name]->attr_transform_pre as $transform) { 900 $attr = $transform->transform($o = $attr, $config, $context); 901 if ($e) { 902 if ($attr != $o) { 903 $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $o, $attr); 904 } 905 } 906 } 907 908 // create alias to this element's attribute definition array, see 909 // also $d_defs (global attribute definition array) 910 // DEFINITION CALL 911 $defs = $definition->info[$token->name]->attr; 912 913 $attr_key = false; 914 $context->register('CurrentAttr', $attr_key); 915 916 // iterate through all the attribute keypairs 917 // Watch out for name collisions: $key has previously been used 918 foreach ($attr as $attr_key => $value) { 919 920 // call the definition 921 if (isset($defs[$attr_key])) { 922 // there is a local definition defined 923 if ($defs[$attr_key] === false) { 924 // We've explicitly been told not to allow this element. 925 // This is usually when there's a global definition 926 // that must be overridden. 927 // Theoretically speaking, we could have a 928 // AttrDef_DenyAll, but this is faster! 929 $result = false; 930 } else { 931 // validate according to the element's definition 932 $result = $defs[$attr_key]->validate( 933 $value, 934 $config, 935 $context 936 ); 937 } 938 } elseif (isset($d_defs[$attr_key])) { 939 // there is a global definition defined, validate according 940 // to the global definition 941 $result = $d_defs[$attr_key]->validate( 942 $value, 943 $config, 944 $context 945 ); 946 } else { 947 // system never heard of the attribute? DELETE! 
948 $result = false; 949 } 950 951 // put the results into effect 952 if ($result === false || $result === null) { 953 // this is a generic error message that should replaced 954 // with more specific ones when possible 955 if ($e) { 956 $e->send(E_ERROR, 'AttrValidator: Attribute removed'); 957 } 958 959 // remove the attribute 960 unset($attr[$attr_key]); 961 } elseif (is_string($result)) { 962 // generally, if a substitution is happening, there 963 // was some sort of implicit correction going on. We'll 964 // delegate it to the attribute classes to say exactly what. 965 966 // simple substitution 967 $attr[$attr_key] = $result; 968 } else { 969 // nothing happens 970 } 971 972 // we'd also want slightly more complicated substitution 973 // involving an array as the return value, 974 // although we're not sure how colliding attributes would 975 // resolve (certain ones would be completely overriden, 976 // others would prepend themselves). 977 } 978 979 $context->destroy('CurrentAttr'); 980 981 // post transforms 982 983 // global (error reporting untested) 984 foreach ($definition->info_attr_transform_post as $transform) { 985 $attr = $transform->transform($o = $attr, $config, $context); 986 if ($e) { 987 if ($attr != $o) { 988 $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $o, $attr); 989 } 990 } 991 } 992 993 // local (error reporting untested) 994 foreach ($definition->info[$token->name]->attr_transform_post as $transform) { 995 $attr = $transform->transform($o = $attr, $config, $context); 996 if ($e) { 997 if ($attr != $o) { 998 $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $o, $attr); 999 } 1000 } 1001 } 1002 1003 $token->attr = $attr; 1004 1005 // destroy CurrentToken if we made it ourselves 1006 if (!$current_token) { 1007 $context->destroy('CurrentToken'); 1008 } 1009 1010 } 1011 1012 1013} 1014 1015 1016 1017 1018 1019// constants are slow, so we use as few as possible 1020if (!defined('HTMLPURIFIER_PREFIX')) { 1021 
// Accommodations for PHP versions earlier than 5.0.2, which do not define
// PHP_EOL themselves.
// Borrowed from PHP_Compat, LGPL licensed, by Aidan Lister <aidan@php.net>
if (!defined('PHP_EOL')) {
    // Windows is the only platform handled here whose native line ending is
    // CRLF. Darwin (macOS) is a Unix and uses "\n" like every other
    // non-Windows platform, so it must not be special-cased to "\r".
    if (strtoupper(substr(PHP_OS, 0, 3)) === 'WIN') {
        define('PHP_EOL', "\r\n");
    } else {
        define('PHP_EOL', "\n");
    }
}

/**
 * Bootstrap class that contains meta-functionality for HTML Purifier such as
 * the autoload function.
 *
 * @note
 *      This class may be used without any other files from HTML Purifier.
 */
class HTMLPurifier_Bootstrap
{

    /**
     * Autoload function for HTML Purifier.
     *
     * Resolves the class to a file with getPath() and loads it relative to
     * HTMLPURIFIER_PREFIX.
     *
     * @param string $class Class to load
     * @return bool True if a file was loaded, false if the class does not
     *              map to an existing HTML Purifier file.
     */
    public static function autoload($class)
    {
        $file = HTMLPurifier_Bootstrap::getPath($class);
        if (!$file) {
            return false;
        }
        // Technically speaking, it should be ok and more efficient to
        // just do 'require', but Antonio Parraga reports that with
        // Zend extensions such as Zend debugger and APC, this invariant
        // may be broken.  Since we have efficient alternatives, pay
        // the cost here and avoid the bug.
        require_once HTMLPURIFIER_PREFIX . '/' . $file;
        return true;
    }

    /**
     * Returns the path for a specific class.
     *
     * @param string $class Class path to get
     * @return string|bool Path relative to HTMLPURIFIER_PREFIX, or false if
     *                     the name does not start with "HTMLPurifier" or the
     *                     computed file does not exist.
     */
    public static function getPath($class)
    {
        // Only names beginning with the 12-character library prefix are ours.
        if (strncmp('HTMLPurifier', $class, 12) !== 0) {
            return false;
        }
        // Custom implementations
        if (strncmp('HTMLPurifier_Language_', $class, 22) === 0) {
            // Language classes live in a flat directory; underscores after
            // the 22-character prefix become dashes in the file name.
            $code = str_replace('_', '-', substr($class, 22));
            $file = 'HTMLPurifier/Language/classes/' . $code . '.php';
        } else {
            // Everything else maps underscores to directory separators.
            $file = str_replace('_', '/', $class) . '.php';
        }
        if (!file_exists(HTMLPURIFIER_PREFIX . '/' . $file)) {
            return false;
        }
        return $file;
    }

    /**
     * "Pre-registers" our autoloader on the SPL stack, i.e. places it in
     * front of any autoloaders that are already registered.
     *
     * @throws Exception On PHP older than 5.2.11 when an already registered
     *                   autoloader is a non-static object method.
     */
    public static function registerAutoload()
    {
        $autoload = array('HTMLPurifier_Bootstrap', 'autoload');
        if (($funcs = spl_autoload_functions()) === false) {
            // No autoload stack reported: a plain registration suffices.
            spl_autoload_register($autoload);
        } elseif (function_exists('spl_autoload_unregister')) {
            if (version_compare(PHP_VERSION, '5.3.0', '>=')) {
                // prepend flag exists, no need for shenanigans
                spl_autoload_register($autoload, true, true);
            } else {
                // Emulate prepending: unregister every existing loader,
                // register ours, then re-register the others in order.
                $buggy = version_compare(PHP_VERSION, '5.2.11', '<');
                $compat = version_compare(PHP_VERSION, '5.1.2', '<=') &&
                    version_compare(PHP_VERSION, '5.1.0', '>=');
                foreach ($funcs as $func) {
                    if ($buggy && is_array($func)) {
                        // :TRICKY: There are some compatibility issues and some
                        // places where we need to error out
                        $reflector = new ReflectionMethod($func[0], $func[1]);
                        if (!$reflector->isStatic()) {
                            throw new Exception(
                                'HTML Purifier autoloader registrar is not compatible
                                with non-static object methods due to PHP Bug #44144;
                                Please do not use HTMLPurifier.autoload.php (or any
                                file that includes this file); instead, place the code:
                                spl_autoload_register(array(\'HTMLPurifier_Bootstrap\', \'autoload\'))
                                after your own autoloaders.'
                            );
                        }
                        // Surprisingly, spl_autoload_register supports the
                        // Class::staticMethod callback format, although call_user_func doesn't
                        if ($compat) {
                            $func = implode('::', $func);
                        }
                    }
                    spl_autoload_unregister($func);
                }
                spl_autoload_register($autoload);
                foreach ($funcs as $func) {
                    spl_autoload_register($func);
                }
            }
        }
    }
}

/**
 * Super-class for definition datatype objects, implements serialization
 * functions for the class.
 */
abstract class HTMLPurifier_Definition
{

    /**
     * Has setup() been called yet?
     * @type bool
     */
    public $setup = false;

    /**
     * If true, write out the final definition object to the cache after
     * setup.  This will be true only if all invocations to get a raw
     * definition object are also optimized.  This does not cause file
     * system thrashing because on subsequent calls the cached object
     * is used and any writes to the raw definition object are short
     * circuited.  See enduser-customize.html for the high-level
     * picture.
     * @type bool
     */
    public $optimized = null;

    /**
     * What type of definition is it?
     * @type string
     */
    public $type;

    /**
     * Sets up the definition object into the final form, something
     * not done by the constructor
     * @param HTMLPurifier_Config $config
     */
    abstract protected function doSetup($config);

    /**
     * Setup function that aborts if already setup.
     *
     * The flag is raised before doSetup() runs, so a second call to setup()
     * returns immediately instead of running doSetup() again.
     *
     * @param HTMLPurifier_Config $config
     */
    public function setup($config)
    {
        if ($this->setup) {
            return;
        }
        $this->setup = true;
        $this->doSetup($config);
    }
}
/**
 * Defines allowed CSS attributes and what their values are.
 *
 * After setup, $info maps each supported CSS property name to the
 * attribute-definition object registered for it.
 *
 * @see HTMLPurifier_HTMLDefinition
 */
class HTMLPurifier_CSSDefinition extends HTMLPurifier_Definition
{

    /**
     * Definition type identifier.
     * @type string
     */
    public $type = 'CSS';

    /**
     * Assoc array of attribute name to definition object.
     * @type HTMLPurifier_AttrDef[]
     */
    public $info = array();

    /**
     * Constructs the info array.  The meat of this class.
     *
     * Registers the base property set, then the optional sets enabled by
     * CSS.Proprietary, CSS.AllowTricky and CSS.Trusted, wraps every entry in
     * an !important decorator and finally applies the allow/forbid lists.
     *
     * @param HTMLPurifier_Config $config
     */
    protected function doSetup($config)
    {
        $this->info['text-align'] = new HTMLPurifier_AttrDef_Enum(
            array('left', 'right', 'center', 'justify'),
            false
        );

        // Chained assignment: every key below (and $border_style) receives
        // the same validator instance.
        $border_style =
            $this->info['border-bottom-style'] =
            $this->info['border-right-style'] =
            $this->info['border-left-style'] =
            $this->info['border-top-style'] = new HTMLPurifier_AttrDef_Enum(
                array(
                    'none',
                    'hidden',
                    'dotted',
                    'dashed',
                    'solid',
                    'double',
                    'groove',
                    'ridge',
                    'inset',
                    'outset'
                ),
                false
            );

        $this->info['border-style'] = new HTMLPurifier_AttrDef_CSS_Multiple($border_style);

        $this->info['clear'] = new HTMLPurifier_AttrDef_Enum(
            array('none', 'left', 'right', 'both'),
            false
        );
        $this->info['float'] = new HTMLPurifier_AttrDef_Enum(
            array('none', 'left', 'right'),
            false
        );
        $this->info['font-style'] = new HTMLPurifier_AttrDef_Enum(
            array('normal', 'italic', 'oblique'),
            false
        );
        $this->info['font-variant'] = new HTMLPurifier_AttrDef_Enum(
            array('normal', 'small-caps'),
            false
        );

        // Shared by list-style-image and background-image below.
        $uri_or_none = new HTMLPurifier_AttrDef_CSS_Composite(
            array(
                new HTMLPurifier_AttrDef_Enum(array('none')),
                new HTMLPurifier_AttrDef_CSS_URI()
            )
        );

        $this->info['list-style-position'] = new HTMLPurifier_AttrDef_Enum(
            array('inside', 'outside'),
            false
        );
        $this->info['list-style-type'] = new HTMLPurifier_AttrDef_Enum(
            array(
                'disc',
                'circle',
                'square',
                'decimal',
                'lower-roman',
                'upper-roman',
                'lower-alpha',
                'upper-alpha',
                'none'
            ),
            false
        );
        $this->info['list-style-image'] = $uri_or_none;

        $this->info['list-style'] = new HTMLPurifier_AttrDef_CSS_ListStyle($config);

        $this->info['text-transform'] = new HTMLPurifier_AttrDef_Enum(
            array('capitalize', 'uppercase', 'lowercase', 'none'),
            false
        );
        $this->info['color'] = new HTMLPurifier_AttrDef_CSS_Color();

        $this->info['background-image'] = $uri_or_none;
        $this->info['background-repeat'] = new HTMLPurifier_AttrDef_Enum(
            array('repeat', 'repeat-x', 'repeat-y', 'no-repeat')
        );
        $this->info['background-attachment'] = new HTMLPurifier_AttrDef_Enum(
            array('scroll', 'fixed')
        );
        $this->info['background-position'] = new HTMLPurifier_AttrDef_CSS_BackgroundPosition();

        // The four border colors and background-color share one validator.
        $border_color =
            $this->info['border-top-color'] =
            $this->info['border-bottom-color'] =
            $this->info['border-left-color'] =
            $this->info['border-right-color'] =
            $this->info['background-color'] = new HTMLPurifier_AttrDef_CSS_Composite(
                array(
                    new HTMLPurifier_AttrDef_Enum(array('transparent')),
                    new HTMLPurifier_AttrDef_CSS_Color()
                )
            );

        $this->info['background'] = new HTMLPurifier_AttrDef_CSS_Background($config);

        $this->info['border-color'] = new HTMLPurifier_AttrDef_CSS_Multiple($border_color);

        $border_width =
            $this->info['border-top-width'] =
            $this->info['border-bottom-width'] =
            $this->info['border-left-width'] =
            $this->info['border-right-width'] = new HTMLPurifier_AttrDef_CSS_Composite(
                array(
                    new HTMLPurifier_AttrDef_Enum(array('thin', 'medium', 'thick')),
                    new HTMLPurifier_AttrDef_CSS_Length('0') //disallow negative
                )
            );

        $this->info['border-width'] = new HTMLPurifier_AttrDef_CSS_Multiple($border_width);

        $this->info['letter-spacing'] = new HTMLPurifier_AttrDef_CSS_Composite(
            array(
                new HTMLPurifier_AttrDef_Enum(array('normal')),
                new HTMLPurifier_AttrDef_CSS_Length()
            )
        );

        $this->info['word-spacing'] = new HTMLPurifier_AttrDef_CSS_Composite(
            array(
                new HTMLPurifier_AttrDef_Enum(array('normal')),
                new HTMLPurifier_AttrDef_CSS_Length()
            )
        );

        $this->info['font-size'] = new HTMLPurifier_AttrDef_CSS_Composite(
            array(
                new HTMLPurifier_AttrDef_Enum(
                    array(
                        'xx-small',
                        'x-small',
                        'small',
                        'medium',
                        'large',
                        'x-large',
                        'xx-large',
                        'larger',
                        'smaller'
                    )
                ),
                new HTMLPurifier_AttrDef_CSS_Percentage(),
                new HTMLPurifier_AttrDef_CSS_Length()
            )
        );

        $this->info['line-height'] = new HTMLPurifier_AttrDef_CSS_Composite(
            array(
                new HTMLPurifier_AttrDef_Enum(array('normal')),
                new HTMLPurifier_AttrDef_CSS_Number(true), // no negatives
                new HTMLPurifier_AttrDef_CSS_Length('0'),
                new HTMLPurifier_AttrDef_CSS_Percentage(true)
            )
        );

        $margin =
            $this->info['margin-top'] =
            $this->info['margin-bottom'] =
            $this->info['margin-left'] =
            $this->info['margin-right'] = new HTMLPurifier_AttrDef_CSS_Composite(
                array(
                    new HTMLPurifier_AttrDef_CSS_Length(),
                    new HTMLPurifier_AttrDef_CSS_Percentage(),
                    new HTMLPurifier_AttrDef_Enum(array('auto'))
                )
            );

        $this->info['margin'] = new HTMLPurifier_AttrDef_CSS_Multiple($margin);

        // non-negative
        $padding =
            $this->info['padding-top'] =
            $this->info['padding-bottom'] =
            $this->info['padding-left'] =
            $this->info['padding-right'] = new HTMLPurifier_AttrDef_CSS_Composite(
                array(
                    new HTMLPurifier_AttrDef_CSS_Length('0'),
                    new HTMLPurifier_AttrDef_CSS_Percentage(true)
                )
            );

        $this->info['padding'] = new HTMLPurifier_AttrDef_CSS_Multiple($padding);

        $this->info['text-indent'] = new HTMLPurifier_AttrDef_CSS_Composite(
            array(
                new HTMLPurifier_AttrDef_CSS_Length(),
                new HTMLPurifier_AttrDef_CSS_Percentage()
            )
        );

        // Validator for width/height-like properties when no image length
        // cap applies.
        $trusted_wh = new HTMLPurifier_AttrDef_CSS_Composite(
            array(
                new HTMLPurifier_AttrDef_CSS_Length('0'),
                new HTMLPurifier_AttrDef_CSS_Percentage(true),
                new HTMLPurifier_AttrDef_Enum(array('auto'))
            )
        );
        $max = $config->get('CSS.MaxImgLength');

        // The ternary binds tighter than the assignments, so all six keys
        // receive its result: $trusted_wh when CSS.MaxImgLength is null,
        // otherwise a switch that caps lengths at $max for 'img'.
        $this->info['min-width'] =
        $this->info['max-width'] =
        $this->info['min-height'] =
        $this->info['max-height'] =
        $this->info['width'] =
        $this->info['height'] =
            $max === null ?
                $trusted_wh :
                new HTMLPurifier_AttrDef_Switch(
                    'img',
                    // For img tags:
                    new HTMLPurifier_AttrDef_CSS_Composite(
                        array(
                            new HTMLPurifier_AttrDef_CSS_Length('0', $max),
                            new HTMLPurifier_AttrDef_Enum(array('auto'))
                        )
                    ),
                    // For everyone else:
                    $trusted_wh
                );

        $this->info['text-decoration'] = new HTMLPurifier_AttrDef_CSS_TextDecoration();

        $this->info['font-family'] = new HTMLPurifier_AttrDef_CSS_FontFamily();

        // this could use specialized code
        $this->info['font-weight'] = new HTMLPurifier_AttrDef_Enum(
            array(
                'normal',
                'bold',
                'bolder',
                'lighter',
                '100',
                '200',
                '300',
                '400',
                '500',
                '600',
                '700',
                '800',
                '900'
            ),
            false
        );

        // MUST be called after other font properties, as it references
        // a CSSDefinition object
        $this->info['font'] = new HTMLPurifier_AttrDef_CSS_Font($config);

        // same here
        $this->info['border'] =
        $this->info['border-bottom'] =
        $this->info['border-top'] =
        $this->info['border-left'] =
        $this->info['border-right'] = new HTMLPurifier_AttrDef_CSS_Border($config);

        $this->info['border-collapse'] = new HTMLPurifier_AttrDef_Enum(
            array('collapse', 'separate')
        );

        $this->info['caption-side'] = new HTMLPurifier_AttrDef_Enum(
            array('top', 'bottom')
        );

        $this->info['table-layout'] = new HTMLPurifier_AttrDef_Enum(
            array('auto', 'fixed')
        );

        $this->info['vertical-align'] = new HTMLPurifier_AttrDef_CSS_Composite(
            array(
                new HTMLPurifier_AttrDef_Enum(
                    array(
                        'baseline',
                        'sub',
                        'super',
                        'top',
                        'text-top',
                        'middle',
                        'bottom',
                        'text-bottom'
                    )
                ),
                new HTMLPurifier_AttrDef_CSS_Length(),
                new HTMLPurifier_AttrDef_CSS_Percentage()
            )
        );

        $this->info['border-spacing'] = new HTMLPurifier_AttrDef_CSS_Multiple(new HTMLPurifier_AttrDef_CSS_Length(), 2);

        // These CSS properties don't work on many browsers, but we live
        // in THE FUTURE!
        $this->info['white-space'] = new HTMLPurifier_AttrDef_Enum(
            array('nowrap', 'normal', 'pre', 'pre-wrap', 'pre-line')
        );

        // Optional property sets, each gated by its own directive.
        if ($config->get('CSS.Proprietary')) {
            $this->doSetupProprietary($config);
        }

        if ($config->get('CSS.AllowTricky')) {
            $this->doSetupTricky($config);
        }

        if ($config->get('CSS.Trusted')) {
            $this->doSetupTrusted($config);
        }

        $allow_important = $config->get('CSS.AllowImportant');
        // wrap all attr-defs with decorator that handles !important
        foreach ($this->info as $k => $v) {
            $this->info[$k] = new HTMLPurifier_AttrDef_CSS_ImportantDecorator($v, $allow_important);
        }

        // Runs last, so the allow/forbid lists see every registered property.
        $this->setupConfigStuff($config);
    }

    /**
     * Registers the properties that are only enabled by CSS.Proprietary.
     *
     * @param HTMLPurifier_Config $config
     */
    protected function doSetupProprietary($config)
    {
        // Internet Explorer only scrollbar colors
        $this->info['scrollbar-arrow-color'] = new HTMLPurifier_AttrDef_CSS_Color();
        $this->info['scrollbar-base-color'] = new HTMLPurifier_AttrDef_CSS_Color();
        $this->info['scrollbar-darkshadow-color'] = new HTMLPurifier_AttrDef_CSS_Color();
        $this->info['scrollbar-face-color'] = new HTMLPurifier_AttrDef_CSS_Color();
        $this->info['scrollbar-highlight-color'] = new HTMLPurifier_AttrDef_CSS_Color();
        $this->info['scrollbar-shadow-color'] = new HTMLPurifier_AttrDef_CSS_Color();

        // vendor specific prefixes of opacity
        $this->info['-moz-opacity'] = new HTMLPurifier_AttrDef_CSS_AlphaValue();
        $this->info['-khtml-opacity'] = new HTMLPurifier_AttrDef_CSS_AlphaValue();

        // only opacity, for now
        $this->info['filter'] = new HTMLPurifier_AttrDef_CSS_Filter();

        // more CSS3
        $this->info['page-break-after'] =
        $this->info['page-break-before'] = new HTMLPurifier_AttrDef_Enum(
            array(
                'auto',
                'always',
                'avoid',
                'left',
                'right'
            )
        );
        $this->info['page-break-inside'] = new HTMLPurifier_AttrDef_Enum(array('auto', 'avoid'));

        $border_radius = new HTMLPurifier_AttrDef_CSS_Composite(
            array(
                new HTMLPurifier_AttrDef_CSS_Percentage(true), // disallow negative
                new HTMLPurifier_AttrDef_CSS_Length('0') // disallow negative
            ));

        $this->info['border-top-left-radius'] =
        $this->info['border-top-right-radius'] =
        $this->info['border-bottom-right-radius'] =
        $this->info['border-bottom-left-radius'] = new HTMLPurifier_AttrDef_CSS_Multiple($border_radius, 2);
        // TODO: support SLASH syntax
        $this->info['border-radius'] = new HTMLPurifier_AttrDef_CSS_Multiple($border_radius, 4);

    }

    /**
     * Registers the properties that are only enabled by CSS.AllowTricky.
     *
     * @param HTMLPurifier_Config $config
     */
    protected function doSetupTricky($config)
    {
        $this->info['display'] = new HTMLPurifier_AttrDef_Enum(
            array(
                'inline',
                'block',
                'list-item',
                'run-in',
                'compact',
                'marker',
                'table',
                'inline-block',
                'inline-table',
                'table-row-group',
                'table-header-group',
                'table-footer-group',
                'table-row',
                'table-column-group',
                'table-column',
                'table-cell',
                'table-caption',
                'none'
            )
        );
        $this->info['visibility'] = new HTMLPurifier_AttrDef_Enum(
            array('visible', 'hidden', 'collapse')
        );
        $this->info['overflow'] = new HTMLPurifier_AttrDef_Enum(array('visible', 'hidden', 'auto', 'scroll'));
        $this->info['opacity'] = new HTMLPurifier_AttrDef_CSS_AlphaValue();
    }

    /**
     * Registers the properties that are only enabled by CSS.Trusted.
     *
     * @param HTMLPurifier_Config $config
     */
    protected function doSetupTrusted($config)
    {
        $this->info['position'] = new HTMLPurifier_AttrDef_Enum(
            array('static', 'relative', 'absolute', 'fixed')
        );
        $this->info['top'] =
        $this->info['left'] =
        $this->info['right'] =
        $this->info['bottom'] = new HTMLPurifier_AttrDef_CSS_Composite(
            array(
                new HTMLPurifier_AttrDef_CSS_Length(),
                new HTMLPurifier_AttrDef_CSS_Percentage(),
                new HTMLPurifier_AttrDef_Enum(array('auto')),
            )
        );
        $this->info['z-index'] = new HTMLPurifier_AttrDef_CSS_Composite(
            array(
                new HTMLPurifier_AttrDef_Integer(),
                new HTMLPurifier_AttrDef_Enum(array('auto')),
            )
        );
    }

    /**
     * Performs extra config-based processing. Based off of
     * HTMLPurifier_HTMLDefinition.
     *
     * Drops every registered property that is missing from
     * CSS.AllowedProperties (when that directive is set), warns about
     * allowed names that are not registered, then drops every property
     * listed in CSS.ForbiddenProperties.
     *
     * @param HTMLPurifier_Config $config
     * @todo Refactor duplicate elements into common class (probably using
     *       composition, not inheritance).
     */
    protected function setupConfigStuff($config)
    {
        // setup allowed elements
        $support = "(for information on implementing this, see the " .
            "support forums) ";
        $allowed_properties = $config->get('CSS.AllowedProperties');
        if ($allowed_properties !== null) {
            foreach ($this->info as $name => $d) {
                if (!isset($allowed_properties[$name])) {
                    unset($this->info[$name]);
                }
                // Tick the name off; whatever is left afterwards was
                // requested but is not a registered property.
                unset($allowed_properties[$name]);
            }
            // emit errors
            foreach ($allowed_properties as $name => $d) {
                // :TODO: Is this htmlspecialchars() call really necessary?
                $name = htmlspecialchars($name);
                trigger_error("Style attribute '$name' is not supported $support", E_USER_WARNING);
            }
        }

        $forbidden_properties = $config->get('CSS.ForbiddenProperties');
        if ($forbidden_properties !== null) {
            foreach ($this->info as $name => $d) {
                if (isset($forbidden_properties[$name])) {
                    unset($this->info[$name]);
                }
            }
        }
    }
}
1723 * @param HTMLPurifier_Config $config HTMLPurifier_Config object 1724 * @return array 1725 */ 1726 public function getAllowedElements($config) 1727 { 1728 return $this->elements; 1729 } 1730 1731 /** 1732 * Validates nodes according to definition and returns modification. 1733 * 1734 * @param HTMLPurifier_Node[] $children Array of HTMLPurifier_Node 1735 * @param HTMLPurifier_Config $config HTMLPurifier_Config object 1736 * @param HTMLPurifier_Context $context HTMLPurifier_Context object 1737 * @return bool|array true to leave nodes as is, false to remove parent node, array of replacement children 1738 */ 1739 abstract public function validateChildren($children, $config, $context); 1740} 1741 1742 1743 1744 1745 1746/** 1747 * Configuration object that triggers customizable behavior. 1748 * 1749 * @warning This class is strongly defined: that means that the class 1750 * will fail if an undefined directive is retrieved or set. 1751 * 1752 * @note Many classes that could (although many times don't) use the 1753 * configuration object make it a mandatory parameter. This is 1754 * because a configuration object should always be forwarded, 1755 * otherwise, you run the risk of missing a parameter and then 1756 * being stumped when a configuration directive doesn't work. 1757 * 1758 * @todo Reconsider some of the public member variables 1759 */ 1760class HTMLPurifier_Config 1761{ 1762 1763 /** 1764 * HTML Purifier's version 1765 * @type string 1766 */ 1767 public $version = '4.10.0'; 1768 1769 /** 1770 * Whether or not to automatically finalize 1771 * the object if a read operation is done. 1772 * @type bool 1773 */ 1774 public $autoFinalize = true; 1775 1776 // protected member variables 1777 1778 /** 1779 * Namespace indexed array of serials for specific namespaces. 1780 * @see getSerial() for more info. 1781 * @type string[] 1782 */ 1783 protected $serials = array(); 1784 1785 /** 1786 * Serial for entire configuration object. 
1787 * @type string 1788 */ 1789 protected $serial; 1790 1791 /** 1792 * Parser for variables. 1793 * @type HTMLPurifier_VarParser_Flexible 1794 */ 1795 protected $parser = null; 1796 1797 /** 1798 * Reference HTMLPurifier_ConfigSchema for value checking. 1799 * @type HTMLPurifier_ConfigSchema 1800 * @note This is public for introspective purposes. Please don't 1801 * abuse! 1802 */ 1803 public $def; 1804 1805 /** 1806 * Indexed array of definitions. 1807 * @type HTMLPurifier_Definition[] 1808 */ 1809 protected $definitions; 1810 1811 /** 1812 * Whether or not config is finalized. 1813 * @type bool 1814 */ 1815 protected $finalized = false; 1816 1817 /** 1818 * Property list containing configuration directives. 1819 * @type array 1820 */ 1821 protected $plist; 1822 1823 /** 1824 * Whether or not a set is taking place due to an alias lookup. 1825 * @type bool 1826 */ 1827 private $aliasMode; 1828 1829 /** 1830 * Set to false if you do not want line and file numbers in errors. 1831 * (useful when unit testing). This will also compress some errors 1832 * and exceptions. 1833 * @type bool 1834 */ 1835 public $chatty = true; 1836 1837 /** 1838 * Current lock; only gets to this namespace are allowed. 1839 * @type string 1840 */ 1841 private $lock; 1842 1843 /** 1844 * Constructor 1845 * @param HTMLPurifier_ConfigSchema $definition ConfigSchema that defines 1846 * what directives are allowed. 1847 * @param HTMLPurifier_PropertyList $parent 1848 */ 1849 public function __construct($definition, $parent = null) 1850 { 1851 $parent = $parent ? $parent : $definition->defaultPlist; 1852 $this->plist = new HTMLPurifier_PropertyList($parent); 1853 $this->def = $definition; // keep a copy around for checking 1854 $this->parser = new HTMLPurifier_VarParser_Flexible(); 1855 } 1856 1857 /** 1858 * Convenience constructor that creates a config object based on a mixed var 1859 * @param mixed $config Variable that defines the state of the config 1860 * object. 
Can be: a HTMLPurifier_Config() object, 1861 * an array of directives based on loadArray(), 1862 * or a string filename of an ini file. 1863 * @param HTMLPurifier_ConfigSchema $schema Schema object 1864 * @return HTMLPurifier_Config Configured object 1865 */ 1866 public static function create($config, $schema = null) 1867 { 1868 if ($config instanceof HTMLPurifier_Config) { 1869 // pass-through 1870 return $config; 1871 } 1872 if (!$schema) { 1873 $ret = HTMLPurifier_Config::createDefault(); 1874 } else { 1875 $ret = new HTMLPurifier_Config($schema); 1876 } 1877 if (is_string($config)) { 1878 $ret->loadIni($config); 1879 } elseif (is_array($config)) $ret->loadArray($config); 1880 return $ret; 1881 } 1882 1883 /** 1884 * Creates a new config object that inherits from a previous one. 1885 * @param HTMLPurifier_Config $config Configuration object to inherit from. 1886 * @return HTMLPurifier_Config object with $config as its parent. 1887 */ 1888 public static function inherit(HTMLPurifier_Config $config) 1889 { 1890 return new HTMLPurifier_Config($config->def, $config->plist); 1891 } 1892 1893 /** 1894 * Convenience constructor that creates a default configuration object. 1895 * @return HTMLPurifier_Config default object. 1896 */ 1897 public static function createDefault() 1898 { 1899 $definition = HTMLPurifier_ConfigSchema::instance(); 1900 $config = new HTMLPurifier_Config($definition); 1901 return $config; 1902 } 1903 1904 /** 1905 * Retrieves a value from the configuration. 
1906 * 1907 * @param string $key String key 1908 * @param mixed $a 1909 * 1910 * @return mixed 1911 */ 1912 public function get($key, $a = null) 1913 { 1914 if ($a !== null) { 1915 $this->triggerError( 1916 "Using deprecated API: use \$config->get('$key.$a') instead", 1917 E_USER_WARNING 1918 ); 1919 $key = "$key.$a"; 1920 } 1921 if (!$this->finalized) { 1922 $this->autoFinalize(); 1923 } 1924 if (!isset($this->def->info[$key])) { 1925 // can't add % due to SimpleTest bug 1926 $this->triggerError( 1927 'Cannot retrieve value of undefined directive ' . htmlspecialchars($key), 1928 E_USER_WARNING 1929 ); 1930 return; 1931 } 1932 if (isset($this->def->info[$key]->isAlias)) { 1933 $d = $this->def->info[$key]; 1934 $this->triggerError( 1935 'Cannot get value from aliased directive, use real name ' . $d->key, 1936 E_USER_ERROR 1937 ); 1938 return; 1939 } 1940 if ($this->lock) { 1941 list($ns) = explode('.', $key); 1942 if ($ns !== $this->lock) { 1943 $this->triggerError( 1944 'Cannot get value of namespace ' . $ns . ' when lock for ' . 1945 $this->lock . 1946 ' is active, this probably indicates a Definition setup method ' . 1947 'is accessing directives that are not within its namespace', 1948 E_USER_ERROR 1949 ); 1950 return; 1951 } 1952 } 1953 return $this->plist->get($key); 1954 } 1955 1956 /** 1957 * Retrieves an array of directives to values from a given namespace 1958 * 1959 * @param string $namespace String namespace 1960 * 1961 * @return array 1962 */ 1963 public function getBatch($namespace) 1964 { 1965 if (!$this->finalized) { 1966 $this->autoFinalize(); 1967 } 1968 $full = $this->getAll(); 1969 if (!isset($full[$namespace])) { 1970 $this->triggerError( 1971 'Cannot retrieve undefined namespace ' . 
1972 htmlspecialchars($namespace), 1973 E_USER_WARNING 1974 ); 1975 return; 1976 } 1977 return $full[$namespace]; 1978 } 1979 1980 /** 1981 * Returns a SHA-1 signature of a segment of the configuration object 1982 * that uniquely identifies that particular configuration 1983 * 1984 * @param string $namespace Namespace to get serial for 1985 * 1986 * @return string 1987 * @note Revision is handled specially and is removed from the batch 1988 * before processing! 1989 */ 1990 public function getBatchSerial($namespace) 1991 { 1992 if (empty($this->serials[$namespace])) { 1993 $batch = $this->getBatch($namespace); 1994 unset($batch['DefinitionRev']); 1995 $this->serials[$namespace] = sha1(serialize($batch)); 1996 } 1997 return $this->serials[$namespace]; 1998 } 1999 2000 /** 2001 * Returns a SHA-1 signature for the entire configuration object 2002 * that uniquely identifies that particular configuration 2003 * 2004 * @return string 2005 */ 2006 public function getSerial() 2007 { 2008 if (empty($this->serial)) { 2009 $this->serial = sha1(serialize($this->getAll())); 2010 } 2011 return $this->serial; 2012 } 2013 2014 /** 2015 * Retrieves all directives, organized by namespace 2016 * 2017 * @warning This is a pretty inefficient function, avoid if you can 2018 */ 2019 public function getAll() 2020 { 2021 if (!$this->finalized) { 2022 $this->autoFinalize(); 2023 } 2024 $ret = array(); 2025 foreach ($this->plist->squash() as $name => $value) { 2026 list($ns, $key) = explode('.', $name, 2); 2027 $ret[$ns][$key] = $value; 2028 } 2029 return $ret; 2030 } 2031 2032 /** 2033 * Sets a value to configuration. 2034 * 2035 * @param string $key key 2036 * @param mixed $value value 2037 * @param mixed $a 2038 */ 2039 public function set($key, $value, $a = null) 2040 { 2041 if (strpos($key, '.') === false) { 2042 $namespace = $key; 2043 $directive = $value; 2044 $value = $a; 2045 $key = "$key.$directive"; 2046 $this->triggerError("Using deprecated API: use \$config->set('$key', ...) 
instead", E_USER_NOTICE); 2047 } else { 2048 list($namespace) = explode('.', $key); 2049 } 2050 if ($this->isFinalized('Cannot set directive after finalization')) { 2051 return; 2052 } 2053 if (!isset($this->def->info[$key])) { 2054 $this->triggerError( 2055 'Cannot set undefined directive ' . htmlspecialchars($key) . ' to value', 2056 E_USER_WARNING 2057 ); 2058 return; 2059 } 2060 $def = $this->def->info[$key]; 2061 2062 if (isset($def->isAlias)) { 2063 if ($this->aliasMode) { 2064 $this->triggerError( 2065 'Double-aliases not allowed, please fix '. 2066 'ConfigSchema bug with' . $key, 2067 E_USER_ERROR 2068 ); 2069 return; 2070 } 2071 $this->aliasMode = true; 2072 $this->set($def->key, $value); 2073 $this->aliasMode = false; 2074 $this->triggerError("$key is an alias, preferred directive name is {$def->key}", E_USER_NOTICE); 2075 return; 2076 } 2077 2078 // Raw type might be negative when using the fully optimized form 2079 // of stdClass, which indicates allow_null == true 2080 $rtype = is_int($def) ? $def : $def->type; 2081 if ($rtype < 0) { 2082 $type = -$rtype; 2083 $allow_null = true; 2084 } else { 2085 $type = $rtype; 2086 $allow_null = isset($def->allow_null); 2087 } 2088 2089 try { 2090 $value = $this->parser->parse($value, $type, $allow_null); 2091 } catch (HTMLPurifier_VarParserException $e) { 2092 $this->triggerError( 2093 'Value for ' . $key . ' is of invalid type, should be ' . 2094 HTMLPurifier_VarParser::getTypeName($type), 2095 E_USER_WARNING 2096 ); 2097 return; 2098 } 2099 if (is_string($value) && is_object($def)) { 2100 // resolve value alias if defined 2101 if (isset($def->aliases[$value])) { 2102 $value = $def->aliases[$value]; 2103 } 2104 // check to see if the value is allowed 2105 if (isset($def->allowed) && !isset($def->allowed[$value])) { 2106 $this->triggerError( 2107 'Value not supported, valid values are: ' . 
2108 $this->_listify($def->allowed), 2109 E_USER_WARNING 2110 ); 2111 return; 2112 } 2113 } 2114 $this->plist->set($key, $value); 2115 2116 // reset definitions if the directives they depend on changed 2117 // this is a very costly process, so it's discouraged 2118 // with finalization 2119 if ($namespace == 'HTML' || $namespace == 'CSS' || $namespace == 'URI') { 2120 $this->definitions[$namespace] = null; 2121 } 2122 2123 $this->serials[$namespace] = false; 2124 } 2125 2126 /** 2127 * Convenience function for error reporting 2128 * 2129 * @param array $lookup 2130 * 2131 * @return string 2132 */ 2133 private function _listify($lookup) 2134 { 2135 $list = array(); 2136 foreach ($lookup as $name => $b) { 2137 $list[] = $name; 2138 } 2139 return implode(', ', $list); 2140 } 2141 2142 /** 2143 * Retrieves object reference to the HTML definition. 2144 * 2145 * @param bool $raw Return a copy that has not been setup yet. Must be 2146 * called before it's been setup, otherwise won't work. 2147 * @param bool $optimized If true, this method may return null, to 2148 * indicate that a cached version of the modified 2149 * definition object is available and no further edits 2150 * are necessary. Consider using 2151 * maybeGetRawHTMLDefinition, which is more explicitly 2152 * named, instead. 2153 * 2154 * @return HTMLPurifier_HTMLDefinition 2155 */ 2156 public function getHTMLDefinition($raw = false, $optimized = false) 2157 { 2158 return $this->getDefinition('HTML', $raw, $optimized); 2159 } 2160 2161 /** 2162 * Retrieves object reference to the CSS definition 2163 * 2164 * @param bool $raw Return a copy that has not been setup yet. Must be 2165 * called before it's been setup, otherwise won't work. 2166 * @param bool $optimized If true, this method may return null, to 2167 * indicate that a cached version of the modified 2168 * definition object is available and no further edits 2169 * are necessary. 
Consider using 2170 * maybeGetRawCSSDefinition, which is more explicitly 2171 * named, instead. 2172 * 2173 * @return HTMLPurifier_CSSDefinition 2174 */ 2175 public function getCSSDefinition($raw = false, $optimized = false) 2176 { 2177 return $this->getDefinition('CSS', $raw, $optimized); 2178 } 2179 2180 /** 2181 * Retrieves object reference to the URI definition 2182 * 2183 * @param bool $raw Return a copy that has not been setup yet. Must be 2184 * called before it's been setup, otherwise won't work. 2185 * @param bool $optimized If true, this method may return null, to 2186 * indicate that a cached version of the modified 2187 * definition object is available and no further edits 2188 * are necessary. Consider using 2189 * maybeGetRawURIDefinition, which is more explicitly 2190 * named, instead. 2191 * 2192 * @return HTMLPurifier_URIDefinition 2193 */ 2194 public function getURIDefinition($raw = false, $optimized = false) 2195 { 2196 return $this->getDefinition('URI', $raw, $optimized); 2197 } 2198 2199 /** 2200 * Retrieves a definition 2201 * 2202 * @param string $type Type of definition: HTML, CSS, etc 2203 * @param bool $raw Whether or not definition should be returned raw 2204 * @param bool $optimized Only has an effect when $raw is true. Whether 2205 * or not to return null if the result is already present in 2206 * the cache. This is off by default for backwards 2207 * compatibility reasons, but you need to do things this 2208 * way in order to ensure that caching is done properly. 2209 * Check out enduser-customize.html for more details. 2210 * We probably won't ever change this default, as much as the 2211 * maybe semantics is the "right thing to do." 
2212 * 2213 * @throws HTMLPurifier_Exception 2214 * @return HTMLPurifier_Definition 2215 */ 2216 public function getDefinition($type, $raw = false, $optimized = false) 2217 { 2218 if ($optimized && !$raw) { 2219 throw new HTMLPurifier_Exception("Cannot set optimized = true when raw = false"); 2220 } 2221 if (!$this->finalized) { 2222 $this->autoFinalize(); 2223 } 2224 // temporarily suspend locks, so we can handle recursive definition calls 2225 $lock = $this->lock; 2226 $this->lock = null; 2227 $factory = HTMLPurifier_DefinitionCacheFactory::instance(); 2228 $cache = $factory->create($type, $this); 2229 $this->lock = $lock; 2230 if (!$raw) { 2231 // full definition 2232 // --------------- 2233 // check if definition is in memory 2234 if (!empty($this->definitions[$type])) { 2235 $def = $this->definitions[$type]; 2236 // check if the definition is setup 2237 if ($def->setup) { 2238 return $def; 2239 } else { 2240 $def->setup($this); 2241 if ($def->optimized) { 2242 $cache->add($def, $this); 2243 } 2244 return $def; 2245 } 2246 } 2247 // check if definition is in cache 2248 $def = $cache->get($this); 2249 if ($def) { 2250 // definition in cache, save to memory and return it 2251 $this->definitions[$type] = $def; 2252 return $def; 2253 } 2254 // initialize it 2255 $def = $this->initDefinition($type); 2256 // set it up 2257 $this->lock = $type; 2258 $def->setup($this); 2259 $this->lock = null; 2260 // save in cache 2261 $cache->add($def, $this); 2262 // return it 2263 return $def; 2264 } else { 2265 // raw definition 2266 // -------------- 2267 // check preconditions 2268 $def = null; 2269 if ($optimized) { 2270 if (is_null($this->get($type . 
'.DefinitionID'))) { 2271 // fatally error out if definition ID not set 2272 throw new HTMLPurifier_Exception( 2273 "Cannot retrieve raw version without specifying %$type.DefinitionID" 2274 ); 2275 } 2276 } 2277 if (!empty($this->definitions[$type])) { 2278 $def = $this->definitions[$type]; 2279 if ($def->setup && !$optimized) { 2280 $extra = $this->chatty ? 2281 " (try moving this code block earlier in your initialization)" : 2282 ""; 2283 throw new HTMLPurifier_Exception( 2284 "Cannot retrieve raw definition after it has already been setup" . 2285 $extra 2286 ); 2287 } 2288 if ($def->optimized === null) { 2289 $extra = $this->chatty ? " (try flushing your cache)" : ""; 2290 throw new HTMLPurifier_Exception( 2291 "Optimization status of definition is unknown" . $extra 2292 ); 2293 } 2294 if ($def->optimized !== $optimized) { 2295 $msg = $optimized ? "optimized" : "unoptimized"; 2296 $extra = $this->chatty ? 2297 " (this backtrace is for the first inconsistent call, which was for a $msg raw definition)" 2298 : ""; 2299 throw new HTMLPurifier_Exception( 2300 "Inconsistent use of optimized and unoptimized raw definition retrievals" . $extra 2301 ); 2302 } 2303 } 2304 // check if definition was in memory 2305 if ($def) { 2306 if ($def->setup) { 2307 // invariant: $optimized === true (checked above) 2308 return null; 2309 } else { 2310 return $def; 2311 } 2312 } 2313 // if optimized, check if definition was in cache 2314 // (because we do the memory check first, this formulation 2315 // is prone to cache slamming, but I think 2316 // guaranteeing that either /all/ of the raw 2317 // setup code or /none/ of it is run is more important.) 2318 if ($optimized) { 2319 // This code path only gets run once; once we put 2320 // something in $definitions (which is guaranteed by the 2321 // trailing code), we always short-circuit above. 
2322 $def = $cache->get($this); 2323 if ($def) { 2324 // save the full definition for later, but don't 2325 // return it yet 2326 $this->definitions[$type] = $def; 2327 return null; 2328 } 2329 } 2330 // check invariants for creation 2331 if (!$optimized) { 2332 if (!is_null($this->get($type . '.DefinitionID'))) { 2333 if ($this->chatty) { 2334 $this->triggerError( 2335 'Due to a documentation error in previous version of HTML Purifier, your ' . 2336 'definitions are not being cached. If this is OK, you can remove the ' . 2337 '%$type.DefinitionRev and %$type.DefinitionID declaration. Otherwise, ' . 2338 'modify your code to use maybeGetRawDefinition, and test if the returned ' . 2339 'value is null before making any edits (if it is null, that means that a ' . 2340 'cached version is available, and no raw operations are necessary). See ' . 2341 '<a href="http://htmlpurifier.org/docs/enduser-customize.html#optimized">' . 2342 'Customize</a> for more details', 2343 E_USER_WARNING 2344 ); 2345 } else { 2346 $this->triggerError( 2347 "Useless DefinitionID declaration", 2348 E_USER_WARNING 2349 ); 2350 } 2351 } 2352 } 2353 // initialize it 2354 $def = $this->initDefinition($type); 2355 $def->optimized = $optimized; 2356 return $def; 2357 } 2358 throw new HTMLPurifier_Exception("The impossible happened!"); 2359 } 2360 2361 /** 2362 * Initialise definition 2363 * 2364 * @param string $type What type of definition to create 2365 * 2366 * @return HTMLPurifier_CSSDefinition|HTMLPurifier_HTMLDefinition|HTMLPurifier_URIDefinition 2367 * @throws HTMLPurifier_Exception 2368 */ 2369 private function initDefinition($type) 2370 { 2371 // quick checks failed, let's create the object 2372 if ($type == 'HTML') { 2373 $def = new HTMLPurifier_HTMLDefinition(); 2374 } elseif ($type == 'CSS') { 2375 $def = new HTMLPurifier_CSSDefinition(); 2376 } elseif ($type == 'URI') { 2377 $def = new HTMLPurifier_URIDefinition(); 2378 } else { 2379 throw new HTMLPurifier_Exception( 2380 "Definition of 
$type type not supported" 2381 ); 2382 } 2383 $this->definitions[$type] = $def; 2384 return $def; 2385 } 2386 2387 public function maybeGetRawDefinition($name) 2388 { 2389 return $this->getDefinition($name, true, true); 2390 } 2391 2392 /** 2393 * @return HTMLPurifier_HTMLDefinition 2394 */ 2395 public function maybeGetRawHTMLDefinition() 2396 { 2397 return $this->getDefinition('HTML', true, true); 2398 } 2399 2400 /** 2401 * @return HTMLPurifier_CSSDefinition 2402 */ 2403 public function maybeGetRawCSSDefinition() 2404 { 2405 return $this->getDefinition('CSS', true, true); 2406 } 2407 2408 /** 2409 * @return HTMLPurifier_URIDefinition 2410 */ 2411 public function maybeGetRawURIDefinition() 2412 { 2413 return $this->getDefinition('URI', true, true); 2414 } 2415 2416 /** 2417 * Loads configuration values from an array with the following structure: 2418 * Namespace.Directive => Value 2419 * 2420 * @param array $config_array Configuration associative array 2421 */ 2422 public function loadArray($config_array) 2423 { 2424 if ($this->isFinalized('Cannot load directives after finalization')) { 2425 return; 2426 } 2427 foreach ($config_array as $key => $value) { 2428 $key = str_replace('_', '.', $key); 2429 if (strpos($key, '.') !== false) { 2430 $this->set($key, $value); 2431 } else { 2432 $namespace = $key; 2433 $namespace_values = $value; 2434 foreach ($namespace_values as $directive => $value2) { 2435 $this->set($namespace .'.'. $directive, $value2); 2436 } 2437 } 2438 } 2439 } 2440 2441 /** 2442 * Returns a list of array(namespace, directive) for all directives 2443 * that are allowed in a web-form context as per an allowed 2444 * namespaces/directives list. 
2445 * 2446 * @param array $allowed List of allowed namespaces/directives 2447 * @param HTMLPurifier_ConfigSchema $schema Schema to use, if not global copy 2448 * 2449 * @return array 2450 */ 2451 public static function getAllowedDirectivesForForm($allowed, $schema = null) 2452 { 2453 if (!$schema) { 2454 $schema = HTMLPurifier_ConfigSchema::instance(); 2455 } 2456 if ($allowed !== true) { 2457 if (is_string($allowed)) { 2458 $allowed = array($allowed); 2459 } 2460 $allowed_ns = array(); 2461 $allowed_directives = array(); 2462 $blacklisted_directives = array(); 2463 foreach ($allowed as $ns_or_directive) { 2464 if (strpos($ns_or_directive, '.') !== false) { 2465 // directive 2466 if ($ns_or_directive[0] == '-') { 2467 $blacklisted_directives[substr($ns_or_directive, 1)] = true; 2468 } else { 2469 $allowed_directives[$ns_or_directive] = true; 2470 } 2471 } else { 2472 // namespace 2473 $allowed_ns[$ns_or_directive] = true; 2474 } 2475 } 2476 } 2477 $ret = array(); 2478 foreach ($schema->info as $key => $def) { 2479 list($ns, $directive) = explode('.', $key, 2); 2480 if ($allowed !== true) { 2481 if (isset($blacklisted_directives["$ns.$directive"])) { 2482 continue; 2483 } 2484 if (!isset($allowed_directives["$ns.$directive"]) && !isset($allowed_ns[$ns])) { 2485 continue; 2486 } 2487 } 2488 if (isset($def->isAlias)) { 2489 continue; 2490 } 2491 if ($directive == 'DefinitionID' || $directive == 'DefinitionRev') { 2492 continue; 2493 } 2494 $ret[] = array($ns, $directive); 2495 } 2496 return $ret; 2497 } 2498 2499 /** 2500 * Loads configuration values from $_GET/$_POST that were posted 2501 * via ConfigForm 2502 * 2503 * @param array $array $_GET or $_POST array to import 2504 * @param string|bool $index Index/name that the config variables are in 2505 * @param array|bool $allowed List of allowed namespaces/directives 2506 * @param bool $mq_fix Boolean whether or not to enable magic quotes fix 2507 * @param HTMLPurifier_ConfigSchema $schema Schema to use, if not 
global copy 2508 * 2509 * @return mixed 2510 */ 2511 public static function loadArrayFromForm($array, $index = false, $allowed = true, $mq_fix = true, $schema = null) 2512 { 2513 $ret = HTMLPurifier_Config::prepareArrayFromForm($array, $index, $allowed, $mq_fix, $schema); 2514 $config = HTMLPurifier_Config::create($ret, $schema); 2515 return $config; 2516 } 2517 2518 /** 2519 * Merges in configuration values from $_GET/$_POST to object. NOT STATIC. 2520 * 2521 * @param array $array $_GET or $_POST array to import 2522 * @param string|bool $index Index/name that the config variables are in 2523 * @param array|bool $allowed List of allowed namespaces/directives 2524 * @param bool $mq_fix Boolean whether or not to enable magic quotes fix 2525 */ 2526 public function mergeArrayFromForm($array, $index = false, $allowed = true, $mq_fix = true) 2527 { 2528 $ret = HTMLPurifier_Config::prepareArrayFromForm($array, $index, $allowed, $mq_fix, $this->def); 2529 $this->loadArray($ret); 2530 } 2531 2532 /** 2533 * Prepares an array from a form into something usable for the more 2534 * strict parts of HTMLPurifier_Config 2535 * 2536 * @param array $array $_GET or $_POST array to import 2537 * @param string|bool $index Index/name that the config variables are in 2538 * @param array|bool $allowed List of allowed namespaces/directives 2539 * @param bool $mq_fix Boolean whether or not to enable magic quotes fix 2540 * @param HTMLPurifier_ConfigSchema $schema Schema to use, if not global copy 2541 * 2542 * @return array 2543 */ 2544 public static function prepareArrayFromForm($array, $index = false, $allowed = true, $mq_fix = true, $schema = null) 2545 { 2546 if ($index !== false) { 2547 $array = (isset($array[$index]) && is_array($array[$index])) ? 
$array[$index] : array(); 2548 } 2549 2550 $allowed = HTMLPurifier_Config::getAllowedDirectivesForForm($allowed, $schema); 2551 $ret = array(); 2552 foreach ($allowed as $key) { 2553 list($ns, $directive) = $key; 2554 $skey = "$ns.$directive"; 2555 if (!empty($array["Null_$skey"])) { 2556 $ret[$ns][$directive] = null; 2557 continue; 2558 } 2559 if (!isset($array[$skey])) { 2560 continue; 2561 } 2562 $ret[$ns][$directive] = $array[$skey]; 2563 } 2564 return $ret; 2565 } 2566 2567 /** 2568 * Loads configuration values from an ini file 2569 * 2570 * @param string $filename Name of ini file 2571 */ 2572 public function loadIni($filename) 2573 { 2574 if ($this->isFinalized('Cannot load directives after finalization')) { 2575 return; 2576 } 2577 $array = parse_ini_file($filename, true); 2578 $this->loadArray($array); 2579 } 2580 2581 /** 2582 * Checks whether or not the configuration object is finalized. 2583 * 2584 * @param string|bool $error String error message, or false for no error 2585 * 2586 * @return bool 2587 */ 2588 public function isFinalized($error = false) 2589 { 2590 if ($this->finalized && $error) { 2591 $this->triggerError($error, E_USER_ERROR); 2592 } 2593 return $this->finalized; 2594 } 2595 2596 /** 2597 * Finalizes configuration only if auto finalize is on and not 2598 * already finalized 2599 */ 2600 public function autoFinalize() 2601 { 2602 if ($this->autoFinalize) { 2603 $this->finalize(); 2604 } else { 2605 $this->plist->squash(true); 2606 } 2607 } 2608 2609 /** 2610 * Finalizes a configuration object, prohibiting further change 2611 */ 2612 public function finalize() 2613 { 2614 $this->finalized = true; 2615 $this->parser = null; 2616 } 2617 2618 /** 2619 * Produces a nicely formatted error message by supplying the 2620 * stack frame information OUTSIDE of HTMLPurifier_Config. 
2621 * 2622 * @param string $msg An error message 2623 * @param int $no An error number 2624 */ 2625 protected function triggerError($msg, $no) 2626 { 2627 // determine previous stack frame 2628 $extra = ''; 2629 if ($this->chatty) { 2630 $trace = debug_backtrace(); 2631 // zip(tail(trace), trace) -- but PHP is not Haskell har har 2632 for ($i = 0, $c = count($trace); $i < $c - 1; $i++) { 2633 // XXX this is not correct on some versions of HTML Purifier 2634 if ($trace[$i + 1]['class'] === 'HTMLPurifier_Config') { 2635 continue; 2636 } 2637 $frame = $trace[$i]; 2638 $extra = " invoked on line {$frame['line']} in file {$frame['file']}"; 2639 break; 2640 } 2641 } 2642 trigger_error($msg . $extra, $no); 2643 } 2644 2645 /** 2646 * Returns a serialized form of the configuration object that can 2647 * be reconstituted. 2648 * 2649 * @return string 2650 */ 2651 public function serialize() 2652 { 2653 $this->getDefinition('HTML'); 2654 $this->getDefinition('CSS'); 2655 $this->getDefinition('URI'); 2656 return serialize($this); 2657 } 2658 2659} 2660 2661 2662 2663 2664 2665/** 2666 * Configuration definition, defines directives and their defaults. 2667 */ 2668class HTMLPurifier_ConfigSchema 2669{ 2670 /** 2671 * Defaults of the directives and namespaces. 2672 * @type array 2673 * @note This shares the exact same structure as HTMLPurifier_Config::$conf 2674 */ 2675 public $defaults = array(); 2676 2677 /** 2678 * The default property list. Do not edit this property list. 2679 * @type array 2680 */ 2681 public $defaultPlist; 2682 2683 /** 2684 * Definition of the directives. 
2685 * The structure of this is: 2686 * 2687 * array( 2688 * 'Namespace' => array( 2689 * 'Directive' => new stdClass(), 2690 * ) 2691 * ) 2692 * 2693 * The stdClass may have the following properties: 2694 * 2695 * - If isAlias isn't set: 2696 * - type: Integer type of directive, see HTMLPurifier_VarParser for definitions 2697 * - allow_null: If set, this directive allows null values 2698 * - aliases: If set, an associative array of value aliases to real values 2699 * - allowed: If set, a lookup array of allowed (string) values 2700 * - If isAlias is set: 2701 * - namespace: Namespace this directive aliases to 2702 * - name: Directive name this directive aliases to 2703 * 2704 * In certain degenerate cases, stdClass will actually be an integer. In 2705 * that case, the value is equivalent to an stdClass with the type 2706 * property set to the integer. If the integer is negative, type is 2707 * equal to the absolute value of integer, and allow_null is true. 2708 * 2709 * This class is friendly with HTMLPurifier_Config. If you need introspection 2710 * about the schema, you're better of using the ConfigSchema_Interchange, 2711 * which uses more memory but has much richer information. 2712 * @type array 2713 */ 2714 public $info = array(); 2715 2716 /** 2717 * Application-wide singleton 2718 * @type HTMLPurifier_ConfigSchema 2719 */ 2720 protected static $singleton; 2721 2722 public function __construct() 2723 { 2724 $this->defaultPlist = new HTMLPurifier_PropertyList(); 2725 } 2726 2727 /** 2728 * Unserializes the default ConfigSchema. 2729 * @return HTMLPurifier_ConfigSchema 2730 */ 2731 public static function makeFromSerial() 2732 { 2733 $contents = file_get_contents(HTMLPURIFIER_PREFIX . 
'/HTMLPurifier/ConfigSchema/schema.ser'); 2734 $r = unserialize($contents); 2735 if (!$r) { 2736 $hash = sha1($contents); 2737 trigger_error("Unserialization of configuration schema failed, sha1 of file was $hash", E_USER_ERROR); 2738 } 2739 return $r; 2740 } 2741 2742 /** 2743 * Retrieves an instance of the application-wide configuration definition. 2744 * @param HTMLPurifier_ConfigSchema $prototype 2745 * @return HTMLPurifier_ConfigSchema 2746 */ 2747 public static function instance($prototype = null) 2748 { 2749 if ($prototype !== null) { 2750 HTMLPurifier_ConfigSchema::$singleton = $prototype; 2751 } elseif (HTMLPurifier_ConfigSchema::$singleton === null || $prototype === true) { 2752 HTMLPurifier_ConfigSchema::$singleton = HTMLPurifier_ConfigSchema::makeFromSerial(); 2753 } 2754 return HTMLPurifier_ConfigSchema::$singleton; 2755 } 2756 2757 /** 2758 * Defines a directive for configuration 2759 * @warning Will fail of directive's namespace is defined. 2760 * @warning This method's signature is slightly different from the legacy 2761 * define() static method! Beware! 2762 * @param string $key Name of directive 2763 * @param mixed $default Default value of directive 2764 * @param string $type Allowed type of the directive. See 2765 * HTMLPurifier_DirectiveDef::$type for allowed values 2766 * @param bool $allow_null Whether or not to allow null values 2767 */ 2768 public function add($key, $default, $type, $allow_null) 2769 { 2770 $obj = new stdClass(); 2771 $obj->type = is_int($type) ? $type : HTMLPurifier_VarParser::$types[$type]; 2772 if ($allow_null) { 2773 $obj->allow_null = true; 2774 } 2775 $this->info[$key] = $obj; 2776 $this->defaults[$key] = $default; 2777 $this->defaultPlist->set($key, $default); 2778 } 2779 2780 /** 2781 * Defines a directive value alias. 2782 * 2783 * Directive value aliases are convenient for developers because it lets 2784 * them set a directive to several values and get the same result. 
2785 * @param string $key Name of Directive 2786 * @param array $aliases Hash of aliased values to the real alias 2787 */ 2788 public function addValueAliases($key, $aliases) 2789 { 2790 if (!isset($this->info[$key]->aliases)) { 2791 $this->info[$key]->aliases = array(); 2792 } 2793 foreach ($aliases as $alias => $real) { 2794 $this->info[$key]->aliases[$alias] = $real; 2795 } 2796 } 2797 2798 /** 2799 * Defines a set of allowed values for a directive. 2800 * @warning This is slightly different from the corresponding static 2801 * method definition. 2802 * @param string $key Name of directive 2803 * @param array $allowed Lookup array of allowed values 2804 */ 2805 public function addAllowedValues($key, $allowed) 2806 { 2807 $this->info[$key]->allowed = $allowed; 2808 } 2809 2810 /** 2811 * Defines a directive alias for backwards compatibility 2812 * @param string $key Directive that will be aliased 2813 * @param string $new_key Directive that the alias will be to 2814 */ 2815 public function addAlias($key, $new_key) 2816 { 2817 $obj = new stdClass; 2818 $obj->key = $new_key; 2819 $obj->isAlias = true; 2820 $this->info[$key] = $obj; 2821 } 2822 2823 /** 2824 * Replaces any stdClass that only has the type property with type integer. 2825 */ 2826 public function postProcess() 2827 { 2828 foreach ($this->info as $key => $v) { 2829 if (count((array) $v) == 1) { 2830 $this->info[$key] = $v->type; 2831 } elseif (count((array) $v) == 2 && isset($v->allow_null)) { 2832 $this->info[$key] = -$v->type; 2833 } 2834 } 2835 } 2836} 2837 2838 2839 2840 2841 2842/** 2843 * @todo Unit test 2844 */ 2845class HTMLPurifier_ContentSets 2846{ 2847 2848 /** 2849 * List of content set strings (pipe separators) indexed by name. 2850 * @type array 2851 */ 2852 public $info = array(); 2853 2854 /** 2855 * List of content set lookups (element => true) indexed by name. 
2856 * @type array 2857 * @note This is in HTMLPurifier_HTMLDefinition->info_content_sets 2858 */ 2859 public $lookup = array(); 2860 2861 /** 2862 * Synchronized list of defined content sets (keys of info). 2863 * @type array 2864 */ 2865 protected $keys = array(); 2866 /** 2867 * Synchronized list of defined content values (values of info). 2868 * @type array 2869 */ 2870 protected $values = array(); 2871 2872 /** 2873 * Merges in module's content sets, expands identifiers in the content 2874 * sets and populates the keys, values and lookup member variables. 2875 * @param HTMLPurifier_HTMLModule[] $modules List of HTMLPurifier_HTMLModule 2876 */ 2877 public function __construct($modules) 2878 { 2879 if (!is_array($modules)) { 2880 $modules = array($modules); 2881 } 2882 // populate content_sets based on module hints 2883 // sorry, no way of overloading 2884 foreach ($modules as $module) { 2885 foreach ($module->content_sets as $key => $value) { 2886 $temp = $this->convertToLookup($value); 2887 if (isset($this->lookup[$key])) { 2888 // add it into the existing content set 2889 $this->lookup[$key] = array_merge($this->lookup[$key], $temp); 2890 } else { 2891 $this->lookup[$key] = $temp; 2892 } 2893 } 2894 } 2895 $old_lookup = false; 2896 while ($old_lookup !== $this->lookup) { 2897 $old_lookup = $this->lookup; 2898 foreach ($this->lookup as $i => $set) { 2899 $add = array(); 2900 foreach ($set as $element => $x) { 2901 if (isset($this->lookup[$element])) { 2902 $add += $this->lookup[$element]; 2903 unset($this->lookup[$i][$element]); 2904 } 2905 } 2906 $this->lookup[$i] += $add; 2907 } 2908 } 2909 2910 foreach ($this->lookup as $key => $lookup) { 2911 $this->info[$key] = implode(' | ', array_keys($lookup)); 2912 } 2913 $this->keys = array_keys($this->info); 2914 $this->values = array_values($this->info); 2915 } 2916 2917 /** 2918 * Accepts a definition; generates and assigns a ChildDef for it 2919 * @param HTMLPurifier_ElementDef $def HTMLPurifier_ElementDef 
reference 2920 * @param HTMLPurifier_HTMLModule $module Module that defined the ElementDef 2921 */ 2922 public function generateChildDef(&$def, $module) 2923 { 2924 if (!empty($def->child)) { // already done! 2925 return; 2926 } 2927 $content_model = $def->content_model; 2928 if (is_string($content_model)) { 2929 // Assume that $this->keys is alphanumeric 2930 $def->content_model = preg_replace_callback( 2931 '/\b(' . implode('|', $this->keys) . ')\b/', 2932 array($this, 'generateChildDefCallback'), 2933 $content_model 2934 ); 2935 //$def->content_model = str_replace( 2936 // $this->keys, $this->values, $content_model); 2937 } 2938 $def->child = $this->getChildDef($def, $module); 2939 } 2940 2941 public function generateChildDefCallback($matches) 2942 { 2943 return $this->info[$matches[0]]; 2944 } 2945 2946 /** 2947 * Instantiates a ChildDef based on content_model and content_model_type 2948 * member variables in HTMLPurifier_ElementDef 2949 * @note This will also defer to modules for custom HTMLPurifier_ChildDef 2950 * subclasses that need content set expansion 2951 * @param HTMLPurifier_ElementDef $def HTMLPurifier_ElementDef to have ChildDef extracted 2952 * @param HTMLPurifier_HTMLModule $module Module that defined the ElementDef 2953 * @return HTMLPurifier_ChildDef corresponding to ElementDef 2954 */ 2955 public function getChildDef($def, $module) 2956 { 2957 $value = $def->content_model; 2958 if (is_object($value)) { 2959 trigger_error( 2960 'Literal object child definitions should be stored in '. 
/**
 * Registry object that contains information about the current context.
 * @warning get() answers null for a name that is not registered, so a
 *          variable registered with the value null cannot be told apart
 *          from a missing one through get() alone. Use false instead.
 * @note Since the variables Context deals with may not be objects,
 *       references are very important here! Do not remove!
 */
class HTMLPurifier_Context
{

    /**
     * Private array that stores the references, indexed by name.
     * @type array
     */
    private $_storage = array();

    /**
     * Registers a variable into the context.
     * Raises E_USER_ERROR when the name is already taken.
     * @param string $name String name
     * @param mixed $ref Reference to variable to be registered
     */
    public function register($name, &$ref)
    {
        if (!array_key_exists($name, $this->_storage)) {
            $this->_storage[$name] =& $ref;
            return;
        }
        trigger_error(
            "Name $name produces collision, cannot re-register",
            E_USER_ERROR
        );
    }

    /**
     * Retrieves a variable reference from the context.
     * @param string $name String name
     * @param bool $ignore_error Boolean whether or not to ignore error
     * @return mixed Reference to the stored variable, or null when the
     *         name is not registered
     */
    public function &get($name, $ignore_error = false)
    {
        if (array_key_exists($name, $this->_storage)) {
            return $this->_storage[$name];
        }
        if (!$ignore_error) {
            trigger_error(
                "Attempted to retrieve non-existent variable $name",
                E_USER_ERROR
            );
        }
        $missing = null; // so we can return by reference
        return $missing;
    }

    /**
     * Destroys a variable in the context.
     * Raises E_USER_ERROR when the name is not registered.
     * @param string $name String name
     */
    public function destroy($name)
    {
        if (array_key_exists($name, $this->_storage)) {
            unset($this->_storage[$name]);
            return;
        }
        trigger_error(
            "Attempted to destroy non-existent variable $name",
            E_USER_ERROR
        );
    }

    /**
     * Checks whether or not the variable exists.
     * @param string $name String name
     * @return bool
     */
    public function exists($name)
    {
        $registered = array_key_exists($name, $this->_storage);
        return $registered;
    }

    /**
     * Loads a series of variables from an associative array
     * @param array $context_array Assoc array of variables to load
     */
    public function loadArray($context_array)
    {
        foreach ($context_array as $name => $unused) {
            // pass the array slot itself so it is stored by reference
            $this->register($name, $context_array[$name]);
        }
    }
}
3148 * @param string $key Key to test 3149 * @param HTMLPurifier_Config $config Instance of HTMLPurifier_Config to test against 3150 * @return bool 3151 */ 3152 public function isOld($key, $config) 3153 { 3154 if (substr_count($key, ',') < 2) { 3155 return true; 3156 } 3157 list($version, $hash, $revision) = explode(',', $key, 3); 3158 $compare = version_compare($version, $config->version); 3159 // version mismatch, is always old 3160 if ($compare != 0) { 3161 return true; 3162 } 3163 // versions match, ids match, check revision number 3164 if ($hash == $config->getBatchSerial($this->type) && 3165 $revision < $config->get($this->type . '.DefinitionRev')) { 3166 return true; 3167 } 3168 return false; 3169 } 3170 3171 /** 3172 * Checks if a definition's type jives with the cache's type 3173 * @note Throws an error on failure 3174 * @param HTMLPurifier_Definition $def Definition object to check 3175 * @return bool true if good, false if not 3176 */ 3177 public function checkDefType($def) 3178 { 3179 if ($def->type !== $this->type) { 3180 trigger_error("Cannot use definition of type {$def->type} in cache for {$this->type}"); 3181 return false; 3182 } 3183 return true; 3184 } 3185 3186 /** 3187 * Adds a definition object to the cache 3188 * @param HTMLPurifier_Definition $def 3189 * @param HTMLPurifier_Config $config 3190 */ 3191 abstract public function add($def, $config); 3192 3193 /** 3194 * Unconditionally saves a definition object to the cache 3195 * @param HTMLPurifier_Definition $def 3196 * @param HTMLPurifier_Config $config 3197 */ 3198 abstract public function set($def, $config); 3199 3200 /** 3201 * Replace an object in the cache 3202 * @param HTMLPurifier_Definition $def 3203 * @param HTMLPurifier_Config $config 3204 */ 3205 abstract public function replace($def, $config); 3206 3207 /** 3208 * Retrieves a definition object from the cache 3209 * @param HTMLPurifier_Config $config 3210 */ 3211 abstract public function get($config); 3212 3213 /** 3214 * Removes 
a definition object to the cache 3215 * @param HTMLPurifier_Config $config 3216 */ 3217 abstract public function remove($config); 3218 3219 /** 3220 * Clears all objects from cache 3221 * @param HTMLPurifier_Config $config 3222 */ 3223 abstract public function flush($config); 3224 3225 /** 3226 * Clears all expired (older version or revision) objects from cache 3227 * @note Be careful implementing this method as flush. Flush must 3228 * not interfere with other Definition types, and cleanup() 3229 * should not be repeatedly called by userland code. 3230 * @param HTMLPurifier_Config $config 3231 */ 3232 abstract public function cleanup($config); 3233} 3234 3235 3236 3237 3238 3239/** 3240 * Responsible for creating definition caches. 3241 */ 3242class HTMLPurifier_DefinitionCacheFactory 3243{ 3244 /** 3245 * @type array 3246 */ 3247 protected $caches = array('Serializer' => array()); 3248 3249 /** 3250 * @type array 3251 */ 3252 protected $implementations = array(); 3253 3254 /** 3255 * @type HTMLPurifier_DefinitionCache_Decorator[] 3256 */ 3257 protected $decorators = array(); 3258 3259 /** 3260 * Initialize default decorators 3261 */ 3262 public function setup() 3263 { 3264 $this->addDecorator('Cleanup'); 3265 } 3266 3267 /** 3268 * Retrieves an instance of global definition cache factory. 
3269 * @param HTMLPurifier_DefinitionCacheFactory $prototype 3270 * @return HTMLPurifier_DefinitionCacheFactory 3271 */ 3272 public static function instance($prototype = null) 3273 { 3274 static $instance; 3275 if ($prototype !== null) { 3276 $instance = $prototype; 3277 } elseif ($instance === null || $prototype === true) { 3278 $instance = new HTMLPurifier_DefinitionCacheFactory(); 3279 $instance->setup(); 3280 } 3281 return $instance; 3282 } 3283 3284 /** 3285 * Registers a new definition cache object 3286 * @param string $short Short name of cache object, for reference 3287 * @param string $long Full class name of cache object, for construction 3288 */ 3289 public function register($short, $long) 3290 { 3291 $this->implementations[$short] = $long; 3292 } 3293 3294 /** 3295 * Factory method that creates a cache object based on configuration 3296 * @param string $type Name of definitions handled by cache 3297 * @param HTMLPurifier_Config $config Config instance 3298 * @return mixed 3299 */ 3300 public function create($type, $config) 3301 { 3302 $method = $config->get('Cache.DefinitionImpl'); 3303 if ($method === null) { 3304 return new HTMLPurifier_DefinitionCache_Null($type); 3305 } 3306 if (!empty($this->caches[$method][$type])) { 3307 return $this->caches[$method][$type]; 3308 } 3309 if (isset($this->implementations[$method]) && 3310 class_exists($class = $this->implementations[$method], false)) { 3311 $cache = new $class($type); 3312 } else { 3313 if ($method != 'Serializer') { 3314 trigger_error("Unrecognized DefinitionCache $method, using Serializer instead", E_USER_WARNING); 3315 } 3316 $cache = new HTMLPurifier_DefinitionCache_Serializer($type); 3317 } 3318 foreach ($this->decorators as $decorator) { 3319 $new_cache = $decorator->decorate($cache); 3320 // prevent infinite recursion in PHP 4 3321 unset($cache); 3322 $cache = $new_cache; 3323 } 3324 $this->caches[$method][$type] = $cache; 3325 return $this->caches[$method][$type]; 3326 } 3327 3328 /** 3329 
/**
 * Adds a decorator to the set kept by this factory, keyed by its name.
 *
 * @param HTMLPurifier_DefinitionCache_Decorator|string $decorator Either a
 *        ready-made decorator object, or the suffix of a class named
 *        HTMLPurifier_DefinitionCache_Decorator_<suffix>, which is then
 *        instantiated without arguments.
 */
public function addDecorator($decorator)
{
    $instance = $decorator;
    if (is_string($instance)) {
        // string shorthand: expand to the full class name and build it
        $decoratorClass = 'HTMLPurifier_DefinitionCache_Decorator_' . $instance;
        $instance = new $decoratorClass();
    }
    // a later decorator with the same name replaces the earlier one
    $this->decorators[$instance->name] = $instance;
}
/**
 * Plain value holder describing one document type and the modules that
 * have to be loaded for it.
 * @note This class is inspected by Printer_HTMLDefinition->renderDoctype.
 *       If structure changes, please update that function.
 */
class HTMLPurifier_Doctype
{
    /**
     * Full name of the doctype.
     * @type string
     */
    public $name;

    /**
     * Standard modules used by this doctype, given either as string
     * identifiers or as literal module objects.
     * @type array
     */
    public $modules = array();

    /**
     * Modules used for tidying up code.
     * @type array
     */
    public $tidyModules = array();

    /**
     * Whether the language is XML-derived (i.e. XHTML).
     * @type bool
     */
    public $xml = true;

    /**
     * Alternative names this doctype can be referred to by.
     * @type array
     */
    public $aliases = array();

    /**
     * Public DTD identifier.
     * @type string
     */
    public $dtdPublic;

    /**
     * System DTD identifier.
     * @type string
     */
    public $dtdSystem;

    /**
     * Stores every argument verbatim on the matching public property.
     *
     * @param string $name Full name of the doctype
     * @param bool $xml Whether the doctype is XML-based
     * @param array $modules Standard modules
     * @param array $tidyModules Tidy modules
     * @param array $aliases Alias names
     * @param string $dtd_public Public DTD identifier
     * @param string $dtd_system System DTD identifier
     */
    public function __construct(
        $name = null,
        $xml = true,
        $modules = array(),
        $tidyModules = array(),
        $aliases = array(),
        $dtd_public = null,
        $dtd_system = null
    ) {
        // property name => constructor argument
        $fields = array(
            'name' => $name,
            'xml' => $xml,
            'modules' => $modules,
            'tidyModules' => $tidyModules,
            'aliases' => $aliases,
            'dtdPublic' => $dtd_public,
            'dtdSystem' => $dtd_system,
        );
        foreach ($fields as $property => $value) {
            $this->$property = $value;
        }
    }
}
/**
 * Keeps track of the known doctypes and of the alias names that point at
 * them, and picks a doctype for a given configuration.
 */
class HTMLPurifier_DoctypeRegistry
{

    /**
     * Doctype objects, indexed by doctype name.
     * @type array
     */
    protected $doctypes;

    /**
     * Alias name => real doctype name.
     * @type array
     */
    protected $aliases;

    /**
     * Adds a doctype to the registry.
     * @note Either pass a ready-made doctype object as the first argument
     *       (the remaining arguments are then not used to build anything),
     *       or pass the pieces needed to construct one.
     * @param string $doctype Name of doctype or literal doctype object
     * @param bool $xml
     * @param array $modules Modules doctype will load
     * @param array $tidy_modules Modules doctype will load for certain modes
     * @param array $aliases Alias names for doctype
     * @param string $dtd_public
     * @param string $dtd_system
     * @return HTMLPurifier_Doctype Editable registered doctype
     */
    public function register(
        $doctype,
        $xml = true,
        $modules = array(),
        $tidy_modules = array(),
        $aliases = array(),
        $dtd_public = null,
        $dtd_system = null
    ) {
        // a lone value is shorthand for a one-element list
        $modules = is_array($modules) ? $modules : array($modules);
        $tidy_modules = is_array($tidy_modules) ? $tidy_modules : array($tidy_modules);
        $aliases = is_array($aliases) ? $aliases : array($aliases);

        if (!is_object($doctype)) {
            $doctype = new HTMLPurifier_Doctype(
                $doctype,
                $xml,
                $modules,
                $tidy_modules,
                $aliases,
                $dtd_public,
                $dtd_system
            );
        }

        $name = $doctype->name;
        $this->doctypes[$name] = $doctype;

        // wire up the aliases, but never let one shadow a real doctype
        foreach ($doctype->aliases as $alias) {
            if (!isset($this->doctypes[$alias])) {
                $this->aliases[$alias] = $name;
            }
        }

        // the name now belongs to a real doctype, so drop any alias using it
        if (isset($this->aliases[$name])) {
            unset($this->aliases[$name]);
        }

        return $doctype;
    }

    /**
     * Looks up a doctype by name or alias.
     * @note When possible, use the more fully-featured make()
     * @note An unknown name raises an E_USER_ERROR; should execution
     *       continue past that, an otherwise empty doctype carrying the
     *       requested name is returned.
     * @param string $doctype Name of doctype
     * @return HTMLPurifier_Doctype Editable doctype object
     */
    public function get($doctype)
    {
        $resolved = isset($this->aliases[$doctype]) ? $this->aliases[$doctype] : $doctype;
        if (isset($this->doctypes[$resolved])) {
            return $this->doctypes[$resolved];
        }
        trigger_error('Doctype ' . htmlspecialchars($resolved) . ' does not exist', E_USER_ERROR);
        return new HTMLPurifier_Doctype($resolved);
    }

    /**
     * Produces a copy of the doctype selected by a configuration object.
     * @note The result is a clone, so the config can hold on to it without
     *       affecting the registered original (this is necessary in order
     *       to tell Generator whether or not the current document is XML
     *       based or not).
     * @param HTMLPurifier_Config $config
     * @return HTMLPurifier_Doctype
     */
    public function make($config)
    {
        $registered = $this->get($this->getDoctypeFromConfig($config));
        return clone $registered;
    }

    /**
     * Works out the doctype name a configuration object asks for.
     *
     * %HTML.Doctype wins, then %HTML.CustomDoctype; if both are empty the
     * name is assembled from the legacy %HTML.XHTML and %HTML.Strict flags.
     *
     * @param HTMLPurifier_Config $config
     * @return string
     */
    public function getDoctypeFromConfig($config)
    {
        // explicit directives, in order of precedence
        foreach (array('HTML.Doctype', 'HTML.CustomDoctype') as $directive) {
            $explicit = $config->get($directive);
            if (!empty($explicit)) {
                return $explicit;
            }
        }
        // backwards-compatibility
        $family = $config->get('HTML.XHTML') ? 'XHTML 1.0' : 'HTML 4.01';
        $flavour = $config->get('HTML.Strict') ? ' Strict' : ' Transitional';
        return $family . $flavour;
    }
}
/**
 * Holds the definition of a single HTML element. Used by
 * HTMLPurifier_HTMLDefinition and HTMLPurifier_HTMLModule.
 * @note This class is inspected by HTMLPurifier_Printer_HTMLDefinition.
 *       Please update that class too.
 * @warning If you add new properties to this class, you MUST update
 *          the mergeIn() method.
 */
class HTMLPurifier_ElementDef
{
    /**
     * True when the definition is usable on its own; false when it only
     * exists to be merged into another definition.
     * @type bool
     */
    public $standalone = true;

    /**
     * Map of attribute name to HTMLPurifier_AttrDef.
     * @type array
     * @note Until HTMLPurifier_AttrCollections has processed it (when
     *       modules are finalized during
     *       HTMLPurifier_HTMLDefinition->setup()), the array may carry, at
     *       index 0, a list naming the attribute collections to pull in.
     *       Values may also be string identifiers instead of
     *       HTMLPurifier_AttrDef objects; see HTMLPurifier_AttrTypes for
     *       how those are expanded during setup().
     */
    public $attr = array();

    // XXX: Design note: currently, it's not possible to override
    // previously defined AttrTransforms without messing around with
    // the final generated config. This is by design; a previous version
    // used an associated list of attr_transform, but it was extremely
    // easy to accidentally override other attribute transforms by
    // forgetting to specify an index (and just using 0.) While we
    // could check this by checking the index number and complaining,
    // there is a second problem which is that it is not at all easy to
    // tell when something is getting overridden. Combine this with a
    // codebase where this isn't really being used, and it's perfect for
    // nuking.

    /**
     * HTMLPurifier_AttrTransform objects to run before validation.
     * @type array
     */
    public $attr_transform_pre = array();

    /**
     * HTMLPurifier_AttrTransform objects to run after validation.
     * @type array
     */
    public $attr_transform_post = array();

    /**
     * The HTMLPurifier_ChildDef for this tag.
     * @type HTMLPurifier_ChildDef
     */
    public $child;

    /**
     * Textual form of the child rules.
     * @see HTMLPurifier_ContentSets for how this is parsed and then
     *      transformed into an HTMLPurifier_ChildDef.
     * @warning This is a temporary variable that is not available after
     *          being processed by HTMLDefinition
     * @type string
     */
    public $content_model;

    /**
     * Value of $child->type; together with $content_model it decides
     * which ChildDef gets used.
     * @warning This must be lowercase
     * @warning This is a temporary variable that is not available after
     *          being processed by HTMLDefinition
     * @type string
     */
    public $content_model_type;

    /**
     * Whether the content model is (#PCDATA | Inline)*. Matters for
     * chameleon ins and del processing in HTMLPurifier_ChildDef_Chameleon.
     * Dynamically set: modules don't have to worry about this one.
     * @type bool
     */
    public $descendants_are_inline = false;

    /**
     * Names of the attributes this element requires.
     * Dynamically populated by HTMLPurifier_HTMLDefinition::getElement()
     * @type array
     */
    public $required_attr = array();

    /**
     * Lookup table of tags that may not appear in any descendant.
     * @type array
     * @note SGML permits exclusions for all descendants, but this is
     *       not possible with DTDs or XML Schemas. W3C has elected to
     *       use complicated compositions of content_models to simulate
     *       exclusion for children, but we go the simpler, SGML-style
     *       route of flat-out exclusions, which correctly apply to
     *       all descendants and not just children. Note that the XHTML
     *       Modularization Abstract Modules are blithely unaware of such
     *       distinctions.
     */
    public $excludes = array();

    /**
     * Tags that explicitly auto-close this tag when they follow it.
     * @type array
     */
    public $autoclose = array();

    /**
     * If a foreign element is found in this element, test if it is
     * allowed by this sub-element; if it is, instead of closing the
     * current element, place it inside this element.
     * @type string
     */
    public $wrap;

    /**
     * Whether this is a formatting element subject to the
     * "Active Formatting Elements" algorithm.
     * @type bool
     */
    public $formatting;

    /**
     * Low-level factory for a new standalone element definition.
     *
     * @param string $content_model Abstract child rules
     * @param string $content_model_type ChildDef type selector
     * @param array $attr Attribute map
     * @return HTMLPurifier_ElementDef
     */
    public static function create($content_model, $content_model_type, $attr)
    {
        $element = new HTMLPurifier_ElementDef();
        $element->attr = $attr;
        $element->content_model_type = $content_model_type;
        $element->content_model = $content_model;
        return $element;
    }

    /**
     * Folds another element definition into this one. Where a value
     * cannot be combined, the incoming definition wins.
     * @param HTMLPurifier_ElementDef $def
     */
    public function mergeIn($def)
    {
        // attributes: later keys take precedence
        foreach ($def->attr as $name => $spec) {
            if ($name === 0) {
                // collection includes are only ever appended;
                // there is no way to override an include
                foreach ($spec as $collection) {
                    $this->attr[0][] = $collection;
                }
            } elseif ($spec === false) {
                // false is the marker for "remove this attribute"
                if (isset($this->attr[$name])) {
                    unset($this->attr[$name]);
                }
            } else {
                $this->attr[$name] = $spec;
            }
        }

        $this->_mergeAssocArray($this->excludes, $def->excludes);

        foreach (array('attr_transform_pre', 'attr_transform_post') as $list) {
            $this->$list = array_merge($this->$list, $def->$list);
        }

        // Either content-model field invalidates the current child def...
        if (!empty($def->content_model)) {
            // #SUPER stands for the content model being extended
            $this->content_model = str_replace('#SUPER', $this->content_model, $def->content_model);
            $this->child = false;
        }
        if (!empty($def->content_model_type)) {
            $this->content_model_type = $def->content_model_type;
            $this->child = false;
        }
        // ...unless the incoming definition brings an explicit one.
        if ($def->child !== null) {
            $this->child = $def->child;
        }

        if ($def->formatting !== null) {
            $this->formatting = $def->formatting;
        }
        if ($def->descendants_are_inline) {
            $this->descendants_are_inline = $def->descendants_are_inline;
        }
    }

    /**
     * Copies the entries of one associative array into another; an entry
     * whose value is exactly false deletes the key instead.
     * @param $a1 Array by reference that is merged into
     * @param $a2 Array that merges into $a1
     */
    private function _mergeAssocArray(&$a1, $a2)
    {
        foreach ($a2 as $key => $value) {
            if ($value !== false) {
                $a1[$key] = $value;
            } elseif (isset($a1[$key])) {
                unset($a1[$key]);
            }
        }
    }
}
3783 */ 3784class HTMLPurifier_Encoder 3785{ 3786 3787 /** 3788 * Constructor throws fatal error if you attempt to instantiate class 3789 */ 3790 private function __construct() 3791 { 3792 trigger_error('Cannot instantiate encoder, call methods statically', E_USER_ERROR); 3793 } 3794 3795 /** 3796 * Error-handler that mutes errors, alternative to shut-up operator. 3797 */ 3798 public static function muteErrorHandler() 3799 { 3800 } 3801 3802 /** 3803 * iconv wrapper which mutes errors, but doesn't work around bugs. 3804 * @param string $in Input encoding 3805 * @param string $out Output encoding 3806 * @param string $text The text to convert 3807 * @return string 3808 */ 3809 public static function unsafeIconv($in, $out, $text) 3810 { 3811 set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler')); 3812 $r = iconv($in, $out, $text); 3813 restore_error_handler(); 3814 return $r; 3815 } 3816 3817 /** 3818 * iconv wrapper which mutes errors and works around bugs. 3819 * @param string $in Input encoding 3820 * @param string $out Output encoding 3821 * @param string $text The text to convert 3822 * @param int $max_chunk_size 3823 * @return string 3824 */ 3825 public static function iconv($in, $out, $text, $max_chunk_size = 8000) 3826 { 3827 $code = self::testIconvTruncateBug(); 3828 if ($code == self::ICONV_OK) { 3829 return self::unsafeIconv($in, $out, $text); 3830 } elseif ($code == self::ICONV_TRUNCATES) { 3831 // we can only work around this if the input character set 3832 // is utf-8 3833 if ($in == 'utf-8') { 3834 if ($max_chunk_size < 4) { 3835 trigger_error('max_chunk_size is too small', E_USER_WARNING); 3836 return false; 3837 } 3838 // split into 8000 byte chunks, but be careful to handle 3839 // multibyte boundaries properly 3840 if (($c = strlen($text)) <= $max_chunk_size) { 3841 return self::unsafeIconv($in, $out, $text); 3842 } 3843 $r = ''; 3844 $i = 0; 3845 while (true) { 3846 if ($i + $max_chunk_size >= $c) { 3847 $r .= self::unsafeIconv($in, 
$out, substr($text, $i)); 3848 break; 3849 } 3850 // wibble the boundary 3851 if (0x80 != (0xC0 & ord($text[$i + $max_chunk_size]))) { 3852 $chunk_size = $max_chunk_size; 3853 } elseif (0x80 != (0xC0 & ord($text[$i + $max_chunk_size - 1]))) { 3854 $chunk_size = $max_chunk_size - 1; 3855 } elseif (0x80 != (0xC0 & ord($text[$i + $max_chunk_size - 2]))) { 3856 $chunk_size = $max_chunk_size - 2; 3857 } elseif (0x80 != (0xC0 & ord($text[$i + $max_chunk_size - 3]))) { 3858 $chunk_size = $max_chunk_size - 3; 3859 } else { 3860 return false; // rather confusing UTF-8... 3861 } 3862 $chunk = substr($text, $i, $chunk_size); // substr doesn't mind overlong lengths 3863 $r .= self::unsafeIconv($in, $out, $chunk); 3864 $i += $chunk_size; 3865 } 3866 return $r; 3867 } else { 3868 return false; 3869 } 3870 } else { 3871 return false; 3872 } 3873 } 3874 3875 /** 3876 * Cleans a UTF-8 string for well-formedness and SGML validity 3877 * 3878 * It will parse according to UTF-8 and return a valid UTF8 string, with 3879 * non-SGML codepoints excluded. 3880 * 3881 * Specifically, it will permit: 3882 * \x{9}\x{A}\x{D}\x{20}-\x{7E}\x{A0}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF} 3883 * Source: https://www.w3.org/TR/REC-xml/#NT-Char 3884 * Arguably this function should be modernized to the HTML5 set 3885 * of allowed characters: 3886 * https://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream 3887 * which simultaneously expand and restrict the set of allowed characters. 3888 * 3889 * @param string $str The string to clean 3890 * @param bool $force_php 3891 * @return string 3892 * 3893 * @note Just for reference, the non-SGML code points are 0 to 31 and 3894 * 127 to 159, inclusive. However, we allow code points 9, 10 3895 * and 13, which are the tab, line feed and carriage return 3896 * respectively. 128 and above the code points map to multibyte 3897 * UTF-8 representations. 
3898 * 3899 * @note Fallback code adapted from utf8ToUnicode by Henri Sivonen and 3900 * hsivonen@iki.fi at <http://iki.fi/hsivonen/php-utf8/> under the 3901 * LGPL license. Notes on what changed are inside, but in general, 3902 * the original code transformed UTF-8 text into an array of integer 3903 * Unicode codepoints. Understandably, transforming that back to 3904 * a string would be somewhat expensive, so the function was modded to 3905 * directly operate on the string. However, this discourages code 3906 * reuse, and the logic enumerated here would be useful for any 3907 * function that needs to be able to understand UTF-8 characters. 3908 * As of right now, only smart lossless character encoding converters 3909 * would need that, and I'm probably not going to implement them. 3910 */ 3911 public static function cleanUTF8($str, $force_php = false) 3912 { 3913 // UTF-8 validity is checked since PHP 4.3.5 3914 // This is an optimization: if the string is already valid UTF-8, no 3915 // need to do PHP stuff. 99% of the time, this will be the case. 3916 if (preg_match( 3917 '/^[\x{9}\x{A}\x{D}\x{20}-\x{7E}\x{A0}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]*$/Du', 3918 $str 3919 )) { 3920 return $str; 3921 } 3922 3923 $mState = 0; // cached expected number of octets after the current octet 3924 // until the beginning of the next UTF8 character sequence 3925 $mUcs4 = 0; // cached Unicode character 3926 $mBytes = 1; // cached expected number of octets in the current sequence 3927 3928 // original code involved an $out that was an array of Unicode 3929 // codepoints. Instead of having to convert back into UTF-8, we've 3930 // decided to directly append valid UTF-8 characters onto a string 3931 // $out once they're done. $char accumulates raw bytes, while $mUcs4 3932 // turns into the Unicode code point, so there's some redundancy. 
3933 3934 $out = ''; 3935 $char = ''; 3936 3937 $len = strlen($str); 3938 for ($i = 0; $i < $len; $i++) { 3939 $in = ord($str[$i]); 3940 $char .= $str[$i]; // append byte to char 3941 if (0 == $mState) { 3942 // When mState is zero we expect either a US-ASCII character 3943 // or a multi-octet sequence. 3944 if (0 == (0x80 & ($in))) { 3945 // US-ASCII, pass straight through. 3946 if (($in <= 31 || $in == 127) && 3947 !($in == 9 || $in == 13 || $in == 10) // save \r\t\n 3948 ) { 3949 // control characters, remove 3950 } else { 3951 $out .= $char; 3952 } 3953 // reset 3954 $char = ''; 3955 $mBytes = 1; 3956 } elseif (0xC0 == (0xE0 & ($in))) { 3957 // First octet of 2 octet sequence 3958 $mUcs4 = ($in); 3959 $mUcs4 = ($mUcs4 & 0x1F) << 6; 3960 $mState = 1; 3961 $mBytes = 2; 3962 } elseif (0xE0 == (0xF0 & ($in))) { 3963 // First octet of 3 octet sequence 3964 $mUcs4 = ($in); 3965 $mUcs4 = ($mUcs4 & 0x0F) << 12; 3966 $mState = 2; 3967 $mBytes = 3; 3968 } elseif (0xF0 == (0xF8 & ($in))) { 3969 // First octet of 4 octet sequence 3970 $mUcs4 = ($in); 3971 $mUcs4 = ($mUcs4 & 0x07) << 18; 3972 $mState = 3; 3973 $mBytes = 4; 3974 } elseif (0xF8 == (0xFC & ($in))) { 3975 // First octet of 5 octet sequence. 3976 // 3977 // This is illegal because the encoded codepoint must be 3978 // either: 3979 // (a) not the shortest form or 3980 // (b) outside the Unicode range of 0-0x10FFFF. 3981 // Rather than trying to resynchronize, we will carry on 3982 // until the end of the sequence and let the later error 3983 // handling code catch it. 3984 $mUcs4 = ($in); 3985 $mUcs4 = ($mUcs4 & 0x03) << 24; 3986 $mState = 4; 3987 $mBytes = 5; 3988 } elseif (0xFC == (0xFE & ($in))) { 3989 // First octet of 6 octet sequence, see comments for 5 3990 // octet sequence. 3991 $mUcs4 = ($in); 3992 $mUcs4 = ($mUcs4 & 1) << 30; 3993 $mState = 5; 3994 $mBytes = 6; 3995 } else { 3996 // Current octet is neither in the US-ASCII range nor a 3997 // legal first octet of a multi-octet sequence. 
3998 $mState = 0; 3999 $mUcs4 = 0; 4000 $mBytes = 1; 4001 $char = ''; 4002 } 4003 } else { 4004 // When mState is non-zero, we expect a continuation of the 4005 // multi-octet sequence 4006 if (0x80 == (0xC0 & ($in))) { 4007 // Legal continuation. 4008 $shift = ($mState - 1) * 6; 4009 $tmp = $in; 4010 $tmp = ($tmp & 0x0000003F) << $shift; 4011 $mUcs4 |= $tmp; 4012 4013 if (0 == --$mState) { 4014 // End of the multi-octet sequence. mUcs4 now contains 4015 // the final Unicode codepoint to be output 4016 4017 // Check for illegal sequences and codepoints. 4018 4019 // From Unicode 3.1, non-shortest form is illegal 4020 if (((2 == $mBytes) && ($mUcs4 < 0x0080)) || 4021 ((3 == $mBytes) && ($mUcs4 < 0x0800)) || 4022 ((4 == $mBytes) && ($mUcs4 < 0x10000)) || 4023 (4 < $mBytes) || 4024 // From Unicode 3.2, surrogate characters = illegal 4025 (($mUcs4 & 0xFFFFF800) == 0xD800) || 4026 // Codepoints outside the Unicode range are illegal 4027 ($mUcs4 > 0x10FFFF) 4028 ) { 4029 4030 } elseif (0xFEFF != $mUcs4 && // omit BOM 4031 // check for valid Char unicode codepoints 4032 ( 4033 0x9 == $mUcs4 || 4034 0xA == $mUcs4 || 4035 0xD == $mUcs4 || 4036 (0x20 <= $mUcs4 && 0x7E >= $mUcs4) || 4037 // 7F-9F is not strictly prohibited by XML, 4038 // but it is non-SGML, and thus we don't allow it 4039 (0xA0 <= $mUcs4 && 0xD7FF >= $mUcs4) || 4040 (0xE000 <= $mUcs4 && 0xFFFD >= $mUcs4) || 4041 (0x10000 <= $mUcs4 && 0x10FFFF >= $mUcs4) 4042 ) 4043 ) { 4044 $out .= $char; 4045 } 4046 // initialize UTF8 cache (reset) 4047 $mState = 0; 4048 $mUcs4 = 0; 4049 $mBytes = 1; 4050 $char = ''; 4051 } 4052 } else { 4053 // ((0xC0 & (*in) != 0x80) && (mState != 0)) 4054 // Incomplete multi-octet sequence. 4055 // used to result in complete fail, but we'll reset 4056 $mState = 0; 4057 $mUcs4 = 0; 4058 $mBytes = 1; 4059 $char =''; 4060 } 4061 } 4062 } 4063 return $out; 4064 } 4065 4066 /** 4067 * Translates a Unicode codepoint into its corresponding UTF-8 character. 
4068 * @note Based on Feyd's function at 4069 * <http://forums.devnetwork.net/viewtopic.php?p=191404#191404>, 4070 * which is in public domain. 4071 * @note While we're going to do code point parsing anyway, a good 4072 * optimization would be to refuse to translate code points that 4073 * are non-SGML characters. However, this could lead to duplication. 4074 * @note This is very similar to the unichr function in 4075 * maintenance/generate-entity-file.php (although this is superior, 4076 * due to its sanity checks). 4077 */ 4078 4079 // +----------+----------+----------+----------+ 4080 // | 33222222 | 22221111 | 111111 | | 4081 // | 10987654 | 32109876 | 54321098 | 76543210 | bit 4082 // +----------+----------+----------+----------+ 4083 // | | | | 0xxxxxxx | 1 byte 0x00000000..0x0000007F 4084 // | | | 110yyyyy | 10xxxxxx | 2 byte 0x00000080..0x000007FF 4085 // | | 1110zzzz | 10yyyyyy | 10xxxxxx | 3 byte 0x00000800..0x0000FFFF 4086 // | 11110www | 10wwzzzz | 10yyyyyy | 10xxxxxx | 4 byte 0x00010000..0x0010FFFF 4087 // +----------+----------+----------+----------+ 4088 // | 00000000 | 00011111 | 11111111 | 11111111 | Theoretical upper limit of legal scalars: 2097151 (0x001FFFFF) 4089 // | 00000000 | 00010000 | 11111111 | 11111111 | Defined upper limit of legal scalar codes 4090 // +----------+----------+----------+----------+ 4091 4092 public static function unichr($code) 4093 { 4094 if ($code > 1114111 or $code < 0 or 4095 ($code >= 55296 and $code <= 57343) ) { 4096 // bits are set outside the "valid" range as defined 4097 // by UNICODE 4.1.0 4098 return ''; 4099 } 4100 4101 $x = $y = $z = $w = 0; 4102 if ($code < 128) { 4103 // regular ASCII character 4104 $x = $code; 4105 } else { 4106 // set up bits for UTF-8 4107 $x = ($code & 63) | 128; 4108 if ($code < 2048) { 4109 $y = (($code & 2047) >> 6) | 192; 4110 } else { 4111 $y = (($code & 4032) >> 6) | 128; 4112 if ($code < 65536) { 4113 $z = (($code >> 12) & 15) | 224; 4114 } else { 4115 $z = (($code >> 12) & 63) 
| 128; 4116 $w = (($code >> 18) & 7) | 240; 4117 } 4118 } 4119 } 4120 // set up the actual character 4121 $ret = ''; 4122 if ($w) { 4123 $ret .= chr($w); 4124 } 4125 if ($z) { 4126 $ret .= chr($z); 4127 } 4128 if ($y) { 4129 $ret .= chr($y); 4130 } 4131 $ret .= chr($x); 4132 4133 return $ret; 4134 } 4135 4136 /** 4137 * @return bool 4138 */ 4139 public static function iconvAvailable() 4140 { 4141 static $iconv = null; 4142 if ($iconv === null) { 4143 $iconv = function_exists('iconv') && self::testIconvTruncateBug() != self::ICONV_UNUSABLE; 4144 } 4145 return $iconv; 4146 } 4147 4148 /** 4149 * Convert a string to UTF-8 based on configuration. 4150 * @param string $str The string to convert 4151 * @param HTMLPurifier_Config $config 4152 * @param HTMLPurifier_Context $context 4153 * @return string 4154 */ 4155 public static function convertToUTF8($str, $config, $context) 4156 { 4157 $encoding = $config->get('Core.Encoding'); 4158 if ($encoding === 'utf-8') { 4159 return $str; 4160 } 4161 static $iconv = null; 4162 if ($iconv === null) { 4163 $iconv = self::iconvAvailable(); 4164 } 4165 if ($iconv && !$config->get('Test.ForceNoIconv')) { 4166 // unaffected by bugs, since UTF-8 support all characters 4167 $str = self::unsafeIconv($encoding, 'utf-8//IGNORE', $str); 4168 if ($str === false) { 4169 // $encoding is not a valid encoding 4170 trigger_error('Invalid encoding ' . $encoding, E_USER_ERROR); 4171 return ''; 4172 } 4173 // If the string is bjorked by Shift_JIS or a similar encoding 4174 // that doesn't support all of ASCII, convert the naughty 4175 // characters to their true byte-wise ASCII/UTF-8 equivalents. 
4176 $str = strtr($str, self::testEncodingSupportsASCII($encoding)); 4177 return $str; 4178 } elseif ($encoding === 'iso-8859-1') { 4179 $str = utf8_encode($str); 4180 return $str; 4181 } 4182 $bug = HTMLPurifier_Encoder::testIconvTruncateBug(); 4183 if ($bug == self::ICONV_OK) { 4184 trigger_error('Encoding not supported, please install iconv', E_USER_ERROR); 4185 } else { 4186 trigger_error( 4187 'You have a buggy version of iconv, see https://bugs.php.net/bug.php?id=48147 ' . 4188 'and http://sourceware.org/bugzilla/show_bug.cgi?id=13541', 4189 E_USER_ERROR 4190 ); 4191 } 4192 } 4193 4194 /** 4195 * Converts a string from UTF-8 based on configuration. 4196 * @param string $str The string to convert 4197 * @param HTMLPurifier_Config $config 4198 * @param HTMLPurifier_Context $context 4199 * @return string 4200 * @note Currently, this is a lossy conversion, with unexpressable 4201 * characters being omitted. 4202 */ 4203 public static function convertFromUTF8($str, $config, $context) 4204 { 4205 $encoding = $config->get('Core.Encoding'); 4206 if ($escape = $config->get('Core.EscapeNonASCIICharacters')) { 4207 $str = self::convertToASCIIDumbLossless($str); 4208 } 4209 if ($encoding === 'utf-8') { 4210 return $str; 4211 } 4212 static $iconv = null; 4213 if ($iconv === null) { 4214 $iconv = self::iconvAvailable(); 4215 } 4216 if ($iconv && !$config->get('Test.ForceNoIconv')) { 4217 // Undo our previous fix in convertToUTF8, otherwise iconv will barf 4218 $ascii_fix = self::testEncodingSupportsASCII($encoding); 4219 if (!$escape && !empty($ascii_fix)) { 4220 $clear_fix = array(); 4221 foreach ($ascii_fix as $utf8 => $native) { 4222 $clear_fix[$utf8] = ''; 4223 } 4224 $str = strtr($str, $clear_fix); 4225 } 4226 $str = strtr($str, array_flip($ascii_fix)); 4227 // Normal stuff 4228 $str = self::iconv('utf-8', $encoding . 
'//IGNORE', $str); 4229 return $str; 4230 } elseif ($encoding === 'iso-8859-1') { 4231 $str = utf8_decode($str); 4232 return $str; 4233 } 4234 trigger_error('Encoding not supported', E_USER_ERROR); 4235 // You might be tempted to assume that the ASCII representation 4236 // might be OK, however, this is *not* universally true over all 4237 // encodings. So we take the conservative route here, rather 4238 // than forcibly turn on %Core.EscapeNonASCIICharacters 4239 } 4240 4241 /** 4242 * Lossless (character-wise) conversion of HTML to ASCII 4243 * @param string $str UTF-8 string to be converted to ASCII 4244 * @return string ASCII encoded string with non-ASCII character entity-ized 4245 * @warning Adapted from MediaWiki, claiming fair use: this is a common 4246 * algorithm. If you disagree with this license fudgery, 4247 * implement it yourself. 4248 * @note Uses decimal numeric entities since they are best supported. 4249 * @note This is a DUMB function: it has no concept of keeping 4250 * character entities that the projected character encoding 4251 * can allow. We could possibly implement a smart version 4252 * but that would require it to also know which Unicode 4253 * codepoints the charset supported (not an easy task). 4254 * @note Sort of with cleanUTF8() but it assumes that $str is 4255 * well-formed UTF-8 4256 */ 4257 public static function convertToASCIIDumbLossless($str) 4258 { 4259 $bytesleft = 0; 4260 $result = ''; 4261 $working = 0; 4262 $len = strlen($str); 4263 for ($i = 0; $i < $len; $i++) { 4264 $bytevalue = ord($str[$i]); 4265 if ($bytevalue <= 0x7F) { //0xxx xxxx 4266 $result .= chr($bytevalue); 4267 $bytesleft = 0; 4268 } elseif ($bytevalue <= 0xBF) { //10xx xxxx 4269 $working = $working << 6; 4270 $working += ($bytevalue & 0x3F); 4271 $bytesleft--; 4272 if ($bytesleft <= 0) { 4273 $result .= "&#" . $working . 
";"; 4274 } 4275 } elseif ($bytevalue <= 0xDF) { //110x xxxx 4276 $working = $bytevalue & 0x1F; 4277 $bytesleft = 1; 4278 } elseif ($bytevalue <= 0xEF) { //1110 xxxx 4279 $working = $bytevalue & 0x0F; 4280 $bytesleft = 2; 4281 } else { //1111 0xxx 4282 $working = $bytevalue & 0x07; 4283 $bytesleft = 3; 4284 } 4285 } 4286 return $result; 4287 } 4288 4289 /** No bugs detected in iconv. */ 4290 const ICONV_OK = 0; 4291 4292 /** Iconv truncates output if converting from UTF-8 to another 4293 * character set with //IGNORE, and a non-encodable character is found */ 4294 const ICONV_TRUNCATES = 1; 4295 4296 /** Iconv does not support //IGNORE, making it unusable for 4297 * transcoding purposes */ 4298 const ICONV_UNUSABLE = 2; 4299 4300 /** 4301 * glibc iconv has a known bug where it doesn't handle the magic 4302 * //IGNORE stanza correctly. In particular, rather than ignore 4303 * characters, it will return an EILSEQ after consuming some number 4304 * of characters, and expect you to restart iconv as if it were 4305 * an E2BIG. Old versions of PHP did not respect the errno, and 4306 * returned the fragment, so as a result you would see iconv 4307 * mysteriously truncating output. We can work around this by 4308 * manually chopping our input into segments of about 8000 4309 * characters, as long as PHP ignores the error code. If PHP starts 4310 * paying attention to the error code, iconv becomes unusable. 4311 * 4312 * @return int Error code indicating severity of bug. 4313 */ 4314 public static function testIconvTruncateBug() 4315 { 4316 static $code = null; 4317 if ($code === null) { 4318 // better not use iconv, otherwise infinite loop! 4319 $r = self::unsafeIconv('utf-8', 'ascii//IGNORE', "\xCE\xB1" . str_repeat('a', 9000)); 4320 if ($r === false) { 4321 $code = self::ICONV_UNUSABLE; 4322 } elseif (($c = strlen($r)) < 9000) { 4323 $code = self::ICONV_TRUNCATES; 4324 } elseif ($c > 9000) { 4325 trigger_error( 4326 'Your copy of iconv is extremely buggy. 
/**
 * glibc iconv has a known bug where it doesn't handle the magic
 * //IGNORE stanza correctly. In particular, rather than ignore
 * characters, it will return an EILSEQ after consuming some number
 * of characters, and expect you to restart iconv as if it were
 * an E2BIG. Old versions of PHP did not respect the errno, and
 * returned the fragment, so as a result you would see iconv
 * mysteriously truncating output. We can work around this by
 * manually chopping our input into segments of about 8000
 * characters, as long as PHP ignores the error code. If PHP starts
 * paying attention to the error code, iconv becomes unusable.
 *
 * The probe converts one non-ASCII character followed by 9000 'a's to
 * ASCII with //IGNORE and inspects the length of the result. The verdict
 * is cached in a static variable, so the probe runs once per request.
 *
 * @return int Error code indicating severity of bug: one of
 *      self::ICONV_OK, self::ICONV_TRUNCATES or self::ICONV_UNUSABLE.
 */
public static function testIconvTruncateBug()
{
    static $code = null;
    if ($code === null) {
        // better not use iconv, otherwise infinite loop!
        $r = self::unsafeIconv('utf-8', 'ascii//IGNORE', "\xCE\xB1" . str_repeat('a', 9000));
        if ($r === false) {
            $code = self::ICONV_UNUSABLE;
        } else {
            $c = strlen($r);
            if ($c < 9000) {
                $code = self::ICONV_TRUNCATES;
            } elseif ($c > 9000) {
                trigger_error(
                    'Your copy of iconv is extremely buggy. Please notify HTML Purifier maintainers: ' .
                    'include your iconv version as per phpversion()',
                    E_USER_ERROR
                );
                // If a custom error handler lets execution continue, still
                // return a definite verdict rather than null: an iconv
                // that produces more output than input cannot be trusted.
                $code = self::ICONV_UNUSABLE;
            } else {
                $code = self::ICONV_OK;
            }
        }
    }
    return $code;
}
/**
 * This expensive function tests whether or not a given character
 * encoding supports ASCII. 7/8-bit encodings like Shift_JIS will
 * fail this test, and require special processing. Variable width
 * encodings shouldn't ever fail.
 *
 * Results computed through iconv are memoised per encoding name; a few
 * well-known encodings are answered from precompiled tables without
 * touching iconv at all.
 *
 * @param string $encoding Encoding name to test, as per iconv format
 * @param bool $bypass Whether or not to bypass the precompiled arrays
 *      (and the memoised results).
 * @return array|bool Array of UTF-8 characters to their corresponding
 *      ASCII, which can be used to "undo" any overzealous iconv action,
 *      or false when iconv cannot convert to $encoding at all.
 */
public static function testEncodingSupportsASCII($encoding, $bypass = false)
{
    // All calls to iconv here are unsafe, proof by case analysis:
    // If ICONV_OK, no difference.
    // If ICONV_TRUNCATE, all calls involve one character inputs,
    // so bug is not triggered.
    // If ICONV_UNUSABLE, this call is irrelevant
    static $encodings = array();

    if (!$bypass) {
        if (isset($encodings[$encoding])) {
            return $encodings[$encoding];
        }
        $lenc = strtolower($encoding);
        // Precompiled answers for encodings known to remap ASCII bytes.
        $precompiled = array(
            'shift_jis' => array("\xC2\xA5" => '\\', "\xE2\x80\xBE" => '~'),
            'johab' => array("\xE2\x82\xA9" => '\\'),
        );
        if (isset($precompiled[$lenc])) {
            return $precompiled[$lenc];
        }
        if (strpos($lenc, 'iso-8859-') === 0) {
            return array();
        }
    }

    // Bail out early if the encoding is not convertible at all.
    if (self::unsafeIconv('UTF-8', $encoding, 'a') === false) {
        return false;
    }

    $ret = array();
    foreach (range(0x20, 0x7E) as $ord) { // all printable ASCII chars
        $ascii = chr($ord); // UTF-8 char
        $converted = self::unsafeIconv('UTF-8', "$encoding//IGNORE", $ascii); // initial conversion

        $unsupported = ($converted === '');
        if (!$unsupported) {
            // This check is needed for iconv implementations that do not
            // omit characters that do not exist in the target character set
            $unsupported = $converted === $ascii
                && self::unsafeIconv($encoding, 'UTF-8//IGNORE', $converted) !== $ascii;
        }

        if ($unsupported) {
            // Reverse engineer: what's the UTF-8 equiv of this byte
            // sequence? This assumes that there's no variable width
            // encoding that doesn't support ASCII.
            $ret[self::unsafeIconv($encoding, 'UTF-8//IGNORE', $ascii)] = $ascii;
        }
    }

    $encodings[$encoding] = $ret;
    return $ret;
}
/**
 * Object that provides entity lookup table from entity name to character
 */
class HTMLPurifier_EntityLookup
{
    /**
     * Assoc array of entity name to character represented.
     * @type array
     */
    public $table;

    /**
     * Sets up the entity lookup table from the serialized file contents.
     *
     * If the file cannot be read, or does not unserialize to an array, a
     * warning is raised and the table is left empty, so that $table always
     * honours its documented array type.
     *
     * @param string|bool $file Path of the serialized table to load, or
     *      false to use the entities.ser file located under
     *      HTMLPURIFIER_PREFIX.
     * @note The serialized contents are versioned, but were generated
     *       using the maintenance script generate_entity_file.php
     * @warning This is not in constructor to help enforce the Singleton
     * @warning The file is passed to unserialize(); only point this at
     *       trusted files.
     */
    public function setup($file = false)
    {
        if (!$file) {
            $file = HTMLPURIFIER_PREFIX . '/HTMLPurifier/EntityLookup/entities.ser';
        }
        $contents = file_get_contents($file);
        $table = ($contents === false) ? false : unserialize($contents);
        if (!is_array($table)) {
            trigger_error(
                'Could not load entity lookup table from ' . $file,
                E_USER_WARNING
            );
            $table = array();
        }
        $this->table = $table;
    }

    /**
     * Retrieves sole instance of the object.
     *
     * The first call without a prototype creates the instance and loads the
     * default table; passing a prototype replaces the shared instance.
     *
     * @param bool|HTMLPurifier_EntityLookup $prototype Optional prototype of custom lookup table to overload with.
     * @return HTMLPurifier_EntityLookup
     */
    public static function instance($prototype = false)
    {
        // no references, since PHP doesn't copy unless modified
        static $instance = null;
        if ($prototype) {
            $instance = $prototype;
        } elseif (!$instance) {
            $instance = new HTMLPurifier_EntityLookup();
            $instance->setup();
        }
        return $instance;
    }
}
// if want to implement error collecting here, we'll need to use some sort
// of global data (probably trigger_error) because it's impossible to pass
// $config or $context to the callback functions.

/**
 * Handles referencing and dereferencing character entities
 */
class HTMLPurifier_EntityParser
{

    /**
     * Reference to entity lookup table.
     * @type HTMLPurifier_EntityLookup
     */
    protected $_entity_lookup;

    /**
     * Callback regex string for entities in text.
     * @type string
     */
    protected $_textEntitiesRegex;

    /**
     * Callback regex string for entities in attributes.
     * @type string
     */
    protected $_attrEntitiesRegex;

    /**
     * Regex that finds a semicolon-optional entity name directly after an
     * ampersand. It carries three empty capture groups so that the name
     * lands in group 4, as entityCallback() expects.
     * @type string
     */
    protected $_semiOptionalPrefixRegex;

    /**
     * Builds the text, attribute and prefix regexes from the list of
     * entity names that may be written without a trailing semicolon.
     */
    public function __construct()
    {
        // From
        // http://stackoverflow.com/questions/15532252/why-is-reg-being-rendered-as-without-the-bounding-semicolon
        $semi_optional = "quot|QUOT|lt|LT|gt|GT|amp|AMP|AElig|Aacute|Acirc|Agrave|Aring|Atilde|Auml|COPY|Ccedil|ETH|Eacute|Ecirc|Egrave|Euml|Iacute|Icirc|Igrave|Iuml|Ntilde|Oacute|Ocirc|Ograve|Oslash|Otilde|Ouml|REG|THORN|Uacute|Ucirc|Ugrave|Uuml|Yacute|aacute|acirc|acute|aelig|agrave|aring|atilde|auml|brvbar|ccedil|cedil|cent|copy|curren|deg|divide|eacute|ecirc|egrave|eth|euml|frac12|frac14|frac34|iacute|icirc|iexcl|igrave|iquest|iuml|laquo|macr|micro|middot|nbsp|not|ntilde|oacute|ocirc|ograve|ordf|ordm|oslash|otilde|ouml|para|plusmn|pound|raquo|reg|sect|shy|sup1|sup2|sup3|szlig|thorn|times|uacute|ucirc|ugrave|uml|uuml|yacute|yen|yuml";

        // NB: three empty captures to put the fourth match in the right
        // place
        $this->_semiOptionalPrefixRegex = "/&()()()($semi_optional)/";

        // Alternatives shared by the text and attribute regexes.
        $common =
            '/&(?:'.
            // hex
            '[#]x([a-fA-F0-9]+);?|'.
            // dec
            '[#]0*(\d+);?|'.
            // string (mandatory semicolon)
            // NB: order matters: match semicolon preferentially
            '([A-Za-z_:][A-Za-z0-9.\-_:]*);|';

        $this->_textEntitiesRegex =
            $common.
            // string (optional semicolon)
            "($semi_optional)".
            ')/';

        $this->_attrEntitiesRegex =
            $common.
            // string (optional semicolon)
            // don't match if trailing is equals or alphanumeric (URL
            // like)
            "($semi_optional)(?![=;A-Za-z0-9])".
            ')/';
    }

    /**
     * Substitute entities with the parsed equivalents. Use this on
     * textual data in an HTML document (as opposed to attributes.)
     *
     * @param string $string String to have entities parsed.
     * @return string Parsed string.
     */
    public function substituteTextEntities($string)
    {
        return preg_replace_callback(
            $this->_textEntitiesRegex,
            array($this, 'entityCallback'),
            $string
        );
    }

    /**
     * Substitute entities with the parsed equivalents. Use this on
     * attribute contents in documents.
     *
     * @param string $string String to have entities parsed.
     * @return string Parsed string.
     */
    public function substituteAttrEntities($string)
    {
        return preg_replace_callback(
            $this->_attrEntitiesRegex,
            array($this, 'entityCallback'),
            $string
        );
    }

    /**
     * Callback function for substituteTextEntities() and
     * substituteAttrEntities() that does the work.
     *
     * @param array $matches PCRE matches array, with 0 the entire match, and
     *                       index 1 set with a hex value, 2 with a dec
     *                       value, 3 with a semicolon-terminated name or 4
     *                       with a semicolon-optional name.
     * @return string Replacement string; the original entity text when the
     *                name is unknown.
     */
    protected function entityCallback($matches)
    {
        $entity = $matches[0];
        // PCRE omits trailing groups that did not participate in the match
        $hex_part = isset($matches[1]) ? $matches[1] : null;
        $dec_part = isset($matches[2]) ? $matches[2] : null;
        if (!empty($matches[3])) {
            $named_part = $matches[3];
        } else {
            $named_part = isset($matches[4]) ? $matches[4] : null;
        }

        if ($hex_part !== null && $hex_part !== '') {
            return HTMLPurifier_Encoder::unichr(hexdec($hex_part));
        }
        if ($dec_part !== null && $dec_part !== '') {
            return HTMLPurifier_Encoder::unichr((int) $dec_part);
        }

        if (!$this->_entity_lookup) {
            $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
        }
        if (isset($this->_entity_lookup->table[$named_part])) {
            return $this->_entity_lookup->table[$named_part];
        }

        // exact match didn't match anything, so test if
        // any of the semicolon optional match the prefix.
        // Test that this is an EXACT match is important to
        // prevent infinite loop
        if (!empty($matches[3])) {
            return preg_replace_callback(
                $this->_semiOptionalPrefixRegex,
                array($this, 'entityCallback'),
                $entity
            );
        }
        return $entity;
    }

    // LEGACY CODE BELOW

    /**
     * Callback regex string for parsing entities.
     * @type string
     */
    protected $_substituteEntitiesRegex =
        '/&(?:[#]x([a-fA-F0-9]+)|[#]0*(\d+)|([A-Za-z_:][A-Za-z0-9.\-_:]*));?/';
        //     1. hex             2. dec      3. string (XML style)

    /**
     * Decimal to parsed string conversion table for special entities.
     * @type array
     */
    protected $_special_dec2str =
        array(
            34 => '"',
            38 => '&',
            39 => "'",
            60 => '<',
            62 => '>'
        );

    /**
     * Stripped entity names to decimal conversion table for special entities.
     * @type array
     */
    protected $_special_ent2dec =
        array(
            'quot' => 34,
            'amp'  => 38,
            'lt'   => 60,
            'gt'   => 62
        );

    /**
     * Substitutes non-special entities with their parsed equivalents.
     * Running this on every parsed character would be very slow, so it is
     * run once before everything else.
     *
     * @param string $string String to have non-special entities parsed.
     * @return string Parsed string.
     */
    public function substituteNonSpecialEntities($string)
    {
        // it will try to detect missing semicolons, but don't rely on it
        return preg_replace_callback(
            $this->_substituteEntitiesRegex,
            array($this, 'nonSpecialEntityCallback'),
            $string
        );
    }

    /**
     * Callback function for substituteNonSpecialEntities() that does the work.
     *
     * @param array $matches PCRE matches array, with 0 the entire match, and
     *                       either index 1, 2 or 3 set with a hex value, dec value,
     *                       or string (respectively).
     * @return string Replacement string.
     */
    protected function nonSpecialEntityCallback($matches)
    {
        // replaces all but big five
        $entity = $matches[0];
        $is_num = isset($entity[1]) && $entity[1] === '#';
        if ($is_num) {
            $is_hex = isset($entity[2]) && $entity[2] === 'x';
            $code = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
            // abort for special characters
            if (isset($this->_special_dec2str[$code])) {
                return $entity;
            }
            return HTMLPurifier_Encoder::unichr($code);
        }

        if (isset($this->_special_ent2dec[$matches[3]])) {
            return $entity;
        }
        if (!$this->_entity_lookup) {
            $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
        }
        if (isset($this->_entity_lookup->table[$matches[3]])) {
            return $this->_entity_lookup->table[$matches[3]];
        }
        return $entity;
    }

    /**
     * Substitutes only special entities with their parsed equivalents.
     *
     * @notice We try to avoid calling this function because otherwise, it
     * would have to be called a lot (for every parsed section).
     *
     * @param string $string String to have special entities parsed.
     * @return string Parsed string.
     */
    public function substituteSpecialEntities($string)
    {
        return preg_replace_callback(
            $this->_substituteEntitiesRegex,
            array($this, 'specialEntityCallback'),
            $string
        );
    }

    /**
     * Callback function for substituteSpecialEntities() that does the work.
     *
     * This callback has same syntax as nonSpecialEntityCallback().
     *
     * @param array $matches  PCRE-style matches array, with 0 the entire match, and
     *                   either index 1, 2 or 3 set with a hex value, dec value,
     *                   or string (respectively).
     * @return string Replacement string.
     */
    protected function specialEntityCallback($matches)
    {
        $entity = $matches[0];
        $is_num = isset($entity[1]) && $entity[1] === '#';
        if ($is_num) {
            $is_hex = isset($entity[2]) && $entity[2] === 'x';
            $int = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
            return isset($this->_special_dec2str[$int]) ?
                $this->_special_dec2str[$int] :
                $entity;
        }
        return isset($this->_special_ent2dec[$matches[3]]) ?
            $this->_special_dec2str[$this->_special_ent2dec[$matches[3]]] :
            $entity;
    }
}
/**
 * Error collection class that enables HTML Purifier to report HTML
 * problems back to the user
 */
class HTMLPurifier_ErrorCollector
{

    /**
     * Identifiers for the returned error array. These are purposely numeric
     * so list() can be used.
     */
    const LINENO   = 0;
    const SEVERITY = 1;
    const MESSAGE  = 2;
    const CHILDREN = 3;

    /**
     * Flat list of collected errors (bound by reference to $_stacks[0]).
     * @type array
     */
    protected $errors;

    /**
     * List that send() appends to (bound by reference to $_stacks[0]).
     * @type array
     */
    protected $_current;

    /**
     * @type array
     */
    protected $_stacks = array(array());

    /**
     * @type HTMLPurifier_Language
     */
    protected $locale;

    /**
     * @type HTMLPurifier_Generator
     */
    protected $generator;

    /**
     * @type HTMLPurifier_Context
     */
    protected $context;

    /**
     * Error structs indexed as [line][col]; index -1 holds the struct for
     * errors without an integer line/column.
     * @type array
     */
    protected $lines = array();

    /**
     * @param HTMLPurifier_Context $context
     */
    public function __construct($context)
    {
        $this->context = $context;
        $this->locale =& $context->get('Locale');
        $this->_current =& $this->_stacks[0];
        $this->errors =& $this->_stacks[0];
    }

    /**
     * Sends an error message to the collector for later use.
     * Any arguments after $msg are handed to the locale as message
     * parameters.
     * @param int $severity Error severity, PHP error style (don't use E_USER_)
     * @param string $msg Error message text
     */
    public function send($severity, $msg)
    {
        // Extra arguments become message parameters keyed from 1 upwards:
        // drop $severity, then drop $msg without re-indexing.
        $args = array();
        if (func_num_args() > 2) {
            $args = func_get_args();
            array_shift($args);
            unset($args[0]);
        }

        // Location comes from the current token when there is one,
        // otherwise from the line/column registered in the context.
        $token = $this->context->get('CurrentToken', true);
        if ($token) {
            $line = $token->line;
            $col = $token->col;
        } else {
            $line = $this->context->get('CurrentLine', true);
            $col = $this->context->get('CurrentCol', true);
        }
        $attr = $this->context->get('CurrentAttr', true);

        // perform special substitutions, also add custom parameters
        if (!is_null($token)) {
            $args['CurrentToken'] = $token;
        }
        $subst = array();
        if (!is_null($attr)) {
            $subst['$CurrentAttr.Name'] = $attr;
            if (isset($token->attr[$attr])) {
                $subst['$CurrentAttr.Value'] = $token->attr[$attr];
            }
        }

        $msg = empty($args)
            ? $this->locale->getMessage($msg)
            : $this->locale->formatMessage($msg, $args);

        if (!empty($subst)) {
            $msg = strtr($msg, $subst);
        }

        // Flat record (numerically indexed)
        $this->_current[] = array(
            self::LINENO   => $line,
            self::SEVERITY => $severity,
            self::MESSAGE  => $msg,
            self::CHILDREN => array()
        );

        // NEW CODE BELOW ...
        // Top-level errors are either:
        //  TOKEN type, if $value is set appropriately, or
        //  "syntax" type, if $value is null
        $new_struct = new HTMLPurifier_ErrorStruct();
        $new_struct->type = HTMLPurifier_ErrorStruct::TOKEN;
        if ($token) {
            $new_struct->value = clone $token;
        }

        if (is_int($line) && is_int($col)) {
            if (!isset($this->lines[$line][$col])) {
                $this->lines[$line][$col] = $new_struct;
            }
            $struct = $this->lines[$line][$col];
            // These ksorts may present a performance problem
            ksort($this->lines[$line], SORT_NUMERIC);
        } else {
            if (!isset($this->lines[-1])) {
                $this->lines[-1] = $new_struct;
            }
            $struct = $this->lines[-1];
        }
        ksort($this->lines, SORT_NUMERIC);

        // Now, check if we need to operate on a lower structure
        if (!empty($attr)) {
            $struct = $struct->getChild(HTMLPurifier_ErrorStruct::ATTR, $attr);
            if (!$struct->value) {
                $struct->value = array($attr, 'PUT VALUE HERE');
            }
        }
        // NOTE(review): $cssprop is never assigned in this method, so this
        // branch is currently unreachable; confirm where the CSS property
        // was meant to come from before relying on it.
        if (!empty($cssprop)) {
            $struct = $struct->getChild(HTMLPurifier_ErrorStruct::CSSPROP, $cssprop);
            if (!$struct->value) {
                // if we tokenize CSS this might be a little more difficult to do
                $struct->value = array($cssprop, 'PUT VALUE HERE');
            }
        }

        // Ok, structs are all setup, now time to register the error
        $struct->addError($severity, $msg);
    }

    /**
     * Retrieves raw error data for custom formatter to use
     * @return array
     */
    public function getRaw()
    {
        return $this->errors;
    }

    /**
     * Default HTML formatting implementation for error messages
     * @param HTMLPurifier_Config $config Configuration, vital for HTML output nature
     * @param array $errors Errors array; only consulted to decide whether
     *      there is anything to report. Defaults to the collected errors.
     * @return string
     */
    public function getHTMLFormatted($config, $errors = null)
    {
        $this->generator = new HTMLPurifier_Generator($config, $this->context);
        if ($errors === null) {
            $errors = $this->errors;
        }

        // 'At line' message needs to be removed

        // generation code for new structure goes here. It needs to be recursive.
        $items = array();
        foreach ($this->lines as $line => $col_array) {
            if ($line == -1) {
                // rendered last, after all positioned errors
                continue;
            }
            foreach ($col_array as $col => $struct) {
                $this->_renderStruct($items, $struct, $line, $col);
            }
        }
        if (isset($this->lines[-1])) {
            $this->_renderStruct($items, $this->lines[-1]);
        }

        if (empty($errors)) {
            return '<p>' . $this->locale->getMessage('ErrorCollector: No errors') . '</p>';
        }
        return '<ul><li>' . implode('</li><li>', $items) . '</li></ul>';
    }

    /**
     * Appends the HTML for every error held by $struct and its descendant
     * structs to $ret.
     * @param array $ret List of rendered fragments, appended to in place
     * @param HTMLPurifier_ErrorStruct $struct Struct to render
     * @param int|null $line Line to report, or null for "End of Document"
     * @param int|null $col Column to report, or null for "End of Document"
     */
    private function _renderStruct(&$ret, $struct, $line = null, $col = null)
    {
        $stack = array($struct);
        $context_stack = array(array());
        while ($current = array_pop($stack)) {
            $ancestors = array_pop($context_stack);
            foreach ($current->errors as $error) {
                list($severity, $msg) = $error;
                // W3C uses an icon to indicate the severity of the error.
                $label = $this->locale->getErrorName($severity);
                if (!is_null($line) && !is_null($col)) {
                    $location = "<em class=\"location\">Line $line, Column $col: </em> ";
                } else {
                    $location = '<em class="location">End of Document: </em> ';
                }
                // Here, have a marker for the character on the column appropriate.
                // Be sure to clip extremely long lines.
                $ret[] = '<div>'
                    . "<span class=\"error e$severity\"><strong>$label</strong></span> "
                    . $location
                    . '<strong class="description">' . $this->generator->escape($msg) . '</strong> '
                    . '</div>';
            }
            foreach ($current->children as $array) {
                $ancestors[] = $current;
                $stack = array_merge($stack, array_reverse($array, true));
                for ($i = count($array); $i > 0; $i--) {
                    $context_stack[] = $ancestors;
                }
            }
        }
    }
}
/**
 * Records errors for particular segments of an HTML document such as tokens,
 * attributes or CSS properties. They can contain error structs (which apply
 * to components of what they represent), but their main purpose is to hold
 * errors applying to whatever struct is being used.
 */
class HTMLPurifier_ErrorStruct
{

    /**
     * Possible values for $children first-key. Note that top-level structures
     * are automatically token-level.
     */
    const TOKEN   = 0;
    const ATTR    = 1;
    const CSSPROP = 2;

    /**
     * Type of this struct: one of the TOKEN, ATTR or CSSPROP constants.
     * @type int
     */
    public $type;

    /**
     * Value of the struct we are recording errors for. There are various
     * values for this:
     *  - TOKEN: Instance of HTMLPurifier_Token
     *  - ATTR: array('attr-name', 'value')
     *  - CSSPROP: array('prop-name', 'value')
     * @type mixed
     */
    public $value;

    /**
     * Errors registered for this structure, each stored as
     * array($severity, $message).
     * @type array
     */
    public $errors = array();

    /**
     * Child ErrorStructs that are from this structure. For example, a TOKEN
     * ErrorStruct would contain ATTR ErrorStructs. This is a multi-dimensional
     * array in structure: [TYPE]['identifier']
     * @type array
     */
    public $children = array();

    /**
     * Returns the child struct filed under the given type and identifier,
     * creating it (with its type set) on first access.
     * @param int $type One of the TOKEN, ATTR or CSSPROP constants
     * @param string $id Identifier of the child within that type
     * @return HTMLPurifier_ErrorStruct
     */
    public function getChild($type, $id)
    {
        if (isset($this->children[$type][$id])) {
            return $this->children[$type][$id];
        }
        $child = new HTMLPurifier_ErrorStruct();
        $child->type = $type;
        $this->children[$type][$id] = $child;
        return $child;
    }

    /**
     * Registers an error against this struct.
     * @param int $severity
     * @param string $message
     */
    public function addError($severity, $message)
    {
        $this->errors[] = array($severity, $message);
    }
}





/**
 * Global exception class for HTML Purifier; any exceptions we throw
 * are from here.
 */
class HTMLPurifier_Exception extends Exception
{

}
/**
 * Represents a pre or post processing filter on HTML Purifier's output
 *
 * Sometimes, a little ad-hoc fixing of HTML has to be done before
 * it gets sent through HTML Purifier: you can use filters to achieve
 * this effect. For instance, YouTube videos can be preserved in
 * this manner. You could have used a decorator for this task, but
 * PHP's support for them is not terribly robust, so we're going
 * to just loop through the filters.
 *
 * Filters should be exited first in, last out. If there are three filters,
 * named 1, 2 and 3, the order of execution should go 1->preFilter,
 * 2->preFilter, 3->preFilter, purify, 3->postFilter, 2->postFilter,
 * 1->postFilter.
 *
 * This base class is a no-op: both hooks return the HTML they are given
 * unchanged, so subclasses only override the step they care about.
 *
 * @note Methods are not declared abstract as it is perfectly legitimate
 *       for an implementation not to want anything to happen on a step
 */

class HTMLPurifier_Filter
{

    /**
     * Name of the filter for identification purposes.
     * @type string
     */
    public $name;

    /**
     * Pre-processor function, handles HTML before HTML Purifier.
     * The default implementation returns $html untouched.
     * @param string $html
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return string
     */
    public function preFilter($html, $config, $context)
    {
        return $html;
    }

    /**
     * Post-processor function, handles HTML after HTML Purifier.
     * The default implementation returns $html untouched.
     * @param string $html
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return string
     */
    public function postFilter($html, $config, $context)
    {
        return $html;
    }
}
/**
 * Generates HTML from tokens.
 * @todo Refactor interface so that configuration/context is determined
 *       upon instantiation, no need for messy generateFromTokens() calls
 * @todo Make some of the more internal functions protected, and have
 *       unit tests work around that
 */
class HTMLPurifier_Generator
{

    /**
     * Whether or not generator should produce XML output.
     * @type bool
     */
    private $_xhtml = true;

    /**
     * :HACK: Whether or not generator should comment the insides of <script> tags.
     * @type bool
     */
    private $_scriptFix = false;

    /**
     * Cache of HTMLDefinition during HTML output to determine whether or
     * not attributes should be minimized.
     * @type HTMLPurifier_HTMLDefinition
     */
    private $_def;

    /**
     * Cache of %Output.SortAttr.
     * @type bool
     */
    private $_sortAttr;

    /**
     * Cache of %Output.FlashCompat.
     * @type bool
     */
    private $_flashCompat;

    /**
     * Cache of %Output.FixInnerHTML.
     * @type bool
     */
    private $_innerHTMLFix;

    /**
     * Stack for keeping track of object information when outputting IE
     * compatibility code. One entry is pushed per open <object> and
     * popped again when that element is closed.
     * @type array
     */
    private $_flashStack = array();

    /**
     * Configuration for the generator
     * @type HTMLPurifier_Config
     */
    protected $config;

    /**
     * Caches the output-related directives and the HTML definition.
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     */
    public function __construct($config, $context)
    {
        $this->config = $config;
        $this->_scriptFix = $config->get('Output.CommentScriptContents');
        $this->_innerHTMLFix = $config->get('Output.FixInnerHTML');
        $this->_sortAttr = $config->get('Output.SortAttr');
        $this->_flashCompat = $config->get('Output.FlashCompat');
        $this->_def = $config->getHTMLDefinition();
        $this->_xhtml = $this->_def->doctype->xml;
    }

    /**
     * Generates HTML from an array of tokens.
     * @param HTMLPurifier_Token[] $tokens Array of HTMLPurifier_Token
     * @return string Generated HTML
     */
    public function generateFromTokens($tokens)
    {
        if (!$tokens) {
            return '';
        }

        // Basic algorithm
        $html = '';
        for ($i = 0, $size = count($tokens); $i < $size; $i++) {
            if ($this->_scriptFix && $tokens[$i]->name === 'script'
                && $i + 2 < $size && $tokens[$i+2] instanceof HTMLPurifier_Token_End) {
                // script special case
                // the contents of the script block must be ONE token
                // for this to work.
                $html .= $this->generateFromToken($tokens[$i++]);
                $html .= $this->generateScriptFromToken($tokens[$i++]);
            }
            $html .= $this->generateFromToken($tokens[$i]);
        }

        // Tidy cleanup
        if (extension_loaded('tidy') && $this->config->get('Output.TidyFormat')) {
            $tidy = new Tidy;
            $tidy->parseString(
                $html,
                array(
                   'indent'=> true,
                   'output-xhtml' => $this->_xhtml,
                   'show-body-only' => true,
                   'indent-spaces' => 2,
                   'wrap' => 68,
                ),
                'utf8'
            );
            $tidy->cleanRepair();
            $html = (string) $tidy; // explicit cast necessary
        }

        // Normalize newlines to system defined value
        if ($this->config->get('Core.NormalizeNewlines')) {
            $nl = $this->config->get('Output.Newline');
            if ($nl === null) {
                $nl = PHP_EOL;
            }
            if ($nl !== "\n") {
                $html = str_replace("\n", $nl, $html);
            }
        }
        return $html;
    }

    /**
     * Generates HTML from a single token.
     * @param HTMLPurifier_Token $token HTMLPurifier_Token object.
     * @return string Generated HTML; empty string for unrecognised input.
     */
    public function generateFromToken($token)
    {
        if (!$token instanceof HTMLPurifier_Token) {
            trigger_error('Cannot generate HTML from non-HTMLPurifier_Token object', E_USER_WARNING);
            return '';

        } elseif ($token instanceof HTMLPurifier_Token_Start) {
            $attr = $this->generateAttributes($token->attr, $token->name);
            if ($this->_flashCompat) {
                if ($token->name == "object") {
                    $flash = new stdClass();
                    $flash->attr = $token->attr;
                    $flash->param = array();
                    $this->_flashStack[] = $flash;
                }
            }
            return '<' . $token->name . ($attr ? ' ' : '') . $attr . '>';

        } elseif ($token instanceof HTMLPurifier_Token_End) {
            if ($this->_flashCompat && $token->name == "object" && !empty($this->_flashStack)) {
                // No compatibility markup is emitted for now, but the
                // entry pushed by the matching start tag must be dropped
                // so the stack stays balanced and does not grow forever.
                array_pop($this->_flashStack);
            }
            return '</' . $token->name . '>';

        } elseif ($token instanceof HTMLPurifier_Token_Empty) {
            if ($this->_flashCompat && $token->name == "param" && !empty($this->_flashStack)) {
                // Only record params that actually carry a name/value
                // pair; anything else has nothing useful to remember.
                if (isset($token->attr['name']) && isset($token->attr['value'])) {
                    $this->_flashStack[count($this->_flashStack)-1]->param[$token->attr['name']] = $token->attr['value'];
                }
            }
            $attr = $this->generateAttributes($token->attr, $token->name);
            return '<' . $token->name . ($attr ? ' ' : '') . $attr .
                ( $this->_xhtml ? ' /': '' ) // <br /> v. <br>
                . '>';

        } elseif ($token instanceof HTMLPurifier_Token_Text) {
            return $this->escape($token->data, ENT_NOQUOTES);

        } elseif ($token instanceof HTMLPurifier_Token_Comment) {
            return '<!--' . $token->data . '-->';
        } else {
            return '';

        }
    }

    /**
     * Special case processor for the contents of script tags
     * @param HTMLPurifier_Token $token HTMLPurifier_Token object.
     * @return string
     * @warning This runs into problems if there's already a literal
     *          --> somewhere inside the script contents.
     */
    public function generateScriptFromToken($token)
    {
        if (!$token instanceof HTMLPurifier_Token_Text) {
            return $this->generateFromToken($token);
        }
        // Thanks <http://lachy.id.au/log/2005/05/script-comments>
        $data = preg_replace('#//\s*$#', '', $token->data);
        return '<!--//--><![CDATA[//><!--' . "\n" . trim($data) . "\n" . '//--><!]]>';
    }

    /**
     * Generates attribute declarations from attribute array.
     * @note This does not include the leading or trailing space.
     * @param array $assoc_array_of_attributes Attribute array
     * @param string $element Name of element attributes are for, used to check
     *        attribute minimization.
     * @return string Generated HTML fragment for insertion.
     */
    public function generateAttributes($assoc_array_of_attributes, $element = '')
    {
        $html = '';
        if ($this->_sortAttr) {
            ksort($assoc_array_of_attributes);
        }
        foreach ($assoc_array_of_attributes as $key => $value) {
            if (!$this->_xhtml) {
                // Remove namespaced attributes
                if (strpos($key, ':') !== false) {
                    continue;
                }
                // Check if we should minimize the attribute: val="val" -> val
                if ($element && !empty($this->_def->info[$element]->attr[$key]->minimized)) {
                    $html .= $key . ' ';
                    continue;
                }
            }
            // Workaround for Internet Explorer innerHTML bug.
            // Essentially, Internet Explorer, when calculating
            // innerHTML, omits quotes if there are no instances of
            // angled brackets, quotes or spaces.  However, when parsing
            // HTML (for example, when you assign to innerHTML), it
            // treats backticks as quotes.  Thus,
            //      <img alt="``" />
            // becomes
            //      <img alt=`` />
            // becomes
            //      <img alt='' />
            // Fortunately, all we need to do is trigger an appropriate
            // quoting style, which we do by adding an extra space.
            // This also is consistent with the W3C spec, which states
            // that user agents may ignore leading or trailing
            // whitespace (in fact, most don't, at least for attributes
            // like alt, but an extra space at the end is barely
            // noticeable).  Still, we have a configuration knob for
            // this, since this transformation is not necessary if you
            // don't process user input with innerHTML or you don't plan
            // on supporting Internet Explorer.
            if ($this->_innerHTMLFix) {
                if (strpos($value, '`') !== false) {
                    // check if correct quoting style would not already be
                    // triggered
                    if (strcspn($value, '"\' <>') === strlen($value)) {
                        // protect!
                        $value .= ' ';
                    }
                }
            }
            $html .= $key.'="'.$this->escape($value).'" ';
        }
        return rtrim($html);
    }

    /**
     * Escapes raw text data.
     * @todo This really ought to be protected, but until we have a facility
     *       for properly generating HTML here w/o using tokens, it stays
     *       public.
     * @param string $string String data to escape for HTML.
     * @param int $quote Quoting style, like htmlspecialchars. ENT_NOQUOTES is
     *               permissible for non-attribute output. Defaults to
     *               ENT_COMPAT when omitted.
     * @return string escaped data.
     */
    public function escape($string, $quote = null)
    {
        // Workaround for APC bug on Mac Leopard reported by sidepodcast
        // http://htmlpurifier.org/phorum/read.php?3,4823,4846
        if ($quote === null) {
            $quote = ENT_COMPAT;
        }
        return htmlspecialchars($string, $quote, 'UTF-8');
    }
}
* They can usually be set via configuration directives or custom
 * modules.
 *
 * On the other hand, member variables without the info prefix are used
 * internally by the HTMLDefinition and MUST NOT be used by other HTML
 * Purifier internals. Many of them, however, are public, and may be
 * edited by userspace code to tweak the behavior of HTMLDefinition.
 *
 * @note This class is inspected by Printer_HTMLDefinition; please
 *       update that class if things here change.
 *
 * @warning Directives that change this object's structure must be in
 *          the HTML or Attr namespace!
 */
class HTMLPurifier_HTMLDefinition extends HTMLPurifier_Definition
{

    // FULLY-PUBLIC VARIABLES ---------------------------------------------

    /**
     * Element definitions, keyed by element name.
     * @type HTMLPurifier_ElementDef[]
     */
    public $info = array();

    /**
     * Global attribute definitions, keyed by attribute name.
     * @type array
     */
    public $info_global_attr = array();

    /**
     * Name of the parent element the HTML will be placed into.
     * @type string
     */
    public $info_parent = 'div';

    /**
     * Definition of the parent element; this lets the parent be a tag
     * that is not itself allowed inside the HTML fragment.
     * @type HTMLPurifier_ElementDef
     */
    public $info_parent_def;

    /**
     * Name of the element used to wrap inline elements in block context.
     * @type string
     * @note This is rarely used except for BLOCKQUOTEs in strict mode
     */
    public $info_block_wrapper = 'p';

    /**
     * HTMLPurifier_TagTransform objects, keyed by deprecated tag name.
     * @type array
     */
    public $info_tag_transform = array();

    /**
     * Indexed list of HTMLPurifier_AttrTransform to be performed before
     * validation.
     * @type HTMLPurifier_AttrTransform[]
     */
    public $info_attr_transform_pre = array();

    /**
     * Indexed list of HTMLPurifier_AttrTransform to be performed after
     * validation.
     * @type HTMLPurifier_AttrTransform[]
     */
    public $info_attr_transform_post = array();

    /**
     * Nested lookup: content set name (Block, Inline) => element name =>
     * whether or not the element belongs in that content set.
     * @type array
     */
    public $info_content_sets = array();

    /**
     * Indexed list of HTMLPurifier_Injector to be used.
     * @type HTMLPurifier_Injector[]
     */
    public $info_injector = array();

    /**
     * Doctype object
     * @type HTMLPurifier_Doctype
     */
    public $doctype;



    // RAW CUSTOMIZATION STUFF --------------------------------------------

    /**
     * Adds a custom attribute to an element, creating a blank element
     * entry in the anonymous module when the element has none there yet.
     * @note This is strictly convenience, and does not have a corresponding
     *       method in HTMLPurifier_HTMLModule
     * @param string $element_name Element name to add attribute to
     * @param string $attr_name Name of attribute
     * @param mixed $def Attribute definition, can be string or object, see
     *             HTMLPurifier_AttrTypes for details
     */
    public function addAttribute($element_name, $attr_name, $def)
    {
        $module = $this->getAnonymousModule();
        // Reuse the anonymous module's entry for this element if it has
        // one, otherwise start from a blank one.
        $target = isset($module->info[$element_name])
            ? $module->info[$element_name]
            : $module->addBlankElement($element_name);
        $target->attr[$attr_name] = $def;
    }

    /**
     * Adds a custom element to your HTML definition
     * @see HTMLPurifier_HTMLModule::addElement() for detailed
     *       parameter and return value descriptions.
5535 */ 5536 public function addElement($element_name, $type, $contents, $attr_collections, $attributes = array()) 5537 { 5538 $module = $this->getAnonymousModule(); 5539 // assume that if the user is calling this, the element 5540 // is safe. This may not be a good idea 5541 $element = $module->addElement($element_name, $type, $contents, $attr_collections, $attributes); 5542 return $element; 5543 } 5544 5545 /** 5546 * Adds a blank element to your HTML definition, for overriding 5547 * existing behavior 5548 * @param string $element_name 5549 * @return HTMLPurifier_ElementDef 5550 * @see HTMLPurifier_HTMLModule::addBlankElement() for detailed 5551 * parameter and return value descriptions. 5552 */ 5553 public function addBlankElement($element_name) 5554 { 5555 $module = $this->getAnonymousModule(); 5556 $element = $module->addBlankElement($element_name); 5557 return $element; 5558 } 5559 5560 /** 5561 * Retrieves a reference to the anonymous module, so you can 5562 * bust out advanced features without having to make your own 5563 * module. 5564 * @return HTMLPurifier_HTMLModule 5565 */ 5566 public function getAnonymousModule() 5567 { 5568 if (!$this->_anonModule) { 5569 $this->_anonModule = new HTMLPurifier_HTMLModule(); 5570 $this->_anonModule->name = 'Anonymous'; 5571 } 5572 return $this->_anonModule; 5573 } 5574 5575 private $_anonModule = null; 5576 5577 // PUBLIC BUT INTERNAL VARIABLES -------------------------------------- 5578 5579 /** 5580 * @type string 5581 */ 5582 public $type = 'HTML'; 5583 5584 /** 5585 * @type HTMLPurifier_HTMLModuleManager 5586 */ 5587 public $manager; 5588 5589 /** 5590 * Performs low-cost, preliminary initialization. 
*/
    public function __construct()
    {
        $this->manager = new HTMLPurifier_HTMLModuleManager();
    }

    /**
     * Runs the setup sequence: modules are processed first, configuration
     * is applied second, and the module manager is discarded afterwards.
     * Finally the content_model and content_model_type properties are
     * removed from every element definition.
     * @param HTMLPurifier_Config $config
     */
    protected function doSetup($config)
    {
        $this->processModules($config);
        $this->setupConfigStuff($config);
        unset($this->manager);

        // cleanup some of the element definitions
        foreach (array_keys($this->info) as $name) {
            unset(
                $this->info[$name]->content_model,
                $this->info[$name]->content_model_type
            );
        }
    }

    /**
     * Extract out the information from the manager
     * @param HTMLPurifier_Config $config
     */
    protected function processModules($config)
    {
        if ($this->_anonModule) {
            // for user specific changes
            // this is late-loaded so we don't have to deal with PHP4
            // reference wonky-ness
            $this->manager->addModule($this->_anonModule);
            unset($this->_anonModule);
        }

        $this->manager->setup($config);
        $this->doctype = $this->manager->doctype;

        // Each of these module properties is merged, module by module,
        // into the property of the same name on this object. A value of
        // false removes the entry instead of setting it.
        $merged_properties = array(
            'info_tag_transform',
            'info_attr_transform_pre',
            'info_attr_transform_post',
            'info_injector',
        );
        foreach ($this->manager->modules as $module) {
            foreach ($merged_properties as $property) {
                foreach ($module->$property as $k => $v) {
                    if ($v === false) {
                        unset($this->{$property}[$k]);
                    } else {
                        $this->{$property}[$k] = $v;
                    }
                }
            }
        }
        $this->info = $this->manager->getElements();
        $this->info_content_sets = $this->manager->contentSets->lookup;
    }

    /**
     * Sets up stuff based on config: block wrapper, parent element,
     * allowed and forbidden elements/attributes, and pruning of
     * injectors. We need a better way of doing this.
     * @param HTMLPurifier_Config $config
     */
    protected function setupConfigStuff($config)
    {
        $block_wrapper = $config->get('HTML.BlockWrapper');
        if (isset($this->info_content_sets['Block'][$block_wrapper])) {
            $this->info_block_wrapper = $block_wrapper;
        } else {
            trigger_error(
                'Cannot use non-block element as block wrapper',
                E_USER_ERROR
            );
        }

        $parent = $config->get('HTML.Parent');
        $def = $this->manager->getElement($parent, true);
        if ($def) {
            $this->info_parent = $parent;
            $this->info_parent_def = $def;
        } else {
            trigger_error(
                'Cannot use unrecognized element as parent',
                E_USER_ERROR
            );
            // fall back to the definition of the current parent name
            $this->info_parent_def = $this->manager->getElement($this->info_parent, true);
        }

        // support template text
        $support = "(for information on implementing this, see the support forums) ";

        // setup allowed elements -----------------------------------------

        $allowed_elements = $config->get('HTML.AllowedElements');
        $allowed_attributes = $config->get('HTML.AllowedAttributes'); // retrieve early

        // HTML.Allowed is only consulted when neither of the two specific
        // directives yielded an array.
        if (!is_array($allowed_elements) && !is_array($allowed_attributes)) {
            $allowed = $config->get('HTML.Allowed');
            if (is_string($allowed)) {
                list($allowed_elements, $allowed_attributes) = $this->parseTinyMCEAllowedList($allowed);
            }
        }

        if (is_array($allowed_elements)) {
            foreach ($this->info as $name => $d) {
                if (!isset($allowed_elements[$name])) {
                    unset($this->info[$name]);
                }
                // whatever remains afterwards was requested but is unknown
                unset($allowed_elements[$name]);
            }
            // emit errors
            foreach ($allowed_elements as $element => $d) {
                $element = htmlspecialchars($element); // PHP doesn't escape errors, be careful!
                trigger_error("Element '$element' is not supported $support", E_USER_WARNING);
            }
        }

        // setup allowed attributes ---------------------------------------

        // Entries are removed from this copy as they are matched, so what
        // is left at the end are the unrecognized entries.
        $allowed_attributes_mutable = $allowed_attributes; // by copy!
        if (is_array($allowed_attributes)) {
            // This actually doesn't do anything, since we went away from
            // global attributes.  It's possible that userland code uses
            // it, but HTMLModuleManager doesn't!
            foreach ($this->info_global_attr as $attr => $x) {
                $keys = array($attr, "*@$attr", "*.$attr");
                $delete = true;
                foreach ($keys as $key) {
                    if ($delete && isset($allowed_attributes[$key])) {
                        $delete = false;
                    }
                    if (isset($allowed_attributes_mutable[$key])) {
                        unset($allowed_attributes_mutable[$key]);
                    }
                }
                if ($delete) {
                    unset($this->info_global_attr[$attr]);
                }
            }

            foreach ($this->info as $tag => $info) {
                foreach ($info->attr as $attr => $x) {
                    $keys = array("$tag@$attr", $attr, "*@$attr", "$tag.$attr", "*.$attr");
                    $delete = true;
                    foreach ($keys as $key) {
                        if ($delete && isset($allowed_attributes[$key])) {
                            $delete = false;
                        }
                        if (isset($allowed_attributes_mutable[$key])) {
                            unset($allowed_attributes_mutable[$key]);
                        }
                    }
                    if ($delete) {
                        if ($this->info[$tag]->attr[$attr]->required) {
                            trigger_error(
                                "Required attribute '$attr' in element '$tag' " .
                                "was not allowed, which means '$tag' will not be allowed either",
                                E_USER_WARNING
                            );
                        }
                        unset($this->info[$tag]->attr[$attr]);
                    }
                }
            }
            // emit errors
            foreach ($allowed_attributes_mutable as $elattr => $d) {
                $bits = preg_split('/[.@]/', $elattr, 2);
                $c = count($bits);
                switch ($c) {
                    case 2:
                        if ($bits[0] !== '*') {
                            $element = htmlspecialchars($bits[0]);
                            $attribute = htmlspecialchars($bits[1]);
                            if (!isset($this->info[$element])) {
                                trigger_error(
                                    "Cannot allow attribute '$attribute' if element " .
                                    "'$element' is not allowed/supported $support"
                                );
                            } else {
                                trigger_error(
                                    "Attribute '$attribute' in element '$element' not supported $support",
                                    E_USER_WARNING
                                );
                            }
                            break;
                        }
                        // otherwise fall through
                    case 1:
                        // The attribute name is always the last piece: the
                        // only piece for a bare "attr", and the second
                        // piece for the wildcard forms "*@attr" / "*.attr"
                        // that fall through from above. Reading the first
                        // piece would report '*' for the wildcard forms.
                        $attribute = htmlspecialchars($bits[$c - 1]);
                        trigger_error(
                            "Global attribute '$attribute' is not ".
                            "supported in any elements $support",
                            E_USER_WARNING
                        );
                        break;
                }
            }
        }

        // setup forbidden elements ---------------------------------------

        $forbidden_elements = $config->get('HTML.ForbiddenElements');
        $forbidden_attributes = $config->get('HTML.ForbiddenAttributes');

        foreach ($this->info as $tag => $info) {
            if (isset($forbidden_elements[$tag])) {
                unset($this->info[$tag]);
                continue;
            }
            foreach ($info->attr as $attr => $x) {
                if (isset($forbidden_attributes["$tag@$attr"]) ||
                    isset($forbidden_attributes["*@$attr"]) ||
                    isset($forbidden_attributes[$attr])
                ) {
                    unset($this->info[$tag]->attr[$attr]);
                    continue;
                } elseif (isset($forbidden_attributes["$tag.$attr"])) { // this segment might get removed eventually
                    // $tag.$attr are not user supplied, so no worries!
                    trigger_error(
                        "Error with $tag.$attr: tag.attr syntax not supported for " .
                        "HTML.ForbiddenAttributes; use tag@attr instead",
                        E_USER_WARNING
                    );
                }
            }
        }
        // warn about the unsupported "*.attr" spelling
        foreach ($forbidden_attributes as $key => $v) {
            if (strlen($key) < 2) {
                continue;
            }
            if ($key[0] != '*') {
                continue;
            }
            if ($key[1] == '.') {
                trigger_error(
                    "Error with $key: *.attr syntax not supported for HTML.ForbiddenAttributes; use attr instead",
                    E_USER_WARNING
                );
            }
        }

        // setup injectors -----------------------------------------------------
        foreach ($this->info_injector as $i => $injector) {
            if ($injector->checkNeeded($config) !== false) {
                // remove injector that does not have its required
                // elements/attributes present, and is thus not needed.
                unset($this->info_injector[$i]);
            }
        }
    }

    /**
     * Parses a TinyMCE-flavored Allowed Elements and Attributes list into
     * separate lists for processing. Format is element[attr1|attr2],element2...
     * @warning Although it's largely drawn from TinyMCE's implementation,
     *      it is different, and you'll probably have to modify your lists
     * @param string $list String list to parse
     * @return array Two lookup arrays: element names, and "element.attr"
     *      keys, each mapped to true
     * @todo Give this its own class, probably static interface
     */
    public function parseTinyMCEAllowedList($list)
    {
        // spaces and tabs carry no meaning in the list
        $list = str_replace(array(' ', "\t"), '', $list);

        $elements = array();
        $attributes = array();

        // entries are separated by commas or line breaks
        $chunks = preg_split('/(,|[\n\r]+)/', $list);
        foreach ($chunks as $chunk) {
            if (empty($chunk)) {
                continue;
            }
            // remove TinyMCE element control characters
            if (!strpos($chunk, '[')) {
                // no '[' present, or '[' is the very first character:
                // the whole chunk is taken as the element name
                $element = $chunk;
                $attr = false;
            } else {
                list($element, $attr) = explode('[', $chunk);
            }
            if ($element !== '*') {
                $elements[$element] = true;
            }
            if (!$attr) {
                continue;
            }
            $attr = substr($attr, 0, strlen($attr) - 1); // remove trailing ]
            $attr = explode('|', $attr);
            foreach ($attr as $key) {
                $attributes["$element.$key"] = true;
            }
        }
        return array($elements, $attributes);
    }
}





/**
 * Represents an XHTML 1.1 module, with information on elements, tags
 * and attributes.
 * @note Even though this is technically XHTML 1.1, it is also used for
 *       regular HTML parsing. We are using modulization as a convenient
 *       way to represent the internals of HTMLDefinition, and our
 *       implementation is by no means conforming and does not directly
 *       use the normative DTDs or XML schemas.
 * @note The public variables in a module should almost directly
 *       correspond to the variables in HTMLPurifier_HTMLDefinition.
 *       However, the prefix info carries no special meaning in these
 *       objects (include it anyway if that's the correspondence though).
 * @todo Consider making some member functions protected
 */

class HTMLPurifier_HTMLModule
{

    // -- Overloadable ----------------------------------------------------

    /**
     * Short unique string identifier of the module.
     * @type string
     */
    public $name;

    /**
     * Informally, a list of elements this module changes.
     * Not used in any significant way.
     * @type array
     */
    public $elements = array();

    /**
     * Associative array of element names to element definitions.
     * Some definitions may be incomplete, to be merged in later
     * with the full definition.
     * @type array
     */
    public $info = array();

    /**
     * Associative array of content set names to content set additions.
     * This is commonly used to, say, add an A element to the Inline
     * content set. This corresponds to an internal variable $content_sets
     * and NOT info_content_sets member variable of HTMLDefinition.
* @type array
     */
    public $content_sets = array();

    /**
     * Additions to attribute collections, keyed by collection name.
     * More rarely used for adding attributes to the global collections.
     * Example is the StyleAttribute module adding the style attribute to
     * the Core. Corresponds to HTMLDefinition's attr_collections->info,
     * since the object's data is only info, with extra behavior
     * associated with it.
     * @type array
     */
    public $attr_collections = array();

    /**
     * HTMLPurifier_TagTransform objects, keyed by deprecated tag name.
     * @type array
     */
    public $info_tag_transform = array();

    /**
     * List of HTMLPurifier_AttrTransform to be performed before validation.
     * @type array
     */
    public $info_attr_transform_pre = array();

    /**
     * List of HTMLPurifier_AttrTransform to be performed after validation.
     * @type array
     */
    public $info_attr_transform_post = array();

    /**
     * List of HTMLPurifier_Injector to be performed during
     * well-formedness fixing. An injector will only be invoked if all of
     * its pre-requisites are met; if an injector fails setup, there will
     * be no error; it will simply be silently disabled.
     * @type array
     */
    public $info_injector = array();

    /**
     * Boolean flag that indicates whether or not getChildDef is implemented.
     * For optimization reasons: may save a call to a function. Be sure
     * to set it if you do implement getChildDef(), otherwise it will have
     * no effect!
     * @type bool
     */
    public $defines_child_def = false;

    /**
     * Boolean flag whether or not this module is safe. If it is not safe,
     * all of its members are unsafe. Modules are safe by default (this
     * might be slightly dangerous, but it doesn't make much sense to force
     * HTML Purifier, which is based off of safe HTML, to explicitly say,
     * "This is safe," even though there are modules which are "unsafe")
     *
     * @type bool
     * @note Previously, safety could be applied at an element level granularity.
     *       We've removed this ability, so in order to add "unsafe" elements
     *       or attributes, a dedicated module with this property set to false
     *       must be used.
     */
    public $safe = true;

    /**
     * Retrieves a proper HTMLPurifier_ChildDef subclass based on
     * content_model and content_model_type member variables of
     * the HTMLPurifier_ElementDef class. There is a similar function
     * in HTMLPurifier_HTMLDefinition. This base implementation always
     * returns false.
     * @param HTMLPurifier_ElementDef $def
     * @return HTMLPurifier_ChildDef subclass
     */
    public function getChildDef($def)
    {
        return false;
    }

    // -- Convenience -----------------------------------------------------

    /**
     * Convenience function that sets up a new element
     * @param string $element Name of element to add
     * @param string|bool $type What content set should element be registered to?
     *              Set as false to skip this step.
     * @param string $contents Allowed children in form of:
     *              "$content_model_type: $content_model"
     * @param array $attr_includes What attribute collections to register to
     *              element?
     * @param array $attr What unique attributes does the element define?
     * @see HTMLPurifier_ElementDef:: for in-depth descriptions of these parameters.
     * @return HTMLPurifier_ElementDef Created element definition object, so you
     *         can set advanced parameters
     */
    public function addElement($element, $type, $contents, $attr_includes = array(), $attr = array())
    {
        $this->elements[] = $element;

        // split "$content_model_type: $content_model" into its two halves
        list($model_type, $model) = $this->parseContents($contents);

        // the attribute inclusions travel inside the attribute array
        $this->mergeInAttrIncludes($attr, $attr_includes);

        // a falsy $type means the element joins no content set
        if ($type) {
            $this->addElementToContentSet($element, $type);
        }

        $created = HTMLPurifier_ElementDef::create($model, $model_type, $attr);
        $this->info[$element] = $created;

        // literal object $contents means direct child manipulation
        if (!is_string($contents)) {
            $this->info[$element]->child = $contents;
        }

        return $this->info[$element];
    }

    /**
     * Convenience function that creates a totally blank, non-standalone
     * element.
6072 * @param string $element Name of element to create 6073 * @return HTMLPurifier_ElementDef Created element 6074 */ 6075 public function addBlankElement($element) 6076 { 6077 if (!isset($this->info[$element])) { 6078 $this->elements[] = $element; 6079 $this->info[$element] = new HTMLPurifier_ElementDef(); 6080 $this->info[$element]->standalone = false; 6081 } else { 6082 trigger_error("Definition for $element already exists in module, cannot redefine"); 6083 } 6084 return $this->info[$element]; 6085 } 6086 6087 /** 6088 * Convenience function that registers an element to a content set 6089 * @param string $element Element to register 6090 * @param string $type Name content set (warning: case sensitive, usually upper-case 6091 * first letter) 6092 */ 6093 public function addElementToContentSet($element, $type) 6094 { 6095 if (!isset($this->content_sets[$type])) { 6096 $this->content_sets[$type] = ''; 6097 } else { 6098 $this->content_sets[$type] .= ' | '; 6099 } 6100 $this->content_sets[$type] .= $element; 6101 } 6102 6103 /** 6104 * Convenience function that transforms single-string contents 6105 * into separate content model and content model type 6106 * @param string $contents Allowed children in form of: 6107 * "$content_model_type: $content_model" 6108 * @return array 6109 * @note If contents is an object, an array of two nulls will be 6110 * returned, and the callee needs to take the original $contents 6111 * and use it directly. 
*/
    public function parseContents($contents)
    {
        if (!is_string($contents)) {
            return array(null, null);
        } // defer
        switch ($contents) {
            // check for shorthand content model forms
            case 'Empty':
                return array('empty', '');
            case 'Inline':
                return array('optional', 'Inline | #PCDATA');
            case 'Flow':
                return array('optional', 'Flow | #PCDATA');
        }
        // Split on ':' and use the first two pieces. A string without any
        // ':' yields an empty content model instead of reading a missing
        // array offset and handing null to trim().
        $pieces = explode(':', $contents);
        $content_model_type = strtolower(trim($pieces[0]));
        $content_model = isset($pieces[1]) ? trim($pieces[1]) : '';
        return array($content_model_type, $content_model);
    }

    /**
     * Convenience function that merges a list of attribute includes into
     * an attribute array. The includes are stored under index 0 of the
     * attribute array.
     * @param array $attr Reference to attr array to modify
     * @param array $attr_includes Array of includes / string include to merge in
     */
    public function mergeInAttrIncludes(&$attr, $attr_includes)
    {
        if (!is_array($attr_includes)) {
            if (empty($attr_includes)) {
                // an empty non-array value means "no includes"
                $attr_includes = array();
            } else {
                // a single include is wrapped into a one-element list
                $attr_includes = array($attr_includes);
            }
        }
        $attr[0] = $attr_includes;
    }

    /**
     * Convenience function that generates a lookup table with boolean
     * true as value.
     * @param string|array $list Array of values to turn into a lookup,
     *        or the first of the values when passing them as arguments
     * @note You can also pass an arbitrary number of arguments in
     *       place of the regular argument
     * @return array array equivalent of list
     */
    public function makeLookup($list)
    {
        if (is_string($list)) {
            // variadic form: every argument is a value
            $list = func_get_args();
        }
        $ret = array();
        foreach ($list as $value) {
            // null entries are skipped
            if (is_null($value)) {
                continue;
            }
            $ret[$value] = true;
        }
        return $ret;
    }

    /**
     * Lazy load construction of the module after determining whether
     * or not it's needed, and also when a finalized configuration object
     * is available. This base implementation does nothing.
     * @param HTMLPurifier_Config $config
     */
    public function setup($config)
    {
    }
}





class HTMLPurifier_HTMLModuleManager
{

    /**
     * @type HTMLPurifier_DoctypeRegistry
     */
    public $doctypes;

    /**
     * Instance of current doctype, as produced by the doctype registry
     * during setup(); its modules and tidyModules properties are read
     * there.
     * @type HTMLPurifier_Doctype
     */
    public $doctype;

    /**
     * @type HTMLPurifier_AttrTypes
     */
    public $attrTypes;

    /**
     * Active instances of modules for the specified doctype are
     * indexed, by name, in this array.
     * @type HTMLPurifier_HTMLModule[]
     */
    public $modules = array();

    /**
     * Array of recognized HTMLPurifier_HTMLModule instances,
     * indexed by module's class name. This array is usually lazy loaded, but a
     * user can overload a module by pre-emptively registering it.
     * @type HTMLPurifier_HTMLModule[]
     */
    public $registeredModules = array();

    /**
     * List of extra modules that were added by the user
     * using addModule(). These get unconditionally merged into the current doctype, whatever
     * it may be.
     * @type HTMLPurifier_HTMLModule[]
     */
    public $userModules = array();

    /**
     * Associative array of element name to list of modules that have
     * definitions for the element; this array is dynamically filled.
     * @type array
     */
    public $elementLookup = array();

    /**
     * List of prefixes we should use for registering small names.
     * @type array
     */
    public $prefixes = array('HTMLPurifier_HTMLModule_');

    /**
     * @type HTMLPurifier_ContentSets
     */
    public $contentSets;

    /**
     * @type HTMLPurifier_AttrCollections
     */
    public $attrCollections;

    /**
     * If set to true, unsafe elements and attributes will be allowed.
* @type bool
     */
    public $trusted = false;

    /**
     * Creates the attribute type and doctype registries and registers the
     * five built-in doctypes together with their module lists.
     */
    public function __construct()
    {
        // editable internal objects
        $this->attrTypes = new HTMLPurifier_AttrTypes();
        $this->doctypes = new HTMLPurifier_DoctypeRegistry();
        $registry = $this->doctypes;

        // setup basic modules
        $common = array(
            'CommonAttributes', 'Text', 'Hypertext', 'List',
            'Presentation', 'Edit', 'Bdo', 'Tables', 'Image',
            'StyleAttribute',
            // Unsafe:
            'Scripting', 'Object', 'Forms',
            // Sorta legacy, but present in strict:
            'Name',
        );
        $transitional = array('Legacy', 'Target', 'Iframe');
        $xml = array('XMLCommonAttributes');
        $non_xml = array('NonXMLCommonAttributes');

        // setup basic doctypes
        $registry->register(
            'HTML 4.01 Transitional',
            false,
            array_merge($common, $transitional, $non_xml),
            array('Tidy_Transitional', 'Tidy_Proprietary'),
            array(),
            '-//W3C//DTD HTML 4.01 Transitional//EN',
            'http://www.w3.org/TR/html4/loose.dtd'
        );

        $registry->register(
            'HTML 4.01 Strict',
            false,
            array_merge($common, $non_xml),
            array('Tidy_Strict', 'Tidy_Proprietary', 'Tidy_Name'),
            array(),
            '-//W3C//DTD HTML 4.01//EN',
            'http://www.w3.org/TR/html4/strict.dtd'
        );

        $registry->register(
            'XHTML 1.0 Transitional',
            true,
            array_merge($common, $transitional, $xml, $non_xml),
            array('Tidy_Transitional', 'Tidy_XHTML', 'Tidy_Proprietary', 'Tidy_Name'),
            array(),
            '-//W3C//DTD XHTML 1.0 Transitional//EN',
            'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'
        );

        $registry->register(
            'XHTML 1.0 Strict',
            true,
            array_merge($common, $xml, $non_xml),
            array('Tidy_Strict', 'Tidy_XHTML', 'Tidy_Strict', 'Tidy_Proprietary', 'Tidy_Name'),
            array(),
            '-//W3C//DTD XHTML 1.0 Strict//EN',
            'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd'
        );

        $registry->register(
            'XHTML 1.1',
            true,
            // Iframe is a real XHTML 1.1 module, despite being
            // "transitional"!
            array_merge($common, $xml, array('Ruby', 'Iframe')),
            array('Tidy_Strict', 'Tidy_XHTML', 'Tidy_Proprietary', 'Tidy_Strict', 'Tidy_Name'), // Tidy_XHTML1_1
            array(),
            '-//W3C//DTD XHTML 1.1//EN',
            'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd'
        );

    }

    /**
     * Registers a module to the recognized module list, useful for
     * overloading pre-existing modules.
     * @param $module Mixed: string module name, with or without
     *                HTMLPurifier_HTMLModule prefix, or instance of
     *                subclass of HTMLPurifier_HTMLModule.
     * @param $overload Boolean whether or not to overload previous modules.
     *                  If this is not set, and you do overload a module,
     *                  HTML Purifier will complain with a warning.
     * @note This function will not call autoload, you must instantiate
     *       (and thus invoke) autoload outside the method.
     * @note If a string is passed as a module name, different variants
     *       will be tested in this order:
     *          - Check all prefixes with $name in order they were added
     *            (HTMLPurifier_HTMLModule_ is the initial prefix)
     *          - Check for literal object name
     *          - Throw fatal error
     *       If your object name collides with an internal class, specify
     *       your module manually. All modules must have been included
     *       externally: registerModule will not perform inclusions for you!
     */
    public function registerModule($module, $overload = false)
    {
        if (is_string($module)) {
            // attempt to load the module: the first prefixed class name
            // that exists wins
            $requested = $module;
            $class = null;
            foreach ($this->prefixes as $prefix) {
                $candidate = $prefix . $requested;
                if (class_exists($candidate)) {
                    $class = $candidate;
                    break;
                }
            }
            if ($class === null) {
                // no prefixed class exists; try the name as given
                if (!class_exists($requested)) {
                    trigger_error(
                        $requested . ' module does not exist',
                        E_USER_ERROR
                    );
                    return;
                }
                $class = $requested;
            }
            $module = new $class();
        }
        if (empty($module->name)) {
            trigger_error('Module instance of ' . get_class($module) . ' must have name');
            return;
        }
        // replacing an already registered module without asking for it
        // is reported, but still carried out
        if (!$overload && isset($this->registeredModules[$module->name])) {
            trigger_error('Overloading ' . $module->name . ' without explicit overload parameter', E_USER_WARNING);
        }
        $this->registeredModules[$module->name] = $module;
    }

    /**
     * Adds a module to the current doctype by first registering it,
     * and then tacking it on to the active doctype
     */
    public function addModule($module)
    {
        $this->registerModule($module);
        // only the module's name is remembered in the user module list
        $this->userModules[] = is_object($module) ? $module->name : $module;
    }

    /**
     * Adds a class prefix that registerModule() will use to resolve a
     * string name to a concrete class
     */
    public function addPrefix($prefix)
    {
        $this->prefixes[] = $prefix;
    }

    /**
     * Performs processing on modules, after being called you may
     * use getElement() and getElements()
     * @param HTMLPurifier_Config $config
     */
    public function setup($config)
    {
        $this->trusted = $config->get('HTML.Trusted');

        // generate
        $this->doctype = $this->doctypes->make($config);
        $modules = $this->doctype->modules;

        // take out the default modules that aren't allowed
        $lookup = $config->get('HTML.AllowedModules');
        $special_cases = $config->get('HTML.CoreModules');

        if (is_array($lookup)) {
            foreach ($modules as $k => $m) {
                // core modules always stay; others must be listed
                if (!isset($special_cases[$m]) && !isset($lookup[$m])) {
                    unset($modules[$k]);
                }
            }
        }

        // custom modules, in the order they must be appended
        // NB: HTML.TargetNoreferrer and HTML.TargetNoopener must be AFTER HTML.TargetBlank
        // so that its post-attr-transform gets run afterwards.
        $switches = array(
            'HTML.Proprietary' => 'Proprietary',
            'HTML.SafeObject' => 'SafeObject',
            'HTML.SafeEmbed' => 'SafeEmbed',
            'HTML.SafeScripting' => 'SafeScripting',
            'HTML.Nofollow' => 'Nofollow',
            'HTML.TargetBlank' => 'TargetBlank',
            'HTML.TargetNoreferrer' => 'TargetNoreferrer',
            'HTML.TargetNoopener' => 'TargetNoopener',
        );
        foreach ($switches as $directive => $module_name) {
            $setting = $config->get($directive);
            // HTML.SafeScripting counts as enabled whenever it is anything
            // other than an empty array; the rest are plain truthiness.
            if ($directive === 'HTML.SafeScripting') {
                $enabled = ($setting !== array());
            } else {
                $enabled = (bool) $setting;
            }
            if ($enabled) {
                $modules[] = $module_name;
            }
        }

        // merge in custom modules
        $modules = array_merge($modules, $this->userModules);

        foreach ($modules as $module) {
            $this->processModule($module);
            $this->modules[$module]->setup($config);
        }

        foreach ($this->doctype->tidyModules as $module) {
            $this->processModule($module);
            $this->modules[$module]->setup($config);
        }

        // prepare any injectors: names are turned into instances and the
        // list is re-keyed by injector name
        foreach ($this->modules as $module) {
            $by_name = array();
            foreach ($module->info_injector as $injector) {
                if (!is_object($injector)) {
                    $class = "HTMLPurifier_Injector_$injector";
                    $injector = new $class;
                }
                $by_name[$injector->name] = $injector;
            }
            $module->info_injector = $by_name;
        }

        // setup lookup table based on all valid modules
        foreach ($this->modules as $module) {
            foreach (array_keys($module->info) as $name) {
                if (!isset($this->elementLookup[$name])) {
                    $this->elementLookup[$name] = array();
                }
                $this->elementLookup[$name][] = $module->name;
            }
        }

        // note the different choice
        $this->contentSets = new HTMLPurifier_ContentSets(
            // content set assembly deals with all possible modules,
            // not just ones deemed to be "safe"
            $this->modules
        );
        $this->attrCollections = new HTMLPurifier_AttrCollections(
            $this->attrTypes,
            // there is no way to directly disable a global attribute,
            // but using AllowedAttributes or simply not including
            // the module in your custom doctype should be sufficient
            $this->modules
        );
    }

    /**
     * Takes a module and adds it to the active module collection,
     * registering it if necessary.
     */
    public function processModule($module)
    {
        if (!isset($this->registeredModules[$module]) || is_object($module)) {
            $this->registerModule($module);
        }
        $this->modules[$module] = $this->registeredModules[$module];
    }

    /**
     * Retrieves merged element definitions.
     * @return Array of HTMLPurifier_ElementDef
     */
    public function getElements()
    {
        $merged = array();
        foreach ($this->modules as $module) {
            // unsafe modules only contribute when trusted
            if (!$this->trusted && !$module->safe) {
                continue;
            }
            foreach ($module->info as $name => $v) {
                // each element name is resolved once
                if (!isset($merged[$name])) {
                    $merged[$name] = $this->getElement($name);
                }
            }
        }

        // remove dud elements, this happens when an element that
        // appeared to be safe actually wasn't
        foreach ($merged as $name => $definition) {
            if ($definition === false) {
                unset($merged[$name]);
            }
        }

        return $merged;

    }

    /**
     * Retrieves a single merged element definition
     * @param string $name Name of element
     * @param bool $trusted Boolean trusted overriding parameter: set to true
     *                      if you want the full version of an element
     * @return HTMLPurifier_ElementDef Merged HTMLPurifier_ElementDef
     * @note You may notice that modules are getting iterated over twice (once
     *       in getElements() and once here).
* This is because getElements() only discovers which element names the
     *       active modules define, while the per-module definitions of a
     *       single element are merged here.
     */
    public function getElement($name, $trusted = null)
    {
        if (!isset($this->elementLookup[$name])) {
            return false;
        }

        // setup global state variables
        $def = false;
        if ($trusted === null) {
            $trusted = $this->trusted;
        }

        // iterate through each module that has registered itself to this
        // element
        foreach ($this->elementLookup[$name] as $module_name) {
            $module = $this->modules[$module_name];

            // refuse to create/merge from a module that is deemed unsafe--
            // pretend the module doesn't exist--when trusted mode is not on.
            if (!$trusted && !$module->safe) {
                continue;
            }

            // clone is used because, ideally speaking, the original
            // definition should not be modified. Usually, this will
            // make no difference, but for consistency's sake
            $new_def = clone $module->info[$name];

            if (!$def && $new_def->standalone) {
                $def = $new_def;
            } elseif ($def) {
                // This will occur even if $new_def is standalone. In practice,
                // this will usually result in a full replacement.
                $def->mergeIn($new_def);
            } else {
                // :TODO:
                // non-standalone definitions that don't have a standalone
                // to merge into could be deferred to the end
                // HOWEVER, it is perfectly valid for a non-standalone
                // definition to lack a standalone definition, even
                // after all processing: this allows us to safely
                // specify extra attributes for elements that may not be
                // enabled all in one place.  In particular, this might
                // be the case for trusted elements.  WARNING: care must
                // be taken that the /extra/ definitions are all safe.
                continue;
            }

            // attribute value expansions
            $this->attrCollections->performInclusions($def->attr);
            $this->attrCollections->expandIdentifiers($def->attr, $this->attrTypes);

            // descendants_are_inline, for ChildDef_Chameleon
            if (is_string($def->content_model) &&
                strpos($def->content_model, 'Inline') !== false) {
                if ($name != 'del' && $name != 'ins') {
                    // this is for you, ins/del
                    $def->descendants_are_inline = true;
                }
            }

            $this->contentSets->generateChildDef($def, $module);
        }

        // This can occur if there is a blank definition, but no base to
        // mix it in with
        if (!$def) {
            return false;
        }

        // add information on required attributes
        foreach ($def->attr as $attr_name => $attr_def) {
            if ($attr_def->required) {
                $def->required_attr[] = $attr_name;
            }
        }
        return $def;
    }
}





/**
 * Component of HTMLPurifier_AttrContext that accumulates IDs to prevent dupes
 * @note In Slashdot-speak, dupe means duplicate.
 * @note The default constructor does not accept $config or $context objects:
 *       you must use the static build() factory method to perform
 *       initialization.
 */
class HTMLPurifier_IDAccumulator
{

    /**
     * Lookup table of IDs we've accumulated.
     * @public
     */
    public $ids = array();

    /**
     * Builds an IDAccumulator, also initializing the default blacklist
     * @param HTMLPurifier_Config $config Instance of HTMLPurifier_Config
     * @param HTMLPurifier_Context $context Instance of HTMLPurifier_Context
     * @return HTMLPurifier_IDAccumulator Fully initialized HTMLPurifier_IDAccumulator
     */
    public static function build($config, $context)
    {
        $id_accumulator = new HTMLPurifier_IDAccumulator();
        // blacklisted IDs are pre-loaded so that add() reports them as dupes
        $id_accumulator->load($config->get('Attr.IDBlacklist'));
        return $id_accumulator;
    }

    /**
     * Add an ID to the lookup table.
     * @param string $id ID to be added.
     * @return bool status, true if success, false if there's a dupe
     */
    public function add($id)
    {
        if (isset($this->ids[$id])) {
            return false;
        }
        return $this->ids[$id] = true;
    }

    /**
     * Load a list of IDs into the lookup table
     * @param $array_of_ids Array of IDs to load
     * @note This function doesn't care about duplicates
     */
    public function load($array_of_ids)
    {
        foreach ($array_of_ids as $id) {
            $this->ids[$id] = true;
        }
    }
}





/**
 * Injects tokens into the document while parsing for well-formedness.
 * This enables "formatter-like" functionality such as auto-paragraphing,
 * smiley-ification and linkification to take place.
 *
 * A note on how handlers create changes; this is done by assigning a new
 * value to the $token reference. These values can take a variety of forms and
 * are best described in the
 * HTMLPurifier_Strategy_MakeWellFormed->processToken() documentation.
 *
 * @todo Allow injectors to request a re-run on their output. This
 *       would help if an operation is recursive.
 */
abstract class HTMLPurifier_Injector
{

    /**
     * Advisory name of injector, this is for friendly error messages.
     * @type string
     */
    public $name;

    /**
     * @type HTMLPurifier_HTMLDefinition
     */
    protected $htmlDefinition;

    /**
     * Reference to CurrentNesting variable in Context. This is an array
     * list of tokens that we are currently "inside"
     * @type array
     */
    protected $currentNesting;

    /**
     * Reference to current token.
     * @type HTMLPurifier_Token
     */
    protected $currentToken;

    /**
     * Reference to InputZipper variable in Context.
     * @type HTMLPurifier_Zipper
     */
    protected $inputZipper;

    /**
     * Array of elements and attributes this injector creates and therefore
     * need to be allowed by the definition. Takes form of
     * array('element' => array('attr', 'attr2'), 'element2')
     * @type array
     */
    public $needed = array();

    /**
     * Number of elements to rewind backwards (relative).
     * @type bool|int
     */
    protected $rewindOffset = false;

    /**
     * Rewind to a spot to re-perform processing. This is useful if you
     * deleted a node, and now need to see if this change affected any
     * earlier nodes. Rewinding does not affect other injectors, and can
     * result in infinite loops if not used carefully.
     * @param bool|int $offset
     * @warning HTML Purifier will prevent you from fast-forwarding with this
     *          function.
     */
    public function rewindOffset($offset)
    {
        $this->rewindOffset = $offset;
    }

    /**
     * Retrieves rewind offset, and then unsets it.
     * @return bool|int
     */
    public function getRewindOffset()
    {
        $r = $this->rewindOffset;
        $this->rewindOffset = false;
        return $r;
    }

    /**
     * Prepares the injector by giving it the config and context objects:
     * this allows references to important variables to be made within
     * the injector. This function also checks if the HTML environment
     * will work with the Injector (see checkNeeded()).
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool|string Boolean false if success, string of missing needed element/attribute if failure
     */
    public function prepare($config, $context)
    {
        $this->htmlDefinition = $config->getHTMLDefinition();
        // Even though this might fail, some unit tests ignore this and
        // still test checkNeeded, so be careful. Maybe get rid of that
        // dependency.
        $result = $this->checkNeeded($config);
        if ($result !== false) {
            return $result;
        }
        $this->currentNesting =& $context->get('CurrentNesting');
        $this->currentToken   =& $context->get('CurrentToken');
        $this->inputZipper    =& $context->get('InputZipper');
        return false;
    }

    /**
     * This function checks if the HTML environment
     * will work with the Injector: if p tags are not allowed, the
     * Auto-Paragraphing injector should not be enabled.
     * @param HTMLPurifier_Config $config
     * @return bool|string Boolean false if success, string of missing needed element/attribute if failure
     */
    public function checkNeeded($config)
    {
        $def = $config->getHTMLDefinition();
        foreach ($this->needed as $element => $attributes) {
            // integer key: the value is a bare element name without attributes
            if (is_int($element)) {
                $element = $attributes;
            }
            if (!isset($def->info[$element])) {
                return $element;
            }
            if (!is_array($attributes)) {
                continue;
            }
            foreach ($attributes as $name) {
                if (!isset($def->info[$element]->attr[$name])) {
                    return "$element.$name";
                }
            }
        }
        return false;
    }

    /**
     * Tests if the context node allows a certain element
     * @param string $name Name of element to test for
     * @return bool True if element is allowed, false if it is not
     */
    public function allowsElement($name)
    {
        if (!empty($this->currentNesting)) {
            // peek at the innermost open element (pop, then push it back)
            $parent_token = array_pop($this->currentNesting);
            $this->currentNesting[] = $parent_token;
            $parent = $this->htmlDefinition->info[$parent_token->name];
        } else {
            $parent = $this->htmlDefinition->info_parent_def;
        }
        if (!isset($parent->child->elements[$name]) || isset($parent->excludes[$name])) {
            return false;
        }
        // check for exclusion
        if (!empty($this->currentNesting)) {
            // the innermost element was handled above, so start one below it
            for ($i = count($this->currentNesting) - 2; $i >= 0; $i--) {
                $node = $this->currentNesting[$i];
                $def  = $this->htmlDefinition->info[$node->name];
                if (isset($def->excludes[$name])) {
                    return false;
                }
            }
        }
        return true;
    }

    /**
     * Iterator function, which starts with the next token and continues until
     * you reach the end of the input tokens.
     * @warning Please prevent previous references from interfering with this
     *          functions by setting $i = null beforehand!
     * @param int $i Current integer index variable for inputTokens
     * @param HTMLPurifier_Token $current Current token variable.
     *          Do NOT use $token, as that variable is also a reference
     * @return bool
     */
    protected function forward(&$i, &$current)
    {
        // walks $this->inputZipper->back from its last index down to 0
        if ($i === null) {
            $i = count($this->inputZipper->back) - 1;
        } else {
            $i--;
        }
        if ($i < 0) {
            return false;
        }
        $current = $this->inputZipper->back[$i];
        return true;
    }

    /**
     * Similar to forward(), but accepts a third parameter $nesting (which
     * should be initialized at 0) and stops when we hit the end tag
     * for the node $this->inputIndex starts in.
     * @param int $i Current integer index variable for inputTokens
     * @param HTMLPurifier_Token $current Current token variable.
     *          Do NOT use $token, as that variable is also a reference
     * @param int $nesting
     * @return bool
     */
    protected function forwardUntilEndToken(&$i, &$current, &$nesting)
    {
        $result = $this->forward($i, $current);
        if (!$result) {
            return false;
        }
        if ($nesting === null) {
            $nesting = 0;
        }
        if ($current instanceof HTMLPurifier_Token_Start) {
            $nesting++;
        } elseif ($current instanceof HTMLPurifier_Token_End) {
            // an end tag at depth 0 closes the node we started in
            if ($nesting <= 0) {
                return false;
            }
            $nesting--;
        }
        return true;
    }

    /**
     * Iterator function, starts with the previous token and continues until
     * you reach the beginning of input tokens.
     * @warning Please prevent previous references from interfering with this
     *          functions by setting $i = null beforehand!
     * @param int $i Current integer index variable for inputTokens
     * @param HTMLPurifier_Token $current Current token variable.
     *          Do NOT use $token, as that variable is also a reference
     * @return bool
     */
    protected function backward(&$i, &$current)
    {
        // walks $this->inputZipper->front from its last index down to 0
        if ($i === null) {
            $i = count($this->inputZipper->front) - 1;
        } else {
            $i--;
        }
        if ($i < 0) {
            return false;
        }
        $current = $this->inputZipper->front[$i];
        return true;
    }

    /**
     * Handler that is called when a text token is processed
     */
    public function handleText(&$token)
    {
    }

    /**
     * Handler that is called when a start or empty token is processed
     */
    public function handleElement(&$token)
    {
    }

    /**
     * Handler that is called when an end token is processed
     */
    public function handleEnd(&$token)
    {
        $this->notifyEnd($token);
    }

    /**
     * Notifier that is called when an end token is processed
     * @param HTMLPurifier_Token $token Current token variable.
     * @note This differs from handlers in that the token is read-only
     * @deprecated
     */
    public function notifyEnd($token)
    {
    }
}





/**
 * Represents a language and defines localizable string formatting and
 * other functions, as well as the localized messages for HTML Purifier.
 */
class HTMLPurifier_Language
{

    /**
     * ISO 639 language code of language. Prefers shortest possible version.
     * @type string
     */
    public $code = 'en';

    /**
     * Fallback language code.
     * @type bool|string
     */
    public $fallback = false;

    /**
     * Array of localizable messages.
     * @type array
     */
    public $messages = array();

    /**
     * Array of localizable error codes.
     * @type array
     */
    public $errorNames = array();

    /**
     * True if no message file was found for this language, so English
     * is being used instead. Check this if you'd like to notify the
     * user that they've used a non-supported language.
     * @type bool
     */
    public $error = false;

    /**
     * Has the language object been loaded yet?
     * @type bool
     * @todo Make it private, fix usage in HTMLPurifier_LanguageTest
     */
    public $_loaded = false;

    /**
     * @type HTMLPurifier_Config
     */
    protected $config;

    /**
     * @type HTMLPurifier_Context
     */
    protected $context;

    /**
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     */
    public function __construct($config, $context)
    {
        $this->config  = $config;
        $this->context = $context;
    }

    /**
     * Loads language object with necessary info from factory cache
     * @note This is a lazy loader
     */
    public function load()
    {
        if ($this->_loaded) {
            return;
        }
        $factory = HTMLPurifier_LanguageFactory::instance();
        $factory->loadLanguage($this->code);
        // copy every key the factory tracks onto the same-named property
        foreach ($factory->keys as $key) {
            $this->$key = $factory->cache[$this->code][$key];
        }
        $this->_loaded = true;
    }

    /**
     * Retrieves a localised message.
     * @param string $key string identifier of message
     * @return string localised message, or "[$key]" when it is unknown
     */
    public function getMessage($key)
    {
        if (!$this->_loaded) {
            $this->load();
        }
        if (!isset($this->messages[$key])) {
            return "[$key]";
        }
        return $this->messages[$key];
    }

    /**
     * Retrieves a localised error name.
     * @param int $int error number, corresponding to PHP's error reporting
     * @return string localised message, or "[Error: $int]" when it is unknown
     */
    public function getErrorName($int)
    {
        if (!$this->_loaded) {
            $this->load();
        }
        if (!isset($this->errorNames[$int])) {
            return "[Error: $int]";
        }
        return $this->errorNames[$int];
    }

    /**
     * Converts an array list into a string readable representation
     * @param array $array Values to join; keys are ignored, so sparse or
     *                     string-keyed arrays are accepted as well
     * @return string
     */
    public function listify($array)
    {
        $sep      = $this->getMessage('Item separator');
        $sep_last = $this->getMessage('Item separator last');
        // Re-index first: addressing $array[$i] directly only works for
        // gap-free, zero-based lists.
        $items = array_values($array);
        $ret = '';
        for ($i = 0, $c = count($items); $i < $c; $i++) {
            if ($i > 0) {
                // the final pair is joined with the "last" separator
                $ret .= ($i + 1 < $c) ? $sep : $sep_last;
            }
            $ret .= $items[$i];
        }
        return $ret;
    }

    /**
     * Formats a localised message with passed parameters
     * @param string $key string identifier of message
     * @param array $args Parameters to substitute in
     * @return string localised message
     * @todo Implement conditionals? Right now, some messages make
     *     reference to line numbers, but those aren't always available
     */
    public function formatMessage($key, $args = array())
    {
        if (!$this->_loaded) {
            $this->load();
        }
        if (!isset($this->messages[$key])) {
            return "[$key]";
        }
        $raw = $this->messages[$key];
        $subst = array();
        $generator = false;
        foreach ($args as $i => $value) {
            if (is_object($value)) {
                if ($value instanceof HTMLPurifier_Token) {
                    // factor this out some time
                    if (!$generator) {
                        // fetched lazily, only once a token has to be rendered
                        $generator = $this->context->get('Generator');
                    }
                    if (isset($value->name)) {
                        $subst['$'.$i.'.Name'] = $value->name;
                    }
                    if (isset($value->data)) {
                        $subst['$'.$i.'.Data'] = $value->data;
                    }
                    $subst['$'.$i.'.Compact'] =
                    $subst['$'.$i.'.Serialized'] = $generator->generateFromToken($value);
                    // a more complex algorithm for compact representation
                    // could be introduced for all types of tokens. This
                    // may need to be factored out into a dedicated class
                    if (!empty($value->attr)) {
                        $stripped_token = clone $value;
                        $stripped_token->attr = array();
                        $subst['$'.$i.'.Compact'] = $generator->generateFromToken($stripped_token);
                    }
                    $subst['$'.$i.'.Line'] = $value->line ? $value->line : 'unknown';
                }
                // objects other than tokens produce no substitution
                continue;
            } elseif (is_array($value)) {
                $keys = array_keys($value);
                if (array_keys($keys) === $keys) {
                    // list
                    $subst['$'.$i] = $this->listify($value);
                } else {
                    // associative array
                    // no $i implementation yet, sorry
                    $subst['$'.$i.'.Keys'] = $this->listify($keys);
                    $subst['$'.$i.'.Values'] = $this->listify(array_values($value));
                }
                continue;
            }
            $subst['$' . $i] = $value;
        }
        return strtr($raw, $subst);
    }
}





/**
 * Class responsible for generating HTMLPurifier_Language objects, managing
 * caching and fallbacks.
* @note Thanks to MediaWiki for the general logic, although this version
 *       has been entirely rewritten
 * @todo Serialized cache for languages
 */
class HTMLPurifier_LanguageFactory
{

    /**
     * Cache of language code information used to load HTMLPurifier_Language objects.
     * Structure is: $factory->cache[$language_code][$key] = $value
     * @type array
     */
    public $cache;

    /**
     * Valid keys in the HTMLPurifier_Language object. Designates which
     * variables to slurp out of a message file.
     * @type array
     */
    public $keys = array('fallback', 'messages', 'errorNames');

    /**
     * Instance to validate language codes.
     * @type HTMLPurifier_AttrDef_Lang
     *
     */
    protected $validator;

    /**
     * Directory that contains the Language/ subtree: HTMLPURIFIER_PREFIX
     * followed by '/HTMLPurifier', without trailing slash (see setup()).
     * @type string
     */
    protected $dir;

    /**
     * Keys whose contents are a hash map and can be merged.
     * @type array
     */
    protected $mergeable_keys_map = array('messages' => true, 'errorNames' => true);

    /**
     * Keys whose contents are a list and can be merged.
     * @value array lookup
     */
    protected $mergeable_keys_list = array();

    /**
     * Retrieve sole instance of the factory.
     * @param HTMLPurifier_LanguageFactory $prototype Optional prototype to overload sole instance with,
     *                   or bool true to reset to default factory.
     * @return HTMLPurifier_LanguageFactory
     */
    public static function instance($prototype = null)
    {
        static $instance = null;
        if (is_object($prototype)) {
            $instance = $prototype;
        } elseif ($instance === null || $prototype == true) {
            // Objects were handled above, so a truthy $prototype here is the
            // documented "reset" request rather than a replacement instance.
            $instance = new HTMLPurifier_LanguageFactory();
            $instance->setup();
        }
        return $instance;
    }

    /**
     * Sets up the singleton, much like a constructor
     * @note Prevents people from getting this outside of the singleton
     */
    public function setup()
    {
        $this->validator = new HTMLPurifier_AttrDef_Lang();
        $this->dir = HTMLPURIFIER_PREFIX . '/HTMLPurifier';
    }

    /**
     * Creates a language object, handles class fallbacks
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @param bool|string $code Code to override configuration with. Private parameter.
     * @return HTMLPurifier_Language
     */
    public function create($config, $context, $code = false)
    {
        // validate language code
        if ($code === false) {
            $code = $this->validator->validate(
                $config->get('Core.Language'),
                $config,
                $context
            );
        } else {
            $code = $this->validator->validate($code, $config, $context);
        }
        if ($code === false) {
            $code = 'en'; // malformed code becomes English
        }

        $pcode = str_replace('-', '_', $code); // make valid PHP classname
        // recursion bookkeeping: only incremented and decremented below
        static $depth = 0;

        if ($code == 'en') {
            $lang = new HTMLPurifier_Language($config, $context);
        } else {
            $class = 'HTMLPurifier_Language_' . $pcode;
            $file  = $this->dir . '/Language/classes/' . $code . '.php';
            if (file_exists($file) || class_exists($class, false)) {
                $lang = new $class($config, $context);
            } else {
                // Go fallback
                $raw_fallback = $this->getFallbackFor($code);
                $fallback = $raw_fallback ? $raw_fallback : 'en';
                $depth++;
                $lang = $this->create($config, $context, $fallback);
                if (!$raw_fallback) {
                    // nothing was known about this language at all
                    $lang->error = true;
                }
                $depth--;
            }
        }
        // the object reports the requested code even when a fallback serves it
        $lang->code = $code;
        return $lang;
    }

    /**
     * Returns the fallback language for language
     * @note Loads the original language into cache
     * @param string $code language code
     * @return string|bool
     */
    public function getFallbackFor($code)
    {
        $this->loadLanguage($code);
        return $this->cache[$code]['fallback'];
    }

    /**
     * Loads language into the cache, handles message file and fallbacks
     * @param string $code language code
     */
    public function loadLanguage($code)
    {
        // Languages whose fallback chain is being resolved right now. The
        // variable is static, i.e. shared by all calls, so entries are
        // removed again as soon as their chain has been loaded.
        static $languages_seen = array(); // recursion guard

        // abort if we've already loaded it
        if (isset($this->cache[$code])) {
            return;
        }

        // generate filename
        $filename = $this->dir . '/Language/messages/' . $code . '.php';

        // default fallback : may be overwritten by the ensuing include
        $fallback = ($code != 'en') ? 'en' : false;

        // load primary localisation
        if (!file_exists($filename)) {
            // skip the include: will rely solely on fallback
            $cache = array();
        } else {
            // the message file runs in this scope; the variables named in
            // $this->keys are collected from it afterwards
            include $filename;
            $cache = compact($this->keys);
        }

        // load fallback localisation
        if (!empty($fallback)) {

            // infinite recursion guard
            if (isset($languages_seen[$code])) {
                trigger_error(
                    'Circular fallback reference in language ' .
                    $code,
                    E_USER_ERROR
                );
                $fallback = 'en';
            }
            $languages_seen[$code] = true;

            // load the fallback recursively
            $this->loadLanguage($fallback);
            $fallback_cache = $this->cache[$fallback];

            // chain resolved: a later load of $code is not a cycle
            unset($languages_seen[$code]);

            // merge fallback with current language
            foreach ($this->keys as $key) {
                if (isset($cache[$key]) && isset($fallback_cache[$key])) {
                    if (isset($this->mergeable_keys_map[$key])) {
                        // entries of the language itself win over the fallback
                        $cache[$key] = $cache[$key] + $fallback_cache[$key];
                    } elseif (isset($this->mergeable_keys_list[$key])) {
                        $cache[$key] = array_merge($fallback_cache[$key], $cache[$key]);
                    }
                } else {
                    $cache[$key] = $fallback_cache[$key];
                }
            }
        }

        // save to cache for later retrieval
        $this->cache[$code] = $cache;
        return;
    }
}





/**
 * Represents a measurable length, with a string numeric magnitude
 * and a unit. This object is immutable.
 */
class HTMLPurifier_Length
{

    /**
     * String numeric magnitude.
     * @type string
     */
    protected $n;

    /**
     * String unit. False is permitted if $n = 0.
     * @type string|bool
     */
    protected $unit;

    /**
     * Whether or not this length is valid. Null if not calculated yet.
     * @type bool
     */
    protected $isValid;

    /**
     * Array Lookup array of units recognized by CSS 3
     * @type array
     */
    protected static $allowedUnits = array(
        'em' => true, 'ex' => true, 'px' => true, 'in' => true,
        'cm' => true, 'mm' => true, 'pt' => true, 'pc' => true,
        'ch' => true, 'rem' => true, 'vw' => true, 'vh' => true,
        'vmin' => true, 'vmax' => true
    );

    /**
     * @param string $n Magnitude
     * @param bool|string $u Unit
     */
    public function __construct($n = '0', $u = false)
    {
        $this->n = (string) $n;
        $this->unit = $u !== false ? (string) $u : false;
    }

    /**
     * @param string $s Unit string, like '2em' or '3.4in'
     * @return HTMLPurifier_Length
     * @warning Does not perform validation.
     */
    public static function make($s)
    {
        if ($s instanceof HTMLPurifier_Length) {
            return $s;
        }
        // everything up to the first non-numeric character is the magnitude
        $n_length = strspn($s, '1234567890.+-');
        $n = substr($s, 0, $n_length);
        $unit = substr($s, $n_length);
        if ($unit === '') {
            $unit = false;
        }
        return new HTMLPurifier_Length($n, $unit);
    }

    /**
     * Validates the number and unit.
     * @return bool
     */
    protected function validate()
    {
        // Special case:
        if ($this->n === '+0' || $this->n === '-0') {
            $this->n = '0';
        }
        if ($this->n === '0' && $this->unit === false) {
            return true;
        }
        // Cast first so that a missing unit (false) reaches the string
        // functions as '' instead of as a boolean.
        $unit = (string) $this->unit;
        if (!ctype_lower($unit)) {
            $this->unit = strtolower($unit);
        }
        if (!isset(HTMLPurifier_Length::$allowedUnits[$this->unit])) {
            return false;
        }
        // Hack:
        $def = new HTMLPurifier_AttrDef_CSS_Number();
        $result = $def->validate($this->n, false, false);
        if ($result === false) {
            return false;
        }
        $this->n = $result;
        return true;
    }

    /**
     * Returns string representation of number.
     * @return string|bool Magnitude followed by unit, false if invalid
     */
    public function toString()
    {
        if (!$this->isValid()) {
            return false;
        }
        return $this->n . $this->unit;
    }

    /**
     * Retrieves string numeric magnitude.
     * @return string
     */
    public function getN()
    {
        return $this->n;
    }

    /**
     * Retrieves string unit.
     * @return string
     */
    public function getUnit()
    {
        return $this->unit;
    }

    /**
     * Returns true if this length unit is valid.
     * @return bool
     */
    public function isValid()
    {
        // validate() runs at most once; its verdict is cached
        if ($this->isValid === null) {
            $this->isValid = $this->validate();
        }
        return $this->isValid;
    }

    /**
     * Compares two lengths, and returns 1 if greater, -1 if less and 0 if equal.
     * @param HTMLPurifier_Length $l
     * @return int|bool 1, -1 or 0; false if $l is false or cannot be
     *                  converted to the unit of this length
     * @warning If both values are too large or small, this calculation will
     *          not work properly
     */
    public function compareTo($l)
    {
        if ($l === false) {
            return false;
        }
        if ($l->unit !== $this->unit) {
            $converter = new HTMLPurifier_UnitConverter();
            $l = $converter->convert($l, $this->unit);
            if ($l === false) {
                return false;
            }
        }
        // reduce the difference to its sign, as documented
        $difference = $this->n - $l->n;
        if ($difference > 0) {
            return 1;
        }
        if ($difference < 0) {
            return -1;
        }
        return 0;
    }
}





/**
 * Forgivingly lexes HTML (SGML-style) markup into tokens.
 *
 * A lexer parses a string of SGML-style markup and converts them into
 * corresponding tokens.  It doesn't check for well-formedness, although its
 * internal mechanism may make this automatic (such as the case of
 * HTMLPurifier_Lexer_DOMLex).  There are several implementations to choose
 * from.
 *
 * A lexer is HTML-oriented: it might work with XML, but it's not
 * recommended, as we adhere to a subset of the specification for optimization
 * reasons. This might change in the future. Also, most tokenizers are not
 * expected to handle DTDs or PIs.
 *
 * This class should not be directly instantiated, but you may use create() to
 * retrieve a default copy of the lexer.  Being a supertype, this class
 * does not actually define any implementation, but offers commonly used
 * convenience functions for subclasses.
 *
 * @note The unit tests will instantiate this class for testing purposes, as
 *       many of the utility functions require a class to be instantiated.
7598 * This means that, even though this class is not runnable, it will 7599 * not be declared abstract. 7600 * 7601 * @par 7602 * 7603 * @note 7604 * We use tokens rather than create a DOM representation because DOM would: 7605 * 7606 * @par 7607 * -# Require more processing and memory to create, 7608 * -# Is not streamable, and 7609 * -# Has the entire document structure (html and body not needed). 7610 * 7611 * @par 7612 * However, DOM is helpful in that it makes it easy to move around nodes 7613 * without a lot of lookaheads to see when a tag is closed. This is a 7614 * limitation of the token system and some workarounds would be nice. 7615 */ 7616class HTMLPurifier_Lexer 7617{ 7618 7619 /** 7620 * Whether or not this lexer implements line-number/column-number tracking. 7621 * If it does, set to true. 7622 */ 7623 public $tracksLineNumbers = false; 7624 7625 // -- STATIC ---------------------------------------------------------- 7626 7627 /** 7628 * Retrieves or sets the default Lexer as a Prototype Factory. 7629 * 7630 * By default HTMLPurifier_Lexer_DOMLex will be returned. There are 7631 * a few exceptions involving special features that only DirectLex 7632 * implements. 7633 * 7634 * @note The behavior of this class has changed, rather than accepting 7635 * a prototype object, it now accepts a configuration object. 7636 * To specify your own prototype, set %Core.LexerImpl to it. 7637 * This change in behavior de-singletonizes the lexer object. 
7638 * 7639 * @param HTMLPurifier_Config $config 7640 * @return HTMLPurifier_Lexer 7641 * @throws HTMLPurifier_Exception 7642 */ 7643 public static function create($config) 7644 { 7645 if (!($config instanceof HTMLPurifier_Config)) { 7646 $lexer = $config; 7647 trigger_error( 7648 "Passing a prototype to 7649 HTMLPurifier_Lexer::create() is deprecated, please instead 7650 use %Core.LexerImpl", 7651 E_USER_WARNING 7652 ); 7653 } else { 7654 $lexer = $config->get('Core.LexerImpl'); 7655 } 7656 7657 $needs_tracking = 7658 $config->get('Core.MaintainLineNumbers') || 7659 $config->get('Core.CollectErrors'); 7660 7661 $inst = null; 7662 if (is_object($lexer)) { 7663 $inst = $lexer; 7664 } else { 7665 if (is_null($lexer)) { 7666 do { 7667 // auto-detection algorithm 7668 if ($needs_tracking) { 7669 $lexer = 'DirectLex'; 7670 break; 7671 } 7672 7673 if (class_exists('DOMDocument', false) && 7674 method_exists('DOMDocument', 'loadHTML') && 7675 !extension_loaded('domxml') 7676 ) { 7677 // check for DOM support, because while it's part of the 7678 // core, it can be disabled compile time. Also, the PECL 7679 // domxml extension overrides the default DOM, and is evil 7680 // and nasty and we shan't bother to support it 7681 $lexer = 'DOMLex'; 7682 } else { 7683 $lexer = 'DirectLex'; 7684 } 7685 } while (0); 7686 } // do..while so we can break 7687 7688 // instantiate recognized string names 7689 switch ($lexer) { 7690 case 'DOMLex': 7691 $inst = new HTMLPurifier_Lexer_DOMLex(); 7692 break; 7693 case 'DirectLex': 7694 $inst = new HTMLPurifier_Lexer_DirectLex(); 7695 break; 7696 case 'PH5P': 7697 $inst = new HTMLPurifier_Lexer_PH5P(); 7698 break; 7699 default: 7700 throw new HTMLPurifier_Exception( 7701 "Cannot instantiate unrecognized Lexer type " . 
7702 htmlspecialchars($lexer) 7703 ); 7704 } 7705 } 7706 7707 if (!$inst) { 7708 throw new HTMLPurifier_Exception('No lexer was instantiated'); 7709 } 7710 7711 // once PHP DOM implements native line numbers, or we 7712 // hack out something using XSLT, remove this stipulation 7713 if ($needs_tracking && !$inst->tracksLineNumbers) { 7714 throw new HTMLPurifier_Exception( 7715 'Cannot use lexer that does not support line numbers with ' . 7716 'Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)' 7717 ); 7718 } 7719 7720 return $inst; 7721 7722 } 7723 7724 // -- CONVENIENCE MEMBERS --------------------------------------------- 7725 7726 public function __construct() 7727 { 7728 $this->_entity_parser = new HTMLPurifier_EntityParser(); 7729 } 7730 7731 /** 7732 * Most common entity to raw value conversion table for special entities. 7733 * @type array 7734 */ 7735 protected $_special_entity2str = 7736 array( 7737 '"' => '"', 7738 '&' => '&', 7739 '<' => '<', 7740 '>' => '>', 7741 ''' => "'", 7742 ''' => "'", 7743 ''' => "'" 7744 ); 7745 7746 public function parseText($string, $config) { 7747 return $this->parseData($string, false, $config); 7748 } 7749 7750 public function parseAttr($string, $config) { 7751 return $this->parseData($string, true, $config); 7752 } 7753 7754 /** 7755 * Parses special entities into the proper characters. 7756 * 7757 * This string will translate escaped versions of the special characters 7758 * into the correct ones. 7759 * 7760 * @param string $string String character data to be parsed. 7761 * @return string Parsed character data. 7762 */ 7763 public function parseData($string, $is_attr, $config) 7764 { 7765 // following functions require at least one character 7766 if ($string === '') { 7767 return ''; 7768 } 7769 7770 // subtracts amps that cannot possibly be escaped 7771 $num_amp = substr_count($string, '&') - substr_count($string, '& ') - 7772 ($string[strlen($string) - 1] === '&' ? 
1 : 0); 7773 7774 if (!$num_amp) { 7775 return $string; 7776 } // abort if no entities 7777 $num_esc_amp = substr_count($string, '&'); 7778 $string = strtr($string, $this->_special_entity2str); 7779 7780 // code duplication for sake of optimization, see above 7781 $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') - 7782 ($string[strlen($string) - 1] === '&' ? 1 : 0); 7783 7784 if ($num_amp_2 <= $num_esc_amp) { 7785 return $string; 7786 } 7787 7788 // hmm... now we have some uncommon entities. Use the callback. 7789 if ($config->get('Core.LegacyEntityDecoder')) { 7790 $string = $this->_entity_parser->substituteSpecialEntities($string); 7791 } else { 7792 if ($is_attr) { 7793 $string = $this->_entity_parser->substituteAttrEntities($string); 7794 } else { 7795 $string = $this->_entity_parser->substituteTextEntities($string); 7796 } 7797 } 7798 return $string; 7799 } 7800 7801 /** 7802 * Lexes an HTML string into tokens. 7803 * @param $string String HTML. 7804 * @param HTMLPurifier_Config $config 7805 * @param HTMLPurifier_Context $context 7806 * @return HTMLPurifier_Token[] array representation of HTML. 7807 */ 7808 public function tokenizeHTML($string, $config, $context) 7809 { 7810 trigger_error('Call to abstract class', E_USER_ERROR); 7811 } 7812 7813 /** 7814 * Translates CDATA sections into regular sections (through escaping). 7815 * @param string $string HTML string to process. 7816 * @return string HTML with CDATA sections escaped. 7817 */ 7818 protected static function escapeCDATA($string) 7819 { 7820 return preg_replace_callback( 7821 '/<!\[CDATA\[(.+?)\]\]>/s', 7822 array('HTMLPurifier_Lexer', 'CDATACallback'), 7823 $string 7824 ); 7825 } 7826 7827 /** 7828 * Special CDATA case that is especially convoluted for <script> 7829 * @param string $string HTML string to process. 7830 * @return string HTML with CDATA sections escaped. 
7831 */ 7832 protected static function escapeCommentedCDATA($string) 7833 { 7834 return preg_replace_callback( 7835 '#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s', 7836 array('HTMLPurifier_Lexer', 'CDATACallback'), 7837 $string 7838 ); 7839 } 7840 7841 /** 7842 * Special Internet Explorer conditional comments should be removed. 7843 * @param string $string HTML string to process. 7844 * @return string HTML with conditional comments removed. 7845 */ 7846 protected static function removeIEConditional($string) 7847 { 7848 return preg_replace( 7849 '#<!--\[if [^>]+\]>.*?<!\[endif\]-->#si', // probably should generalize for all strings 7850 '', 7851 $string 7852 ); 7853 } 7854 7855 /** 7856 * Callback function for escapeCDATA() that does the work. 7857 * 7858 * @warning Though this is public in order to let the callback happen, 7859 * calling it directly is not recommended. 7860 * @param array $matches PCRE matches array, with index 0 the entire match 7861 * and 1 the inside of the CDATA section. 7862 * @return string Escaped internals of the CDATA section. 7863 */ 7864 protected static function CDATACallback($matches) 7865 { 7866 // not exactly sure why the character set is needed, but whatever 7867 return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8'); 7868 } 7869 7870 /** 7871 * Takes a piece of HTML and normalizes it by converting entities, fixing 7872 * encoding, extracting bits, and other good stuff. 7873 * @param string $html HTML. 
7874 * @param HTMLPurifier_Config $config 7875 * @param HTMLPurifier_Context $context 7876 * @return string 7877 * @todo Consider making protected 7878 */ 7879 public function normalize($html, $config, $context) 7880 { 7881 // normalize newlines to \n 7882 if ($config->get('Core.NormalizeNewlines')) { 7883 $html = str_replace("\r\n", "\n", $html); 7884 $html = str_replace("\r", "\n", $html); 7885 } 7886 7887 if ($config->get('HTML.Trusted')) { 7888 // escape convoluted CDATA 7889 $html = $this->escapeCommentedCDATA($html); 7890 } 7891 7892 // escape CDATA 7893 $html = $this->escapeCDATA($html); 7894 7895 $html = $this->removeIEConditional($html); 7896 7897 // extract body from document if applicable 7898 if ($config->get('Core.ConvertDocumentToFragment')) { 7899 $e = false; 7900 if ($config->get('Core.CollectErrors')) { 7901 $e =& $context->get('ErrorCollector'); 7902 } 7903 $new_html = $this->extractBody($html); 7904 if ($e && $new_html != $html) { 7905 $e->send(E_WARNING, 'Lexer: Extracted body'); 7906 } 7907 $html = $new_html; 7908 } 7909 7910 // expand entities that aren't the big five 7911 if ($config->get('Core.LegacyEntityDecoder')) { 7912 $html = $this->_entity_parser->substituteNonSpecialEntities($html); 7913 } 7914 7915 // clean into wellformed UTF-8 string for an SGML context: this has 7916 // to be done after entity expansion because the entities sometimes 7917 // represent non-SGML characters (horror, horror!) 
7918 $html = HTMLPurifier_Encoder::cleanUTF8($html); 7919 7920 // if processing instructions are to removed, remove them now 7921 if ($config->get('Core.RemoveProcessingInstructions')) { 7922 $html = preg_replace('#<\?.+?\?>#s', '', $html); 7923 } 7924 7925 $hidden_elements = $config->get('Core.HiddenElements'); 7926 if ($config->get('Core.AggressivelyRemoveScript') && 7927 !($config->get('HTML.Trusted') || !$config->get('Core.RemoveScriptContents') 7928 || empty($hidden_elements["script"]))) { 7929 $html = preg_replace('#<script[^>]*>.*?</script>#i', '', $html); 7930 } 7931 7932 return $html; 7933 } 7934 7935 /** 7936 * Takes a string of HTML (fragment or document) and returns the content 7937 * @todo Consider making protected 7938 */ 7939 public function extractBody($html) 7940 { 7941 $matches = array(); 7942 $result = preg_match('|(.*?)<body[^>]*>(.*)</body>|is', $html, $matches); 7943 if ($result) { 7944 // Make sure it's not in a comment 7945 $comment_start = strrpos($matches[1], '<!--'); 7946 $comment_end = strrpos($matches[1], '-->'); 7947 if ($comment_start === false || 7948 ($comment_end !== false && $comment_end > $comment_start)) { 7949 return $matches[2]; 7950 } 7951 } 7952 return $html; 7953 } 7954} 7955 7956 7957 7958 7959 7960/** 7961 * Abstract base node class that all others inherit from. 7962 * 7963 * Why do we not use the DOM extension? (1) It is not always available, 7964 * (2) it has funny constraints on the data it can represent, 7965 * whereas we want a maximally flexible representation, and (3) its 7966 * interface is a bit cumbersome. 7967 */ 7968abstract class HTMLPurifier_Node 7969{ 7970 /** 7971 * Line number of the start token in the source document 7972 * @type int 7973 */ 7974 public $line; 7975 7976 /** 7977 * Column number of the start token in the source document. Null if unknown. 7978 * @type int 7979 */ 7980 public $col; 7981 7982 /** 7983 * Lookup array of processing that this token is exempt from. 
7984 * Currently, valid values are "ValidateAttributes". 7985 * @type array 7986 */ 7987 public $armor = array(); 7988 7989 /** 7990 * When true, this node should be ignored as non-existent. 7991 * 7992 * Who is responsible for ignoring dead nodes? FixNesting is 7993 * responsible for removing them before passing on to child 7994 * validators. 7995 */ 7996 public $dead = false; 7997 7998 /** 7999 * Returns a pair of start and end tokens, where the end token 8000 * is null if it is not necessary. Does not include children. 8001 * @type array 8002 */ 8003 abstract public function toTokenPair(); 8004} 8005 8006 8007 8008 8009 8010/** 8011 * Class that handles operations involving percent-encoding in URIs. 8012 * 8013 * @warning 8014 * Be careful when reusing instances of PercentEncoder. The object 8015 * you use for normalize() SHOULD NOT be used for encode(), or 8016 * vice-versa. 8017 */ 8018class HTMLPurifier_PercentEncoder 8019{ 8020 8021 /** 8022 * Reserved characters to preserve when using encode(). 8023 * @type array 8024 */ 8025 protected $preserve = array(); 8026 8027 /** 8028 * String of characters that should be preserved while using encode(). 8029 * @param bool $preserve 8030 */ 8031 public function __construct($preserve = false) 8032 { 8033 // unreserved letters, ought to const-ify 8034 for ($i = 48; $i <= 57; $i++) { // digits 8035 $this->preserve[$i] = true; 8036 } 8037 for ($i = 65; $i <= 90; $i++) { // upper-case 8038 $this->preserve[$i] = true; 8039 } 8040 for ($i = 97; $i <= 122; $i++) { // lower-case 8041 $this->preserve[$i] = true; 8042 } 8043 $this->preserve[45] = true; // Dash - 8044 $this->preserve[46] = true; // Period . 
8045 $this->preserve[95] = true; // Underscore _ 8046 $this->preserve[126]= true; // Tilde ~ 8047 8048 // extra letters not to escape 8049 if ($preserve !== false) { 8050 for ($i = 0, $c = strlen($preserve); $i < $c; $i++) { 8051 $this->preserve[ord($preserve[$i])] = true; 8052 } 8053 } 8054 } 8055 8056 /** 8057 * Our replacement for urlencode, it encodes all non-reserved characters, 8058 * as well as any extra characters that were instructed to be preserved. 8059 * @note 8060 * Assumes that the string has already been normalized, making any 8061 * and all percent escape sequences valid. Percents will not be 8062 * re-escaped, regardless of their status in $preserve 8063 * @param string $string String to be encoded 8064 * @return string Encoded string. 8065 */ 8066 public function encode($string) 8067 { 8068 $ret = ''; 8069 for ($i = 0, $c = strlen($string); $i < $c; $i++) { 8070 if ($string[$i] !== '%' && !isset($this->preserve[$int = ord($string[$i])])) { 8071 $ret .= '%' . sprintf('%02X', $int); 8072 } else { 8073 $ret .= $string[$i]; 8074 } 8075 } 8076 return $ret; 8077 } 8078 8079 /** 8080 * Fix up percent-encoding by decoding unreserved characters and normalizing. 8081 * @warning This function is affected by $preserve, even though the 8082 * usual desired behavior is for this not to preserve those 8083 * characters. Be careful when reusing instances of PercentEncoder! 8084 * @param string $string String to normalize 8085 * @return string 8086 */ 8087 public function normalize($string) 8088 { 8089 if ($string == '') { 8090 return ''; 8091 } 8092 $parts = explode('%', $string); 8093 $ret = array_shift($parts); 8094 foreach ($parts as $part) { 8095 $length = strlen($part); 8096 if ($length < 2) { 8097 $ret .= '%25' . $part; 8098 continue; 8099 } 8100 $encoding = substr($part, 0, 2); 8101 $text = substr($part, 2); 8102 if (!ctype_xdigit($encoding)) { 8103 $ret .= '%25' . 
$part; 8104 continue; 8105 } 8106 $int = hexdec($encoding); 8107 if (isset($this->preserve[$int])) { 8108 $ret .= chr($int) . $text; 8109 continue; 8110 } 8111 $encoding = strtoupper($encoding); 8112 $ret .= '%' . $encoding . $text; 8113 } 8114 return $ret; 8115 } 8116} 8117 8118 8119 8120 8121 8122/** 8123 * Generic property list implementation 8124 */ 8125class HTMLPurifier_PropertyList 8126{ 8127 /** 8128 * Internal data-structure for properties. 8129 * @type array 8130 */ 8131 protected $data = array(); 8132 8133 /** 8134 * Parent plist. 8135 * @type HTMLPurifier_PropertyList 8136 */ 8137 protected $parent; 8138 8139 /** 8140 * Cache. 8141 * @type array 8142 */ 8143 protected $cache; 8144 8145 /** 8146 * @param HTMLPurifier_PropertyList $parent Parent plist 8147 */ 8148 public function __construct($parent = null) 8149 { 8150 $this->parent = $parent; 8151 } 8152 8153 /** 8154 * Recursively retrieves the value for a key 8155 * @param string $name 8156 * @throws HTMLPurifier_Exception 8157 */ 8158 public function get($name) 8159 { 8160 if ($this->has($name)) { 8161 return $this->data[$name]; 8162 } 8163 // possible performance bottleneck, convert to iterative if necessary 8164 if ($this->parent) { 8165 return $this->parent->get($name); 8166 } 8167 throw new HTMLPurifier_Exception("Key '$name' not found"); 8168 } 8169 8170 /** 8171 * Sets the value of a key, for this plist 8172 * @param string $name 8173 * @param mixed $value 8174 */ 8175 public function set($name, $value) 8176 { 8177 $this->data[$name] = $value; 8178 } 8179 8180 /** 8181 * Returns true if a given key exists 8182 * @param string $name 8183 * @return bool 8184 */ 8185 public function has($name) 8186 { 8187 return array_key_exists($name, $this->data); 8188 } 8189 8190 /** 8191 * Resets a value to the value of it's parent, usually the default. If 8192 * no value is specified, the entire plist is reset. 
8193 * @param string $name 8194 */ 8195 public function reset($name = null) 8196 { 8197 if ($name == null) { 8198 $this->data = array(); 8199 } else { 8200 unset($this->data[$name]); 8201 } 8202 } 8203 8204 /** 8205 * Squashes this property list and all of its property lists into a single 8206 * array, and returns the array. This value is cached by default. 8207 * @param bool $force If true, ignores the cache and regenerates the array. 8208 * @return array 8209 */ 8210 public function squash($force = false) 8211 { 8212 if ($this->cache !== null && !$force) { 8213 return $this->cache; 8214 } 8215 if ($this->parent) { 8216 return $this->cache = array_merge($this->parent->squash($force), $this->data); 8217 } else { 8218 return $this->cache = $this->data; 8219 } 8220 } 8221 8222 /** 8223 * Returns the parent plist. 8224 * @return HTMLPurifier_PropertyList 8225 */ 8226 public function getParent() 8227 { 8228 return $this->parent; 8229 } 8230 8231 /** 8232 * Sets the parent plist. 8233 * @param HTMLPurifier_PropertyList $plist Parent plist 8234 */ 8235 public function setParent($plist) 8236 { 8237 $this->parent = $plist; 8238 } 8239} 8240 8241 8242 8243 8244 8245/** 8246 * Property list iterator. Do not instantiate this class directly. 
8247 */ 8248class HTMLPurifier_PropertyListIterator extends FilterIterator 8249{ 8250 8251 /** 8252 * @type int 8253 */ 8254 protected $l; 8255 /** 8256 * @type string 8257 */ 8258 protected $filter; 8259 8260 /** 8261 * @param Iterator $iterator Array of data to iterate over 8262 * @param string $filter Optional prefix to only allow values of 8263 */ 8264 public function __construct(Iterator $iterator, $filter = null) 8265 { 8266 parent::__construct($iterator); 8267 $this->l = strlen($filter); 8268 $this->filter = $filter; 8269 } 8270 8271 /** 8272 * @return bool 8273 */ 8274 public function accept() 8275 { 8276 $key = $this->getInnerIterator()->key(); 8277 if (strncmp($key, $this->filter, $this->l) !== 0) { 8278 return false; 8279 } 8280 return true; 8281 } 8282} 8283 8284 8285 8286 8287 8288/** 8289 * A simple array-backed queue, based off of the classic Okasaki 8290 * persistent amortized queue. The basic idea is to maintain two 8291 * stacks: an input stack and an output stack. When the output 8292 * stack runs out, reverse the input stack and use it as the output 8293 * stack. 8294 * 8295 * We don't use the SPL implementation because it's only supported 8296 * on PHP 5.3 and later. 8297 * 8298 * Exercise: Prove that push/pop on this queue take amortized O(1) time. 8299 * 8300 * Exercise: Extend this queue to be a deque, while preserving amortized 8301 * O(1) time. Some care must be taken on rebalancing to avoid quadratic 8302 * behaviour caused by repeatedly shuffling data from the input stack 8303 * to the output stack and back. 8304 */ 8305class HTMLPurifier_Queue { 8306 private $input; 8307 private $output; 8308 8309 public function __construct($input = array()) { 8310 $this->input = $input; 8311 $this->output = array(); 8312 } 8313 8314 /** 8315 * Shifts an element off the front of the queue. 
8316 */ 8317 public function shift() { 8318 if (empty($this->output)) { 8319 $this->output = array_reverse($this->input); 8320 $this->input = array(); 8321 } 8322 if (empty($this->output)) { 8323 return NULL; 8324 } 8325 return array_pop($this->output); 8326 } 8327 8328 /** 8329 * Pushes an element onto the front of the queue. 8330 */ 8331 public function push($x) { 8332 array_push($this->input, $x); 8333 } 8334 8335 /** 8336 * Checks if it's empty. 8337 */ 8338 public function isEmpty() { 8339 return empty($this->input) && empty($this->output); 8340 } 8341} 8342 8343 8344 8345/** 8346 * Supertype for classes that define a strategy for modifying/purifying tokens. 8347 * 8348 * While HTMLPurifier's core purpose is fixing HTML into something proper, 8349 * strategies provide plug points for extra configuration or even extra 8350 * features, such as custom tags, custom parsing of text, etc. 8351 */ 8352 8353 8354abstract class HTMLPurifier_Strategy 8355{ 8356 8357 /** 8358 * Executes the strategy on the tokens. 8359 * 8360 * @param HTMLPurifier_Token[] $tokens Array of HTMLPurifier_Token objects to be operated on. 8361 * @param HTMLPurifier_Config $config 8362 * @param HTMLPurifier_Context $context 8363 * @return HTMLPurifier_Token[] Processed array of token objects. 8364 */ 8365 abstract public function execute($tokens, $config, $context); 8366} 8367 8368 8369 8370 8371 8372/** 8373 * This is in almost every respect equivalent to an array except 8374 * that it keeps track of which keys were accessed. 8375 * 8376 * @warning For the sake of backwards compatibility with early versions 8377 * of PHP 5, you must not use the $hash[$key] syntax; if you do 8378 * our version of offsetGet is never called. 8379 */ 8380class HTMLPurifier_StringHash extends ArrayObject 8381{ 8382 /** 8383 * @type array 8384 */ 8385 protected $accessed = array(); 8386 8387 /** 8388 * Retrieves a value, and logs the access. 
8389 * @param mixed $index 8390 * @return mixed 8391 */ 8392 public function offsetGet($index) 8393 { 8394 $this->accessed[$index] = true; 8395 return parent::offsetGet($index); 8396 } 8397 8398 /** 8399 * Returns a lookup array of all array indexes that have been accessed. 8400 * @return array in form array($index => true). 8401 */ 8402 public function getAccessed() 8403 { 8404 return $this->accessed; 8405 } 8406 8407 /** 8408 * Resets the access array. 8409 */ 8410 public function resetAccessed() 8411 { 8412 $this->accessed = array(); 8413 } 8414} 8415 8416 8417 8418 8419 8420/** 8421 * Parses string hash files. File format is as such: 8422 * 8423 * DefaultKeyValue 8424 * KEY: Value 8425 * KEY2: Value2 8426 * --MULTILINE-KEY-- 8427 * Multiline 8428 * value. 8429 * 8430 * Which would output something similar to: 8431 * 8432 * array( 8433 * 'ID' => 'DefaultKeyValue', 8434 * 'KEY' => 'Value', 8435 * 'KEY2' => 'Value2', 8436 * 'MULTILINE-KEY' => "Multiline\nvalue.\n", 8437 * ) 8438 * 8439 * We use this as an easy to use file-format for configuration schema 8440 * files, but the class itself is usage agnostic. 8441 * 8442 * You can use ---- to forcibly terminate parsing of a single string-hash; 8443 * this marker is used in multi string-hashes to delimit boundaries. 8444 */ 8445class HTMLPurifier_StringHashParser 8446{ 8447 8448 /** 8449 * @type string 8450 */ 8451 public $default = 'ID'; 8452 8453 /** 8454 * Parses a file that contains a single string-hash. 
8455 * @param string $file 8456 * @return array 8457 */ 8458 public function parseFile($file) 8459 { 8460 if (!file_exists($file)) { 8461 return false; 8462 } 8463 $fh = fopen($file, 'r'); 8464 if (!$fh) { 8465 return false; 8466 } 8467 $ret = $this->parseHandle($fh); 8468 fclose($fh); 8469 return $ret; 8470 } 8471 8472 /** 8473 * Parses a file that contains multiple string-hashes delimited by '----' 8474 * @param string $file 8475 * @return array 8476 */ 8477 public function parseMultiFile($file) 8478 { 8479 if (!file_exists($file)) { 8480 return false; 8481 } 8482 $ret = array(); 8483 $fh = fopen($file, 'r'); 8484 if (!$fh) { 8485 return false; 8486 } 8487 while (!feof($fh)) { 8488 $ret[] = $this->parseHandle($fh); 8489 } 8490 fclose($fh); 8491 return $ret; 8492 } 8493 8494 /** 8495 * Internal parser that acepts a file handle. 8496 * @note While it's possible to simulate in-memory parsing by using 8497 * custom stream wrappers, if such a use-case arises we should 8498 * factor out the file handle into its own class. 8499 * @param resource $fh File handle with pointer at start of valid string-hash 8500 * block. 
8501 * @return array 8502 */ 8503 protected function parseHandle($fh) 8504 { 8505 $state = false; 8506 $single = false; 8507 $ret = array(); 8508 do { 8509 $line = fgets($fh); 8510 if ($line === false) { 8511 break; 8512 } 8513 $line = rtrim($line, "\n\r"); 8514 if (!$state && $line === '') { 8515 continue; 8516 } 8517 if ($line === '----') { 8518 break; 8519 } 8520 if (strncmp('--#', $line, 3) === 0) { 8521 // Comment 8522 continue; 8523 } elseif (strncmp('--', $line, 2) === 0) { 8524 // Multiline declaration 8525 $state = trim($line, '- '); 8526 if (!isset($ret[$state])) { 8527 $ret[$state] = ''; 8528 } 8529 continue; 8530 } elseif (!$state) { 8531 $single = true; 8532 if (strpos($line, ':') !== false) { 8533 // Single-line declaration 8534 list($state, $line) = explode(':', $line, 2); 8535 $line = trim($line); 8536 } else { 8537 // Use default declaration 8538 $state = $this->default; 8539 } 8540 } 8541 if ($single) { 8542 $ret[$state] = $line; 8543 $single = false; 8544 $state = false; 8545 } else { 8546 $ret[$state] .= "$line\n"; 8547 } 8548 } while (!feof($fh)); 8549 return $ret; 8550 } 8551} 8552 8553 8554 8555 8556 8557/** 8558 * Defines a mutation of an obsolete tag into a valid tag. 8559 */ 8560abstract class HTMLPurifier_TagTransform 8561{ 8562 8563 /** 8564 * Tag name to transform the tag to. 8565 * @type string 8566 */ 8567 public $transform_to; 8568 8569 /** 8570 * Transforms the obsolete tag into the valid tag. 8571 * @param HTMLPurifier_Token_Tag $tag Tag to be transformed. 8572 * @param HTMLPurifier_Config $config Mandatory HTMLPurifier_Config object 8573 * @param HTMLPurifier_Context $context Mandatory HTMLPurifier_Context object 8574 */ 8575 abstract public function transform($tag, $config, $context); 8576 8577 /** 8578 * Prepends CSS properties to the style attribute, creating the 8579 * attribute if it doesn't exist. 
8580 * @warning Copied over from AttrTransform, be sure to keep in sync 8581 * @param array $attr Attribute array to process (passed by reference) 8582 * @param string $css CSS to prepend 8583 */ 8584 protected function prependCSS(&$attr, $css) 8585 { 8586 $attr['style'] = isset($attr['style']) ? $attr['style'] : ''; 8587 $attr['style'] = $css . $attr['style']; 8588 } 8589} 8590 8591 8592 8593 8594 8595/** 8596 * Abstract base token class that all others inherit from. 8597 */ 8598abstract class HTMLPurifier_Token 8599{ 8600 /** 8601 * Line number node was on in source document. Null if unknown. 8602 * @type int 8603 */ 8604 public $line; 8605 8606 /** 8607 * Column of line node was on in source document. Null if unknown. 8608 * @type int 8609 */ 8610 public $col; 8611 8612 /** 8613 * Lookup array of processing that this token is exempt from. 8614 * Currently, valid values are "ValidateAttributes" and 8615 * "MakeWellFormed_TagClosedError" 8616 * @type array 8617 */ 8618 public $armor = array(); 8619 8620 /** 8621 * Used during MakeWellFormed. See Note [Injector skips] 8622 * @type 8623 */ 8624 public $skip; 8625 8626 /** 8627 * @type 8628 */ 8629 public $rewind; 8630 8631 /** 8632 * @type 8633 */ 8634 public $carryover; 8635 8636 /** 8637 * @param string $n 8638 * @return null|string 8639 */ 8640 public function __get($n) 8641 { 8642 if ($n === 'type') { 8643 trigger_error('Deprecated type property called; use instanceof', E_USER_NOTICE); 8644 switch (get_class($this)) { 8645 case 'HTMLPurifier_Token_Start': 8646 return 'start'; 8647 case 'HTMLPurifier_Token_Empty': 8648 return 'empty'; 8649 case 'HTMLPurifier_Token_End': 8650 return 'end'; 8651 case 'HTMLPurifier_Token_Text': 8652 return 'text'; 8653 case 'HTMLPurifier_Token_Comment': 8654 return 'comment'; 8655 default: 8656 return null; 8657 } 8658 } 8659 } 8660 8661 /** 8662 * Sets the position of the token in the source document. 
8663 * @param int $l 8664 * @param int $c 8665 */ 8666 public function position($l = null, $c = null) 8667 { 8668 $this->line = $l; 8669 $this->col = $c; 8670 } 8671 8672 /** 8673 * Convenience function for DirectLex settings line/col position. 8674 * @param int $l 8675 * @param int $c 8676 */ 8677 public function rawPosition($l, $c) 8678 { 8679 if ($c === -1) { 8680 $l++; 8681 } 8682 $this->line = $l; 8683 $this->col = $c; 8684 } 8685 8686 /** 8687 * Converts a token into its corresponding node. 8688 */ 8689 abstract public function toNode(); 8690} 8691 8692 8693 8694 8695 8696/** 8697 * Factory for token generation. 8698 * 8699 * @note Doing some benchmarking indicates that the new operator is much 8700 * slower than the clone operator (even discounting the cost of the 8701 * constructor). This class is for that optimization. 8702 * Other then that, there's not much point as we don't 8703 * maintain parallel HTMLPurifier_Token hierarchies (the main reason why 8704 * you'd want to use an abstract factory). 8705 * @todo Port DirectLex to use this 8706 */ 8707class HTMLPurifier_TokenFactory 8708{ 8709 // p stands for prototype 8710 8711 /** 8712 * @type HTMLPurifier_Token_Start 8713 */ 8714 private $p_start; 8715 8716 /** 8717 * @type HTMLPurifier_Token_End 8718 */ 8719 private $p_end; 8720 8721 /** 8722 * @type HTMLPurifier_Token_Empty 8723 */ 8724 private $p_empty; 8725 8726 /** 8727 * @type HTMLPurifier_Token_Text 8728 */ 8729 private $p_text; 8730 8731 /** 8732 * @type HTMLPurifier_Token_Comment 8733 */ 8734 private $p_comment; 8735 8736 /** 8737 * Generates blank prototypes for cloning. 
8738 */ 8739 public function __construct() 8740 { 8741 $this->p_start = new HTMLPurifier_Token_Start('', array()); 8742 $this->p_end = new HTMLPurifier_Token_End(''); 8743 $this->p_empty = new HTMLPurifier_Token_Empty('', array()); 8744 $this->p_text = new HTMLPurifier_Token_Text(''); 8745 $this->p_comment = new HTMLPurifier_Token_Comment(''); 8746 } 8747 8748 /** 8749 * Creates a HTMLPurifier_Token_Start. 8750 * @param string $name Tag name 8751 * @param array $attr Associative array of attributes 8752 * @return HTMLPurifier_Token_Start Generated HTMLPurifier_Token_Start 8753 */ 8754 public function createStart($name, $attr = array()) 8755 { 8756 $p = clone $this->p_start; 8757 $p->__construct($name, $attr); 8758 return $p; 8759 } 8760 8761 /** 8762 * Creates a HTMLPurifier_Token_End. 8763 * @param string $name Tag name 8764 * @return HTMLPurifier_Token_End Generated HTMLPurifier_Token_End 8765 */ 8766 public function createEnd($name) 8767 { 8768 $p = clone $this->p_end; 8769 $p->__construct($name); 8770 return $p; 8771 } 8772 8773 /** 8774 * Creates a HTMLPurifier_Token_Empty. 8775 * @param string $name Tag name 8776 * @param array $attr Associative array of attributes 8777 * @return HTMLPurifier_Token_Empty Generated HTMLPurifier_Token_Empty 8778 */ 8779 public function createEmpty($name, $attr = array()) 8780 { 8781 $p = clone $this->p_empty; 8782 $p->__construct($name, $attr); 8783 return $p; 8784 } 8785 8786 /** 8787 * Creates a HTMLPurifier_Token_Text. 8788 * @param string $data Data of text token 8789 * @return HTMLPurifier_Token_Text Generated HTMLPurifier_Token_Text 8790 */ 8791 public function createText($data) 8792 { 8793 $p = clone $this->p_text; 8794 $p->__construct($data); 8795 return $p; 8796 } 8797 8798 /** 8799 * Creates a HTMLPurifier_Token_Comment. 
/**
 * HTML Purifier's internal representation of a URI.
 * @note
 *      Internal data-structures are completely escaped. If the data needs
 *      to be used in a non-URI context (which is very unlikely), be sure
 *      to decode it first. The URI may not necessarily be well-formed until
 *      validate() is called.
 */
class HTMLPurifier_URI
{
    /**
     * Scheme name, lower-cased by the constructor; null when absent.
     * @type string
     */
    public $scheme;

    /**
     * @type string
     */
    public $userinfo;

    /**
     * Host; null means "no authority", '' means "empty authority".
     * @type string
     */
    public $host;

    /**
     * Port, cast to int by the constructor; null when absent.
     * @type int
     */
    public $port;

    /**
     * @type string
     */
    public $path;

    /**
     * @type string
     */
    public $query;

    /**
     * @type string
     */
    public $fragment;

    /**
     * @param string $scheme
     * @param string $userinfo
     * @param string $host
     * @param int $port
     * @param string $path
     * @param string $query
     * @param string $fragment
     * @note Automatically normalizes scheme and port
     */
    public function __construct($scheme, $userinfo, $host, $port, $path, $query, $fragment)
    {
        // Only call strtolower() when the scheme is not already lower-case.
        $this->scheme = is_null($scheme) || ctype_lower($scheme) ? $scheme : strtolower($scheme);
        $this->userinfo = $userinfo;
        $this->host = $host;
        $this->port = is_null($port) ? $port : (int)$port;
        $this->path = $path;
        $this->query = $query;
        $this->fragment = $fragment;
    }

    /**
     * Retrieves a scheme object corresponding to the URI's scheme/default
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return HTMLPurifier_URIScheme Scheme object appropriate for validating this URI,
     *         or false when no usable scheme object could be obtained
     */
    public function getSchemeObj($config, $context)
    {
        $registry = HTMLPurifier_URISchemeRegistry::instance();
        if ($this->scheme !== null) {
            $scheme_obj = $registry->getScheme($this->scheme, $config, $context);
            if (!$scheme_obj) {
                return false;
            } // invalid scheme, clean it out
        } else {
            // no scheme: retrieve the default one
            $def = $config->getDefinition('URI');
            $scheme_obj = $def->getDefaultScheme($config, $context);
            if (!$scheme_obj) {
                if ($def->defaultScheme !== null) {
                    // something funky happened to the default scheme object
                    trigger_error(
                        'Default scheme object "' . $def->defaultScheme . '" was not readable',
                        E_USER_WARNING
                    );
                } // suppress error if it's null
                return false;
            }
        }
        return $scheme_obj;
    }

    /**
     * Generic validation method applicable for all schemes. May modify
     * this URI in order to get it into a compliant form.
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool True if validation/filtering succeeds, false if failure
     */
    public function validate($config, $context)
    {
        // ABNF definitions from RFC 3986
        $chars_sub_delims = '!$&\'()*+,;=';
        $chars_pchar = $chars_sub_delims . ':@';

        // validate host
        if (!is_null($this->host)) {
            $host_def = new HTMLPurifier_AttrDef_URI_Host();
            $this->host = $host_def->validate($this->host, $config, $context);
            if ($this->host === false) {
                $this->host = null;
            }
        }

        // validate scheme
        // NOTE: It's not appropriate to check whether or not this
        // scheme is in our registry, since a URIFilter may convert a
        // URI that we don't allow into one we do.  So instead, we just
        // check if the scheme can be dropped because there is no host
        // and it is our default scheme.
        //
        // The grouping is explicit: without the inner parentheses, && binds
        // tighter than ||, so a scheme-less URI with an empty host would
        // also enter this branch (where it could only set an already-null
        // scheme to null again).
        if (!is_null($this->scheme) && (is_null($this->host) || $this->host === '')) {
            // support for relative paths is pretty abysmal when the
            // scheme is present, so axe it when possible
            $def = $config->getDefinition('URI');
            if ($def->defaultScheme === $this->scheme) {
                $this->scheme = null;
            }
        }

        // validate username
        if (!is_null($this->userinfo)) {
            $encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . ':');
            $this->userinfo = $encoder->encode($this->userinfo);
        }

        // validate port
        if (!is_null($this->port)) {
            if ($this->port < 1 || $this->port > 65535) {
                $this->port = null;
            }
        }

        // validate path
        $segments_encoder = new HTMLPurifier_PercentEncoder($chars_pchar . '/');
        if (!is_null($this->host)) { // this catches $this->host === ''
            // path-abempty (hier and relative)
            // http://www.example.com/my/path
            // //www.example.com/my/path (looks odd, but works, and
            //                            recognized by most browsers)
            // (this set is valid or invalid on a scheme by scheme
            // basis, so we'll deal with it later)
            // file:///my/path
            // ///my/path
            $this->path = $segments_encoder->encode($this->path);
        } elseif ($this->path !== '') {
            if ($this->path[0] === '/') {
                // path-absolute (hier and relative)
                // http:/my/path
                // /my/path
                if (strlen($this->path) >= 2 && $this->path[1] === '/') {
                    // This could happen if both the host gets stripped
                    // out
                    // http://my/path
                    // //my/path
                    $this->path = '';
                } else {
                    $this->path = $segments_encoder->encode($this->path);
                }
            } elseif (!is_null($this->scheme)) {
                // path-rootless (hier)
                // http:my/path
                // Short circuit evaluation means we don't need to check nz
                $this->path = $segments_encoder->encode($this->path);
            } else {
                // path-noscheme (relative)
                // my/path
                // (once again, not checking nz)
                // The first segment uses an encoder whose preserved set
                // omits ':' (the segments encoder keeps it).
                $segment_nc_encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . '@');
                $c = strpos($this->path, '/');
                if ($c !== false) {
                    $this->path =
                        $segment_nc_encoder->encode(substr($this->path, 0, $c)) .
                        $segments_encoder->encode(substr($this->path, $c));
                } else {
                    $this->path = $segment_nc_encoder->encode($this->path);
                }
            }
        } else {
            // path-empty (hier and relative)
            $this->path = ''; // just to be safe
        }

        // qf = query and fragment
        $qf_encoder = new HTMLPurifier_PercentEncoder($chars_pchar . '/?');

        if (!is_null($this->query)) {
            $this->query = $qf_encoder->encode($this->query);
        }

        if (!is_null($this->fragment)) {
            $this->fragment = $qf_encoder->encode($this->fragment);
        }
        return true;
    }

    /**
     * Convert URI back to string
     * @return string URI appropriate for output
     */
    public function toString()
    {
        // reconstruct authority
        $authority = null;
        // there is a rendering difference between a null authority
        // (http:foo-bar) and an empty string authority
        // (http:///foo-bar).
        if (!is_null($this->host)) {
            $authority = '';
            if (!is_null($this->userinfo)) {
                $authority .= $this->userinfo . '@';
            }
            $authority .= $this->host;
            if (!is_null($this->port)) {
                $authority .= ':' . $this->port;
            }
        }

        // Reconstruct the result
        // One might wonder about parsing quirks from browsers after
        // this reconstruction.  Unfortunately, parsing behavior depends
        // on what *scheme* was employed (file:///foo is handled *very*
        // differently than http:///foo), so unfortunately we have to
        // defer to the schemes to do the right thing.
        $result = '';
        if (!is_null($this->scheme)) {
            $result .= $this->scheme . ':';
        }
        if (!is_null($authority)) {
            $result .= '//' . $authority;
        }
        $result .= $this->path;
        if (!is_null($this->query)) {
            $result .= '?' . $this->query;
        }
        if (!is_null($this->fragment)) {
            $result .= '#' . $this->fragment;
        }

        return $result;
    }

    /**
     * Returns true if this URL might be considered a 'local' URL given
     * the current context.  This is true when the host is null, or
     * when it matches the host supplied to the configuration.
     *
     * Note that this does not do any scheme checking, so it is mostly
     * only appropriate for metadata that doesn't care about protocol
     * security.  isBenign is probably what you actually want.
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool
     */
    public function isLocal($config, $context)
    {
        if ($this->host === null) {
            return true;
        }
        $uri_def = $config->getDefinition('URI');
        if ($uri_def->host === $this->host) {
            return true;
        }
        return false;
    }

    /**
     * Returns true if this URL should be considered a 'benign' URL,
     * that is:
     *
     *      - It is a local URL (isLocal), and
     *      - It has a equal or better level of security
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool
     */
    public function isBenign($config, $context)
    {
        if (!$this->isLocal($config, $context)) {
            return false;
        }

        $scheme_obj = $this->getSchemeObj($config, $context);
        if (!$scheme_obj) {
            return false;
        } // conservative approach

        // getDefaultScheme() can yield no object (for instance when no
        // default scheme is configured).  Check for that before reading
        // ->secure so we don't dereference null; the outcome for that case
        // is the same as before (no secure baseline to compare against).
        $current_scheme_obj = $config->getDefinition('URI')->getDefaultScheme($config, $context);
        if ($current_scheme_obj && $current_scheme_obj->secure) {
            if (!$scheme_obj->secure) {
                return false;
            }
        }
        return true;
    }
}
/**
 * Definition object for URI handling: holds the base URI, the "home" host,
 * the default scheme name, and the URI filters selected by configuration.
 */
class HTMLPurifier_URIDefinition extends HTMLPurifier_Definition
{

    public $type = 'URI';

    /**
     * Active filters run by filter(), keyed by filter name.
     */
    protected $filters = array();

    /**
     * Active filters whose $post flag is set, run by postFilter(), keyed
     * by filter name.
     */
    protected $postFilters = array();

    /**
     * Every known filter, keyed by name. Consumed (and unset) by
     * setupFilters().
     */
    protected $registeredFilters = array();

    /**
     * HTMLPurifier_URI object of the base specified at %URI.Base
     */
    public $base;

    /**
     * String host to consider "home" base, derived off of $base
     */
    public $host;

    /**
     * Name of default scheme based on %URI.DefaultScheme and %URI.Base
     */
    public $defaultScheme;

    /**
     * Registers the built-in URI filters, in a fixed order.
     */
    public function __construct()
    {
        $builtin = array(
            'HTMLPurifier_URIFilter_DisableExternal',
            'HTMLPurifier_URIFilter_DisableExternalResources',
            'HTMLPurifier_URIFilter_DisableResources',
            'HTMLPurifier_URIFilter_HostBlacklist',
            'HTMLPurifier_URIFilter_SafeIframe',
            'HTMLPurifier_URIFilter_MakeAbsolute',
            'HTMLPurifier_URIFilter_Munge',
        );
        foreach ($builtin as $class) {
            $this->registerFilter(new $class());
        }
    }

    /**
     * Makes a filter known to this definition under its $name. It only
     * becomes active if setupFilters() later selects it.
     * @param HTMLPurifier_URIFilter $filter
     */
    public function registerFilter($filter)
    {
        $this->registeredFilters[$filter->name] = $filter;
    }

    /**
     * Prepares a filter and, unless prepare() returned false, activates it
     * in the pre- or post-filter list according to its $post flag.
     * @param HTMLPurifier_URIFilter $filter
     * @param HTMLPurifier_Config $config
     */
    public function addFilter($filter, $config)
    {
        // Only an explicit false vetoes the filter;
        // null is ok, for backwards compat
        if ($filter->prepare($config) === false) {
            return;
        }
        if (!$filter->post) {
            $this->filters[$filter->name] = $filter;
            return;
        }
        $this->postFilters[$filter->name] = $filter;
    }

    /**
     * Populates member variables first, then the filter lists.
     * @param HTMLPurifier_Config $config
     */
    protected function doSetup($config)
    {
        $this->setupMemberVariables($config);
        $this->setupFilters($config);
    }

    /**
     * Activates each registered filter that is either marked always_load
     * or whose %URI.<name> directive is neither false nor null.
     * @param HTMLPurifier_Config $config
     */
    protected function setupFilters($config)
    {
        foreach ($this->registeredFilters as $name => $filter) {
            if (!$filter->always_load) {
                $setting = $config->get('URI.' . $name);
                if ($setting === false || $setting === null) {
                    continue;
                }
            }
            $this->addFilter($filter, $config);
        }
        unset($this->registeredFilters);
    }

    /**
     * Reads %URI.Host, %URI.Base and %URI.DefaultScheme. When a base is
     * configured, its scheme becomes the default scheme and its host is
     * used if %URI.Host was null.
     * @param HTMLPurifier_Config $config
     */
    protected function setupMemberVariables($config)
    {
        $this->host = $config->get('URI.Host');
        $base_uri = $config->get('URI.Base');
        if ($base_uri !== null) {
            $parser = new HTMLPurifier_URIParser();
            $this->base = $parser->parse($base_uri);
            $this->defaultScheme = $this->base->scheme;
            if ($this->host === null) {
                $this->host = $this->base->host;
            }
        }
        if ($this->defaultScheme === null) {
            $this->defaultScheme = $config->get('URI.DefaultScheme');
        }
    }

    /**
     * Looks up the scheme object for $defaultScheme in the scheme registry.
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return HTMLPurifier_URIScheme Whatever the registry returns for the default scheme
     */
    public function getDefaultScheme($config, $context)
    {
        $registry = HTMLPurifier_URISchemeRegistry::instance();
        return $registry->getScheme($this->defaultScheme, $config, $context);
    }

    /**
     * Runs the active pre-filters on a URI, stopping at the first one that
     * returns a falsy result.
     * @param HTMLPurifier_URI $uri Reference to URI object variable
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool False if any filter rejected the URI, true otherwise
     */
    public function filter(&$uri, $config, $context)
    {
        return $this->runFilterChain($this->filters, $uri, $config, $context);
    }

    /**
     * Same as filter(), but runs the post-filters.
     * @param HTMLPurifier_URI $uri Reference to URI object variable
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool False if any filter rejected the URI, true otherwise
     */
    public function postFilter(&$uri, $config, $context)
    {
        return $this->runFilterChain($this->postFilters, $uri, $config, $context);
    }

    /**
     * Applies each filter of $chain in order; shared by filter() and
     * postFilter().
     * @param HTMLPurifier_URIFilter[] $chain
     * @param HTMLPurifier_URI $uri Reference to URI object variable
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool
     */
    private function runFilterChain($chain, &$uri, $config, $context)
    {
        foreach ($chain as $step) {
            if (!$step->filter($uri, $config, $context)) {
                return false;
            }
        }
        return true;
    }

}
/**
 * Chainable filters for custom URI processing.
 *
 * These filters can perform custom actions on a URI filter object,
 * including transformation or blacklisting.  A filter named Foo
 * must have a corresponding configuration directive %URI.Foo,
 * unless always_load is specified to be true.
 *
 * The following contexts may be available while URIFilters are being
 * processed:
 *
 *      - EmbeddedURI: true if URI is an embedded resource that will
 *        be loaded automatically on page load
 *      - CurrentToken: a reference to the token that is currently
 *        being processed
 *      - CurrentAttr: the name of the attribute that is currently being
 *        processed
 *      - CurrentCSSProperty: the name of the CSS property that is
 *        currently being processed (if applicable)
 *
 * @warning This filter is called before scheme object validation occurs.
 *          Make sure, if you require a specific scheme object, that
 *          you check that it exists. This allows filters to convert
 *          proprietary URI schemes into regular ones.
 */
abstract class HTMLPurifier_URIFilter
{

    /**
     * Unique identifier of the filter.
     * @type string
     */
    public $name;

    /**
     * Set to true to have this filter run after scheme validation
     * rather than before it.
     * @type bool
     */
    public $post = false;

    /**
     * Set to true to load this filter unconditionally, so that a filter
     * named Foo does not need a matching %URI.Foo directive.
     * @type bool
     */
    public $always_load = false;

    /**
     * Initialization hook for the filter. Returning false means the
     * filter should not be considered active. The base implementation
     * does nothing and reports success.
     * @param HTMLPurifier_Config $config
     * @return bool
     */
    public function prepare($config)
    {
        return true;
    }

    /**
     * Filter a URI object
     * @param HTMLPurifier_URI $uri Reference to URI object variable
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool Whether or not to continue processing: false indicates
     *         URL is no good, true indicates continue processing. Note that
     *         all changes are committed directly on the URI object
     */
    abstract public function filter(&$uri, $config, $context);
}
/**
 * Parses a URI into the components and fragment identifier as specified
 * by RFC 3986.
 */
class HTMLPurifier_URIParser
{

    /**
     * Instance of HTMLPurifier_PercentEncoder to do normalization with.
     */
    protected $percentEncoder;

    public function __construct()
    {
        $this->percentEncoder = new HTMLPurifier_PercentEncoder();
    }

    /**
     * Parses a URI.
     * @param $uri string URI to parse
     * @return HTMLPurifier_URI representation of URI. This representation has
     *         not been validated yet and may not conform to RFC. Returns
     *         false if the top-level pattern fails to match at all.
     */
    public function parse($uri)
    {
        $uri = $this->percentEncoder->normalize($uri);

        // Regexp is as per Appendix B.
        // Note that ["<>] are an addition to the RFC's recommended
        // characters, because they represent external delimiters.
        // The numbers below are the capture groups holding each component.
        $r_URI = '!'.
            '(([a-zA-Z0-9\.\+\-]+):)?'. // 2. Scheme
            '(//([^/?#"<>]*))?'. // 4. Authority
            '([^?#"<>]*)'.       // 5. Path
            '(\?([^#"<>]*))?'.   // 7. Query
            '(#([^"<>]*))?'.     // 9. Fragment
            '!';

        $matches = array();
        $result = preg_match($r_URI, $uri, $matches);

        if (!$result) return false; // *really* invalid URI

        // separate out parts
        // The odd-numbered wrapper groups include their delimiter (":",
        // "//", "?", "#"), so they are never the string "0" and empty() is
        // a safe presence test for them.
        $scheme     = !empty($matches[1]) ? $matches[2] : null;
        $authority  = !empty($matches[3]) ? $matches[4] : null;
        $path       = $matches[5]; // always present, can be empty
        $query      = !empty($matches[6]) ? $matches[7] : null;
        $fragment   = !empty($matches[8]) ? $matches[9] : null;

        // further parse authority
        if ($authority !== null) {
            $r_authority = "/^((.+?)@)?(\[[^\]]+\]|[^:]*)(:(\d*))?/";
            $matches = array();
            preg_match($r_authority, $authority, $matches);
            $userinfo   = !empty($matches[1]) ? $matches[2] : null;
            // Group 3 is the bare host with no delimiter attached, so
            // empty() must not be used here: it treats the host "0" as
            // missing. Take the captured text as-is and fall back to ''
            // only when the group is absent.
            $host       = isset($matches[3]) ? $matches[3] : '';
            $port       = !empty($matches[4]) ? (int) $matches[5] : null;
        } else {
            $port = $host = $userinfo = null;
        }

        return new HTMLPurifier_URI(
            $scheme, $userinfo, $host, $port, $path, $query, $fragment);
    }

}
/**
 * Validator for the components of a URI for a specific scheme
 */
abstract class HTMLPurifier_URIScheme
{

    /**
     * Scheme's default port (integer). If an explicit port number is
     * specified that coincides with the default port, it will be
     * elided.
     * @type int
     */
    public $default_port = null;

    /**
     * Whether or not URIs of this scheme are locatable by a browser
     * http and ftp are accessible, while mailto and news are not.
     * @type bool
     */
    public $browsable = false;

    /**
     * Whether or not data transmitted over this scheme is encrypted.
     * https is secure, http is not.
     * @type bool
     */
    public $secure = false;

    /**
     * Whether or not the URI always uses <hier_part>, resolves edge cases
     * with making relative URIs absolute
     * @type bool
     */
    public $hierarchical = false;

    /**
     * Whether or not the URI may omit a hostname when the scheme is
     * explicitly specified, ala file:///path/to/file. As of writing,
     * 'file' is the only scheme that browsers support this properly.
     * @type bool
     */
    public $may_omit_host = false;

    /**
     * Validates the components of a URI for a specific scheme.
     * @param HTMLPurifier_URI $uri Reference to a HTMLPurifier_URI object
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool success or failure
     */
    abstract public function doValidate(&$uri, $config, $context);

    /**
     * Public interface for validating components of a URI.  Performs a
     * bunch of default actions. Don't overload this method.
     * @param HTMLPurifier_URI $uri Reference to a HTMLPurifier_URI object
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool success or failure
     */
    public function validate(&$uri, $config, $context)
    {
        // Elide a port that merely repeats the scheme's default.
        if ($this->default_port == $uri->port) {
            $uri->port = null;
        }

        // kludge: browsers do funny things when the scheme but not the
        // authority is set
        $scheme_absent = is_null($uri->scheme);
        $host_blank = ($uri->host === '');
        if ($scheme_absent) {
            // if the scheme is not present, a *blank* host is in error,
            // since this translates into '///path' which most browsers
            // interpret as being 'http://path'.
            $host_in_error = $host_blank;
        } else {
            // if the scheme is present, a missing host is always in error
            // (unless this scheme is allowed to omit it)
            $host_in_error = !$this->may_omit_host && ($host_blank || is_null($uri->host));
        }

        if ($host_in_error) {
            if ($scheme_absent && substr($uri->path, 0, 2) != '//') {
                // Dropping the blank host is enough here.
                $uri->host = null;
            } else {
                // Either the scheme is set, or the URI is '////path', in
                // which case we cannot nullify the host to preserve
                // semantics. Try expanding the hostname instead: see if we
                // can manually insert a hostname.
                $fallback_host = $config->get('URI.Host');
                if (is_null($fallback_host)) {
                    // we can't do anything sensible, reject the URL.
                    return false;
                }
                $uri->host = $fallback_host;
            }
        }

        return $this->doValidate($uri, $config, $context);
    }
}
/**
 * Registry for retrieving specific URI scheme validator objects.
 */
class HTMLPurifier_URISchemeRegistry
{

    /**
     * Retrieve sole instance of the registry.
     * @param HTMLPurifier_URISchemeRegistry $prototype Optional prototype to overload sole instance with,
     *                   or bool true to reset to default registry.
     * @return HTMLPurifier_URISchemeRegistry
     * @note Pass a registry object $prototype with a compatible interface and
     *       the function will copy it and return it all further times.
     */
    public static function instance($prototype = null)
    {
        static $instance = null;
        if ($prototype === true) {
            // Explicit reset. This must be tested before the generic
            // "a prototype was given" case: true is also non-null, so
            // otherwise the boolean itself would be stored as the instance.
            $instance = new HTMLPurifier_URISchemeRegistry();
        } elseif (is_object($prototype)) {
            // Only objects can stand in for the registry; other non-null
            // scalars are ignored rather than stored.
            $instance = $prototype;
        } elseif ($instance === null) {
            // First call without a prototype: create the default registry.
            $instance = new HTMLPurifier_URISchemeRegistry();
        }
        return $instance;
    }

    /**
     * Cache of retrieved schemes.
     * @type HTMLPurifier_URIScheme[]
     */
    protected $schemes = array();

    /**
     * Retrieves a scheme validator object
     * @param string $scheme String scheme name like http or mailto
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return HTMLPurifier_URIScheme Scheme object, or null when the scheme
     *         is not allowed or no class exists for it
     */
    public function getScheme($scheme, $config, $context)
    {
        if (!$config) {
            $config = HTMLPurifier_Config::createDefault();
        }

        // important, otherwise attacker could include arbitrary file
        $allowed_schemes = $config->get('URI.AllowedSchemes');
        if (!$config->get('URI.OverrideAllowedSchemes') &&
            !isset($allowed_schemes[$scheme])
        ) {
            return;
        }

        // Already instantiated, or registered via register().
        if (isset($this->schemes[$scheme])) {
            return $this->schemes[$scheme];
        }
        // Not cached: the class-name lookup below is only performed for
        // schemes on the allowed list, even when the override directive
        // let us past the first check.
        if (!isset($allowed_schemes[$scheme])) {
            return;
        }

        $class = 'HTMLPurifier_URIScheme_' . $scheme;
        if (!class_exists($class)) {
            return;
        }
        $this->schemes[$scheme] = new $class();
        return $this->schemes[$scheme];
    }

    /**
     * Registers a custom scheme to the cache, bypassing reflection.
     * @param string $scheme Scheme name
     * @param HTMLPurifier_URIScheme $scheme_obj
     */
    public function register($scheme, $scheme_obj)
    {
        $this->schemes[$scheme] = $scheme_obj;
    }
}
/**
 * Class for converting between different unit-lengths as specified by
 * CSS.
 */
class HTMLPurifier_UnitConverter
{

    const ENGLISH = 1;
    const METRIC = 2;
    const DIGITAL = 3;

    /**
     * Units information array. Units are grouped into measuring systems
     * (English, Metric), and are assigned an integer representing
     * the conversion factor between that unit and the smallest unit in
     * the system. Numeric indexes are actually magical constants that
     * encode conversion data from one system to the next, with a O(n^2)
     * constraint on memory (this is generally not a problem, since
     * the number of measuring systems is small.)
     *
     * Each cross-system entry is read by convert() as
     * array(unit to reach in this system, multiplier, resulting unit in
     * the other system).
     */
    protected static $units = array(
        self::ENGLISH => array(
            'px' => 3, // This is as per CSS 2.1 and Firefox. Your mileage may vary
            'pt' => 4,
            'pc' => 48,
            'in' => 288,
            self::METRIC => array('pt', '0.352777778', 'mm'),
        ),
        self::METRIC => array(
            'mm' => 1,
            'cm' => 10,
            self::ENGLISH => array('mm', '2.83464567', 'pt'),
        ),
    );

    /**
     * Minimum bcmath precision for output.
     * @type int
     */
    protected $outputPrecision;

    /**
     * Bcmath precision for internal calculations.
     * @type int
     */
    protected $internalPrecision;

    /**
     * Whether or not BCMath is available.
     * @type bool
     */
    private $bcmath;

    /**
     * @param int $output_precision Minimum number of significant figures in results
     * @param int $internal_precision Scale used for intermediate calculations
     * @param bool $force_no_bcmath Use native floats even if BCMath is loaded
     */
    public function __construct($output_precision = 4, $internal_precision = 10, $force_no_bcmath = false)
    {
        $this->outputPrecision = $output_precision;
        $this->internalPrecision = $internal_precision;
        $this->bcmath = !$force_no_bcmath && function_exists('bcmul');
    }

    /**
     * Converts a length object of one unit into another unit.
     * @param HTMLPurifier_Length $length
     *      Instance of HTMLPurifier_Length to convert. You must validate()
     *      it before passing it here!
     * @param string $to_unit
     *      Unit to convert to.
     * @return HTMLPurifier_Length|bool
     * @note
     *      About precision: This conversion function pays very special
     *      attention to the incoming precision of values and attempts
     *      to maintain a number of significant figure. Results are
     *      fairly accurate up to nine digits. Some caveats:
     *          - If a number is zero-padded as a result of this significant
     *            figure tracking, the zeroes will be eliminated.
     *          - If a number contains less than four sigfigs ($outputPrecision)
     *            and this causes some decimals to be excluded, those
     *            decimals will be added on.
     */
    public function convert($length, $to_unit)
    {
        if (!$length->isValid()) {
            return false;
        }

        $n = $length->getN();
        $unit = $length->getUnit();

        if ($n === '0' || $unit === false) {
            return new HTMLPurifier_Length('0', false);
        }

        // Find the measuring system of the source and of the target unit.
        $state = $dest_state = false;
        foreach (self::$units as $system => $table) {
            if (isset($table[$unit])) {
                $state = $system;
            }
            if (isset($table[$to_unit])) {
                $dest_state = $system;
            }
        }
        if (!$state || !$dest_state) {
            return false;
        }

        // Some calculations about the initial precision of the number;
        // this will be useful when we need to do final rounding.
        $sigfigs = $this->getSigFigs($n);
        if ($sigfigs < $this->outputPrecision) {
            $sigfigs = $this->outputPrecision;
        }

        // BCMath's internal precision deals only with decimals. Use
        // our default if the initial number has no decimals, or increase
        // it by how ever many decimals, thus, the number of guard digits
        // will always be greater than or equal to internalPrecision.
        $log = (int)floor(log(abs($n), 10));
        $cp = $this->internalPrecision; // internal precision
        if ($log < 0) {
            $cp = $this->internalPrecision - $log;
        }

        // At most two passes: one inside the source system and, after a
        // system shift, one inside the destination system.
        for ($pass = 0; $pass < 2; $pass++) {

            $same_system = ($dest_state === $state);

            // Determine what unit IN THIS SYSTEM we need to convert to:
            // the target itself, or the unit the system shift starts from.
            $dest_unit = $same_system ? $to_unit : self::$units[$state][$dest_state][0];

            // Do the conversion if necessary
            if ($dest_unit !== $unit) {
                $factor = $this->div(self::$units[$state][$unit], self::$units[$state][$dest_unit], $cp);
                $n = $this->mul($n, $factor, $cp);
                $unit = $dest_unit;
            }

            // Output was zero, so bail out early. Shouldn't ever happen.
            if ($n === '') {
                $n = '0';
                $unit = $to_unit;
                break;
            }

            // It was a simple conversion, so bail out
            if ($same_system) {
                break;
            }

            if ($pass !== 0) {
                // Conversion failed! Apparently, the system we forwarded
                // to didn't have this unit. This should never happen!
                return false;
            }

            // Pre-condition: $pass == 0

            // Perform conversion to next system of units
            $shift = self::$units[$state][$dest_state];
            $n = $this->mul($n, $shift[1], $cp);
            $unit = $shift[2];
            $state = $dest_state;

            // One more loop around to convert the unit in the new system.

        }

        // Post-condition: $unit == $to_unit
        if ($unit !== $to_unit) {
            return false;
        }

        $n = $this->round($n, $sigfigs);
        // Strip trailing zeroes only from a fractional part, then any
        // decimal point left dangling.
        if (strpos($n, '.') !== false) {
            $n = rtrim($n, '0');
        }
        $n = rtrim($n, '.');

        return new HTMLPurifier_Length($n, $unit);
    }

    /**
     * Returns the number of significant figures in a string number.
     * @param string $n Decimal number
     * @return int number of sigfigs
     */
    public function getSigFigs($n)
    {
        $n = ltrim($n, '0+-');
        $dp = strpos($n, '.'); // decimal position
        if ($dp === false) {
            // Integer: trailing zeroes are not counted.
            return strlen(rtrim($n, '0'));
        }
        $sigfigs = strlen(ltrim($n, '0.'));
        // eliminate extra decimal character, unless the leading trim
        // already removed it
        return ($dp !== 0) ? $sigfigs - 1 : $sigfigs;
    }

    /**
     * Adds two numbers, using arbitrary precision when available.
     * @param string $s1
     * @param string $s2
     * @param int $scale
     * @return string
     */
    private function add($s1, $s2, $scale)
    {
        if (!$this->bcmath) {
            return $this->scale((float)$s1 + (float)$s2, $scale);
        }
        return bcadd($s1, $s2, $scale);
    }

    /**
     * Multiples two numbers, using arbitrary precision when available.
     * @param string $s1
     * @param string $s2
     * @param int $scale
     * @return string
     */
    private function mul($s1, $s2, $scale)
    {
        if (!$this->bcmath) {
            return $this->scale((float)$s1 * (float)$s2, $scale);
        }
        return bcmul($s1, $s2, $scale);
    }

    /**
     * Divides two numbers, using arbitrary precision when available.
     * @param string $s1
     * @param string $s2
     * @param int $scale
     * @return string
     */
    private function div($s1, $s2, $scale)
    {
        if (!$this->bcmath) {
            return $this->scale((float)$s1 / (float)$s2, $scale);
        }
        return bcdiv($s1, $s2, $scale);
    }

    /**
     * Rounds a number according to the number of sigfigs it should have,
     * using arbitrary precision when available.
     * @param float $n
     * @param int $sigfigs
     * @return string
     */
    private function round($n, $sigfigs)
    {
        $new_log = (int)floor(log(abs($n), 10)); // Number of digits left of decimal - 1
        $rp = $sigfigs - $new_log - 1; // Number of decimal places needed
        $neg = $n < 0 ? '-' : ''; // Negative sign

        if (!$this->bcmath) {
            return $this->scale(round($n, $rp), $rp + 1);
        }

        if ($rp >= 0) {
            // Add half a unit in the last kept place, then truncate.
            $n = bcadd($n, $neg . '0.' . str_repeat('0', $rp) . '5', $rp + 1);
            return bcdiv($n, '1', $rp);
        }

        // This algorithm partially depends on the standardized
        // form of numbers that comes out of bcmath.
        $n = bcadd($n, $neg . '5' . str_repeat('0', $new_log - $sigfigs), 0);
        return substr($n, 0, $sigfigs + strlen($neg)) . str_repeat('0', $new_log - $sigfigs + 1);
    }

    /**
     * Scales a float to $scale digits right of decimal point, like BCMath.
     * @param float $r
     * @param int $scale
     * @return string
     */
    private function scale($r, $scale)
    {
        if ($scale >= 0) {
            return sprintf('%.' . $scale . 'f', (float)$r);
        }
        // The f sprintf type doesn't support negative numbers, so we
        // need to cludge things manually. First get the string.
        $r = sprintf('%.0f', (float)$r);
        // Due to floating point precision loss, $r will more than likely
        // look something like 4652999999999.9234. We grab one more digit
        // than we need to precise from $r and then use that to round
        // appropriately.
        $precise = (string)round(substr($r, 0, strlen($r) + $scale), -1);
        // Now we return it, truncating the zero that was rounded off.
        return substr($precise, 0, -1) . str_repeat('0', -$scale + 1);
    }
}
9873 $precise = (string)round(substr($r, 0, strlen($r) + $scale), -1); 9874 // Now we return it, truncating the zero that was rounded off. 9875 return substr($precise, 0, -1) . str_repeat('0', -$scale + 1); 9876 } 9877 return sprintf('%.' . $scale . 'f', (float)$r); 9878 } 9879} 9880 9881 9882 9883 9884 9885/** 9886 * Parses string representations into their corresponding native PHP 9887 * variable type. The base implementation does a simple type-check. 9888 */ 9889class HTMLPurifier_VarParser 9890{ 9891 9892 const STRING = 1; 9893 const ISTRING = 2; 9894 const TEXT = 3; 9895 const ITEXT = 4; 9896 const INT = 5; 9897 const FLOAT = 6; 9898 const BOOL = 7; 9899 const LOOKUP = 8; 9900 const ALIST = 9; 9901 const HASH = 10; 9902 const MIXED = 11; 9903 9904 /** 9905 * Lookup table of allowed types. Mainly for backwards compatibility, but 9906 * also convenient for transforming string type names to the integer constants. 9907 */ 9908 public static $types = array( 9909 'string' => self::STRING, 9910 'istring' => self::ISTRING, 9911 'text' => self::TEXT, 9912 'itext' => self::ITEXT, 9913 'int' => self::INT, 9914 'float' => self::FLOAT, 9915 'bool' => self::BOOL, 9916 'lookup' => self::LOOKUP, 9917 'list' => self::ALIST, 9918 'hash' => self::HASH, 9919 'mixed' => self::MIXED 9920 ); 9921 9922 /** 9923 * Lookup table of types that are string, and can have aliases or 9924 * allowed value lists. 9925 */ 9926 public static $stringTypes = array( 9927 self::STRING => true, 9928 self::ISTRING => true, 9929 self::TEXT => true, 9930 self::ITEXT => true, 9931 ); 9932 9933 /** 9934 * Validate a variable according to type. 9935 * It may return NULL as a valid type if $allow_null is true. 
9936 * 9937 * @param mixed $var Variable to validate 9938 * @param int $type Type of variable, see HTMLPurifier_VarParser->types 9939 * @param bool $allow_null Whether or not to permit null as a value 9940 * @return string Validated and type-coerced variable 9941 * @throws HTMLPurifier_VarParserException 9942 */ 9943 final public function parse($var, $type, $allow_null = false) 9944 { 9945 if (is_string($type)) { 9946 if (!isset(HTMLPurifier_VarParser::$types[$type])) { 9947 throw new HTMLPurifier_VarParserException("Invalid type '$type'"); 9948 } else { 9949 $type = HTMLPurifier_VarParser::$types[$type]; 9950 } 9951 } 9952 $var = $this->parseImplementation($var, $type, $allow_null); 9953 if ($allow_null && $var === null) { 9954 return null; 9955 } 9956 // These are basic checks, to make sure nothing horribly wrong 9957 // happened in our implementations. 9958 switch ($type) { 9959 case (self::STRING): 9960 case (self::ISTRING): 9961 case (self::TEXT): 9962 case (self::ITEXT): 9963 if (!is_string($var)) { 9964 break; 9965 } 9966 if ($type == self::ISTRING || $type == self::ITEXT) { 9967 $var = strtolower($var); 9968 } 9969 return $var; 9970 case (self::INT): 9971 if (!is_int($var)) { 9972 break; 9973 } 9974 return $var; 9975 case (self::FLOAT): 9976 if (!is_float($var)) { 9977 break; 9978 } 9979 return $var; 9980 case (self::BOOL): 9981 if (!is_bool($var)) { 9982 break; 9983 } 9984 return $var; 9985 case (self::LOOKUP): 9986 case (self::ALIST): 9987 case (self::HASH): 9988 if (!is_array($var)) { 9989 break; 9990 } 9991 if ($type === self::LOOKUP) { 9992 foreach ($var as $k) { 9993 if ($k !== true) { 9994 $this->error('Lookup table contains value other than true'); 9995 } 9996 } 9997 } elseif ($type === self::ALIST) { 9998 $keys = array_keys($var); 9999 if (array_keys($keys) !== $keys) { 10000 $this->error('Indices for list are not uniform'); 10001 } 10002 } 10003 return $var; 10004 case (self::MIXED): 10005 return $var; 10006 default: 10007 
$this->errorInconsistent(get_class($this), $type); 10008 } 10009 $this->errorGeneric($var, $type); 10010 } 10011 10012 /** 10013 * Actually implements the parsing. Base implementation does not 10014 * do anything to $var. Subclasses should overload this! 10015 * @param mixed $var 10016 * @param int $type 10017 * @param bool $allow_null 10018 * @return string 10019 */ 10020 protected function parseImplementation($var, $type, $allow_null) 10021 { 10022 return $var; 10023 } 10024 10025 /** 10026 * Throws an exception. 10027 * @throws HTMLPurifier_VarParserException 10028 */ 10029 protected function error($msg) 10030 { 10031 throw new HTMLPurifier_VarParserException($msg); 10032 } 10033 10034 /** 10035 * Throws an inconsistency exception. 10036 * @note This should not ever be called. It would be called if we 10037 * extend the allowed values of HTMLPurifier_VarParser without 10038 * updating subclasses. 10039 * @param string $class 10040 * @param int $type 10041 * @throws HTMLPurifier_Exception 10042 */ 10043 protected function errorInconsistent($class, $type) 10044 { 10045 throw new HTMLPurifier_Exception( 10046 "Inconsistency in $class: " . HTMLPurifier_VarParser::getTypeName($type) . 10047 " not implemented" 10048 ); 10049 } 10050 10051 /** 10052 * Generic error for if a type didn't work. 10053 * @param mixed $var 10054 * @param int $type 10055 */ 10056 protected function errorGeneric($var, $type) 10057 { 10058 $vtype = gettype($var); 10059 $this->error("Expected type " . HTMLPurifier_VarParser::getTypeName($type) . 
", got $vtype"); 10060 } 10061 10062 /** 10063 * @param int $type 10064 * @return string 10065 */ 10066 public static function getTypeName($type) 10067 { 10068 static $lookup; 10069 if (!$lookup) { 10070 // Lazy load the alternative lookup table 10071 $lookup = array_flip(HTMLPurifier_VarParser::$types); 10072 } 10073 if (!isset($lookup[$type])) { 10074 return 'unknown'; 10075 } 10076 return $lookup[$type]; 10077 } 10078} 10079 10080 10081 10082 10083 10084/** 10085 * Exception type for HTMLPurifier_VarParser 10086 */ 10087class HTMLPurifier_VarParserException extends HTMLPurifier_Exception 10088{ 10089 10090} 10091 10092 10093 10094 10095 10096/** 10097 * A zipper is a purely-functional data structure which contains 10098 * a focus that can be efficiently manipulated. It is known as 10099 * a "one-hole context". This mutable variant implements a zipper 10100 * for a list as a pair of two arrays, laid out as follows: 10101 * 10102 * Base list: 1 2 3 4 [ ] 6 7 8 9 10103 * Front list: 1 2 3 4 10104 * Back list: 9 8 7 6 10105 * 10106 * User is expected to keep track of the "current element" and properly 10107 * fill it back in as necessary. (ToDo: Maybe it's more user friendly 10108 * to implicitly track the current element?) 10109 * 10110 * Nota bene: the current class gets confused if you try to store NULLs 10111 * in the list. 10112 */ 10113 10114class HTMLPurifier_Zipper 10115{ 10116 public $front, $back; 10117 10118 public function __construct($front, $back) { 10119 $this->front = $front; 10120 $this->back = $back; 10121 } 10122 10123 /** 10124 * Creates a zipper from an array, with a hole in the 10125 * 0-index position. 10126 * @param Array to zipper-ify. 10127 * @return Tuple of zipper and element of first position. 
10128 */ 10129 static public function fromArray($array) { 10130 $z = new self(array(), array_reverse($array)); 10131 $t = $z->delete(); // delete the "dummy hole" 10132 return array($z, $t); 10133 } 10134 10135 /** 10136 * Convert zipper back into a normal array, optionally filling in 10137 * the hole with a value. (Usually you should supply a $t, unless you 10138 * are at the end of the array.) 10139 */ 10140 public function toArray($t = NULL) { 10141 $a = $this->front; 10142 if ($t !== NULL) $a[] = $t; 10143 for ($i = count($this->back)-1; $i >= 0; $i--) { 10144 $a[] = $this->back[$i]; 10145 } 10146 return $a; 10147 } 10148 10149 /** 10150 * Move hole to the next element. 10151 * @param $t Element to fill hole with 10152 * @return Original contents of new hole. 10153 */ 10154 public function next($t) { 10155 if ($t !== NULL) array_push($this->front, $t); 10156 return empty($this->back) ? NULL : array_pop($this->back); 10157 } 10158 10159 /** 10160 * Iterated hole advancement. 10161 * @param $t Element to fill hole with 10162 * @param $i How many forward to advance hole 10163 * @return Original contents of new hole, i away 10164 */ 10165 public function advance($t, $n) { 10166 for ($i = 0; $i < $n; $i++) { 10167 $t = $this->next($t); 10168 } 10169 return $t; 10170 } 10171 10172 /** 10173 * Move hole to the previous element 10174 * @param $t Element to fill hole with 10175 * @return Original contents of new hole. 10176 */ 10177 public function prev($t) { 10178 if ($t !== NULL) array_push($this->back, $t); 10179 return empty($this->front) ? NULL : array_pop($this->front); 10180 } 10181 10182 /** 10183 * Delete contents of current hole, shifting hole to 10184 * next element. 10185 * @return Original contents of new hole. 10186 */ 10187 public function delete() { 10188 return empty($this->back) ? NULL : array_pop($this->back); 10189 } 10190 10191 /** 10192 * Returns true if we are at the end of the list. 
10193 * @return bool 10194 */ 10195 public function done() { 10196 return empty($this->back); 10197 } 10198 10199 /** 10200 * Insert element before hole. 10201 * @param Element to insert 10202 */ 10203 public function insertBefore($t) { 10204 if ($t !== NULL) array_push($this->front, $t); 10205 } 10206 10207 /** 10208 * Insert element after hole. 10209 * @param Element to insert 10210 */ 10211 public function insertAfter($t) { 10212 if ($t !== NULL) array_push($this->back, $t); 10213 } 10214 10215 /** 10216 * Splice in multiple elements at hole. Functional specification 10217 * in terms of array_splice: 10218 * 10219 * $arr1 = $arr; 10220 * $old1 = array_splice($arr1, $i, $delete, $replacement); 10221 * 10222 * list($z, $t) = HTMLPurifier_Zipper::fromArray($arr); 10223 * $t = $z->advance($t, $i); 10224 * list($old2, $t) = $z->splice($t, $delete, $replacement); 10225 * $arr2 = $z->toArray($t); 10226 * 10227 * assert($old1 === $old2); 10228 * assert($arr1 === $arr2); 10229 * 10230 * NB: the absolute index location after this operation is 10231 * *unchanged!* 10232 * 10233 * @param Current contents of hole. 10234 */ 10235 public function splice($t, $delete, $replacement) { 10236 // delete 10237 $old = array(); 10238 $r = $t; 10239 for ($i = $delete; $i > 0; $i--) { 10240 $old[] = $r; 10241 $r = $this->delete(); 10242 } 10243 // insert 10244 for ($i = count($replacement)-1; $i >= 0; $i--) { 10245 $this->insertAfter($r); 10246 $r = $replacement[$i]; 10247 } 10248 return array($old, $r); 10249 } 10250} 10251 10252 10253 10254/** 10255 * Validates the HTML attribute style, otherwise known as CSS. 10256 * @note We don't implement the whole CSS specification, so it might be 10257 * difficult to reuse this component in the context of validating 10258 * actual stylesheet declarations. 10259 * @note If we were really serious about validating the CSS, we would 10260 * tokenize the styles and then parse the tokens. Obviously, we 10261 * are not doing that. 
/**
 * Validates the HTML attribute style, otherwise known as CSS.
 * @note We don't implement the whole CSS specification, so it might be
 *       difficult to reuse this component in the context of validating
 *       actual stylesheet declarations.
 * @note If we were really serious about validating the CSS, we would
 *       tokenize the styles and then parse the tokens. Obviously, we
 *       are not doing that. Doing that could seriously harm performance,
 *       but would make these components a lot more viable for a CSS
 *       filtering solution.
 */
class HTMLPurifier_AttrDef_CSS extends HTMLPurifier_AttrDef
{

    /**
     * Splits an inline style into declarations, validates each one against
     * the CSS definition and reassembles the survivors.
     * @param string $css
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool|string Cleaned "prop:value;" list, or false if nothing survived
     */
    public function validate($css, $config, $context)
    {
        $css = $this->parseCDATA($css);

        $definition = $config->getCSSDefinition();
        $allow_duplicates = $config->get("CSS.AllowDuplicates");

        // Split on semicolons, except for semicolons inside quoted
        // strings, which are part of the value. This scanner only tracks
        // quotes; it is deliberately not a real CSS tokenizer.
        $len = strlen($css);
        $accum = "";
        $declarations = array();
        $quoted = false; // the opening quote character while inside a string
        $pos = 0;
        while ($pos < $len) {
            // Copy everything up to the next character of interest.
            $run = strcspn($css, ";'\"", $pos);
            $accum .= substr($css, $pos, $run);
            $pos += $run;
            if ($pos == $len) {
                break;
            }
            $char = $css[$pos];
            $pos++;

            if (!$quoted && $char == ";") {
                // Unquoted semicolon: the declaration is complete.
                $declarations[] = $accum;
                $accum = "";
                continue;
            }

            $accum .= $char;
            if (!$quoted) {
                // Opening quote.
                $quoted = $char;
            } elseif ($char == $quoted) {
                // Matching closing quote.
                $quoted = false;
            }
        }
        if ($accum != "") {
            $declarations[] = $accum;
        }

        $propvalues = array();
        $new_declarations = '';

        /**
         * Name of the current CSS property being validated.
         */
        $property = false;
        // Registered by reference: assignments to $property below are what
        // the context hands out while a property validator runs.
        $context->register('CurrentCSSProperty', $property);

        foreach ($declarations as $declaration) {
            // Skip empty declarations and ones without a property name
            // before the colon (no colon, or colon in first position).
            if (!$declaration || !strpos($declaration, ':')) {
                continue;
            }
            $pieces = explode(':', $declaration, 2);
            $property = trim($pieces[0]);
            $value = trim($pieces[1]);

            // Look the property up as written; retry in lowercase only if
            // it is not already entirely lowercase letters.
            $known = isset($definition->info[$property]);
            if (!$known && !ctype_lower($property)) {
                $property = strtolower($property);
                $known = isset($definition->info[$property]);
            }
            if (!$known) {
                continue;
            }

            // inefficient call, since the validator will do this again
            if (strtolower(trim($value)) === 'inherit') {
                // inherit works for everything (but only on the base property)
                $result = 'inherit';
            } else {
                $result = $definition->info[$property]->validate(
                    $value,
                    $config,
                    $context
                );
            }
            if ($result === false) {
                continue;
            }

            if ($allow_duplicates) {
                // Emit immediately, keeping every occurrence.
                $new_declarations .= "$property:$result;";
            } else {
                // Later occurrences overwrite earlier ones.
                $propvalues[$property] = $result;
            }
        }

        $context->destroy('CurrentCSSProperty');

        // procedure does not write the new CSS simultaneously, so it's
        // slightly inefficient, but it's the only way of getting rid of
        // duplicates. Perhaps config to optimize it, but not now.

        foreach ($propvalues as $prop => $value) {
            $new_declarations .= "$prop:$value;";
        }

        if ($new_declarations) {
            return $new_declarations;
        }
        return false;

    }

}
/**
 * Dummy AttrDef that mimics another AttrDef, BUT it generates clones
 * with make.
 */
class HTMLPurifier_AttrDef_Clone extends HTMLPurifier_AttrDef
{
    /**
     * What we're cloning.
     * @type HTMLPurifier_AttrDef
     */
    protected $clone;

    /**
     * @param HTMLPurifier_AttrDef $clone Definition to wrap
     */
    public function __construct($clone)
    {
        $this->clone = $clone;
    }

    /**
     * Delegates validation to the wrapped definition.
     * @param string $v
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool|string
     */
    public function validate($v, $config, $context)
    {
        $wrapped = $this->clone;
        return $wrapped->validate($v, $config, $context);
    }

    /**
     * Returns a copy of the wrapped definition; $string is not used.
     * @param string $string
     * @return HTMLPurifier_AttrDef
     */
    public function make($string)
    {
        $copy = clone $this->clone;
        return $copy;
    }
}





// Enum = Enumerated
/**
 * Validates a keyword against a list of valid values.
 * @warning The case-insensitive compare of this function uses PHP's
 *          built-in strtolower and ctype_lower functions, which may
 *          cause problems with international comparisons
 */
class HTMLPurifier_AttrDef_Enum extends HTMLPurifier_AttrDef
{

    /**
     * Lookup table of valid values.
     * @type array
     * @todo Make protected
     */
    public $valid_values = array();

    /**
     * Bool indicating whether or not enumeration is case sensitive.
     * @note In general this is always case insensitive.
     */
    protected $case_sensitive = false; // values according to W3C spec

    /**
     * @param array $valid_values List of valid values
     * @param bool $case_sensitive Whether or not case sensitive
     */
    public function __construct($valid_values = array(), $case_sensitive = false)
    {
        // Flipped so that membership is a single isset() on the key.
        $this->valid_values = array_flip($valid_values);
        $this->case_sensitive = $case_sensitive;
    }

    /**
     * @param string $string
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool|string The trimmed (and, unless case sensitive,
     *                     lowercased) value, or false if it is not listed
     */
    public function validate($string, $config, $context)
    {
        $string = trim($string);
        // we may want to do full case-insensitive libraries
        if (!$this->case_sensitive && !ctype_lower($string)) {
            $string = strtolower($string);
        }
        if (isset($this->valid_values[$string])) {
            return $string;
        }
        return false;
    }

    /**
     * @param string $string In form of comma-delimited list of case-insensitive
     *      valid values. Example: "foo,bar,baz". Prepend "s:" to make
     *      case sensitive
     * @return HTMLPurifier_AttrDef_Enum
     */
    public function make($string)
    {
        // An "s:" prefix followed by at least one more character asks for
        // a case-sensitive enumeration.
        $sensitive = strlen($string) > 2 && $string[0] == 's' && $string[1] == ':';
        if ($sensitive) {
            $string = substr($string, 2);
        }
        return new HTMLPurifier_AttrDef_Enum(explode(',', $string), $sensitive);
    }
}
/**
 * Validates an integer.
 * @note While this class was modeled off the CSS definition, no currently
 *       allowed CSS uses this type.  The properties that do are: widows,
 *       orphans, z-index, counter-increment, counter-reset.  Some of the
 *       HTML attributes, however, find use for a non-negative version of this.
 */
class HTMLPurifier_AttrDef_Integer extends HTMLPurifier_AttrDef
{

    /**
     * Whether or not negative values are allowed.
     * @type bool
     */
    protected $negative = true;

    /**
     * Whether or not zero is allowed.
     * @type bool
     */
    protected $zero = true;

    /**
     * Whether or not positive values are allowed.
     * @type bool
     */
    protected $positive = true;

    /**
     * @param $negative Bool indicating whether or not negative values are allowed
     * @param $zero Bool indicating whether or not zero is allowed
     * @param $positive Bool indicating whether or not positive values are allowed
     */
    public function __construct($negative = true, $zero = true, $positive = true)
    {
        $this->negative = $negative;
        $this->zero = $zero;
        $this->positive = $positive;
    }

    /**
     * @param string $integer
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool|string Normalized integer string, or false if rejected
     */
    public function validate($integer, $config, $context)
    {
        $integer = $this->parseCDATA($integer);
        if ($integer === '') {
            return false;
        }

        // we could possibly simply typecast it to integer, but there are
        // certain fringe cases that must not return an integer.

        // clip leading sign; $digits is the part that must be all digits
        $lead = $integer[0];
        if ($this->negative && $lead === '-') {
            $digits = substr($integer, 1);
            if ($digits === '0') {
                // rm minus sign for zero
                $integer = '0';
            }
        } elseif ($this->positive && $lead === '+') {
            // rm unnecessary plus
            $integer = substr($integer, 1);
            $digits = $integer;
        } else {
            // A sign that is not permitted stays in $digits and fails the
            // digit test below.
            $digits = $integer;
        }

        // test if it's numeric
        if (!ctype_digit($digits)) {
            return false;
        }

        // perform scope tests
        $zero_forbidden = !$this->zero && $integer == 0;
        if ($zero_forbidden
            || (!$this->positive && $integer > 0)
            || (!$this->negative && $integer < 0)
        ) {
            return false;
        }

        return $integer;
    }
}





/**
 * Validates the HTML attribute lang, effectively a language code.
 * @note Built according to RFC 3066, which obsoleted RFC 1766
 */
class HTMLPurifier_AttrDef_Lang extends HTMLPurifier_AttrDef
{

    /**
     * Keeps the longest valid, lowercased prefix of subtags; only an
     * invalid primary subtag rejects the whole value.
     * @param string $string
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool|string
     */
    public function validate($string, $config, $context)
    {
        $string = trim($string);
        if (!$string) {
            return false;
        }

        $subtags = explode('-', $string);
        $num_subtags = count($subtags);

        if ($num_subtags == 0) { // sanity check
            return false;
        }

        // process primary subtag : $subtags[0]
        $primary = $subtags[0];
        $length = strlen($primary);
        if ($length < 1 || $length > 3) {
            return false;
        }
        if ($length == 1) {
            // Only the single-letter prefixes 'x' and 'i' are accepted.
            if ($primary != 'x' && $primary != 'i') {
                return false;
            }
        } else {
            // Two or three letters, normalized to lowercase.
            if (!ctype_alpha($primary)) {
                return false;
            }
            if (!ctype_lower($primary)) {
                $primary = strtolower($primary);
            }
        }

        $new_string = $primary;

        // process the remaining subtags; the first invalid one ends the
        // code, keeping what has been accepted so far
        for ($i = 1; $i < $num_subtags; $i++) {
            $subtag = $subtags[$i];
            $length = strlen($subtag);
            // The second subtag has one extra rule: a single character
            // is only allowed when it is 'x'.
            $lone_non_x = ($i == 1 && $length == 1 && $subtag != 'x');
            if ($length == 0 || $lone_non_x || $length > 8 || !ctype_alnum($subtag)) {
                break;
            }
            if (!ctype_lower($subtag)) {
                $subtag = strtolower($subtag);
            }
            $new_string .= '-' . $subtag;
        }
        return $new_string;
    }
}
/**
 * Decorator that, depending on a token, switches between two definitions.
 */
class HTMLPurifier_AttrDef_Switch
{

    /**
     * @type string
     */
    protected $tag;

    /**
     * @type HTMLPurifier_AttrDef
     */
    protected $withTag;

    /**
     * @type HTMLPurifier_AttrDef
     */
    protected $withoutTag;

    /**
     * @param string $tag Tag name to switch upon
     * @param HTMLPurifier_AttrDef $with_tag Call if token matches tag
     * @param HTMLPurifier_AttrDef $without_tag Call if token doesn't match, or there is no token
     */
    public function __construct($tag, $with_tag, $without_tag)
    {
        $this->tag = $tag;
        $this->withTag = $with_tag;
        $this->withoutTag = $without_tag;
    }

    /**
     * Picks the definition according to the 'CurrentToken' context
     * variable and delegates validation to it.
     * @param string $string
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool|string
     */
    public function validate($string, $config, $context)
    {
        $token = $context->get('CurrentToken', true);
        // No current token counts as "doesn't match".
        $matches = $token && $token->name === $this->tag;
        $delegate = $matches ? $this->withTag : $this->withoutTag;
        return $delegate->validate($string, $config, $context);
    }
}





/**
 * Validates arbitrary text according to the HTML spec.
 */
class HTMLPurifier_AttrDef_Text extends HTMLPurifier_AttrDef
{

    /**
     * @param string $string
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool|string The text after parseCDATA() processing
     */
    public function validate($string, $config, $context)
    {
        $text = $this->parseCDATA($string);
        return $text;
    }
}
/**
 * Validates a URI as defined by RFC 3986.
 * @note Scheme-specific mechanics deferred to HTMLPurifier_URIScheme
 */
class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
{

    /**
     * @type HTMLPurifier_URIParser
     */
    protected $parser;

    /**
     * @type bool
     */
    protected $embedsResource;

    /**
     * @param bool $embeds_resource Does the URI here result in an extra HTTP request?
     */
    public function __construct($embeds_resource = false)
    {
        $this->parser = new HTMLPurifier_URIParser();
        $this->embedsResource = (bool)$embeds_resource;
    }

    /**
     * @param string $string 'embedded' to create a definition for URIs
     *                       that embed a resource
     * @return HTMLPurifier_AttrDef_URI
     */
    public function make($string)
    {
        return new HTMLPurifier_AttrDef_URI($string === 'embedded');
    }

    /**
     * @param string $uri
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool|string Validated URI as a string, or false if rejected
     */
    public function validate($uri, $config, $context)
    {
        if ($config->get('URI.Disable')) {
            return false;
        }

        $uri = $this->parseCDATA($uri);

        // parse the URI
        $uri = $this->parser->parse($uri);
        if ($uri === false) {
            return false;
        }

        // add embedded flag to context for validators; it is removed
        // again whether or not the URI survives
        $context->register('EmbeddedURI', $this->embedsResource);
        $ok = $this->runGauntlet($uri, $config, $context);
        $context->destroy('EmbeddedURI');

        if (!$ok) {
            return false;
        }
        // back to string
        return $uri->toString();
    }

    /**
     * Runs the validation and filtering stages in order, stopping at the
     * first one that fails.
     * @param HTMLPurifier_URI $uri Taken by reference so that a stage that
     *      replaces the URI is still visible to the caller — NOTE(review):
     *      the stage signatures are not visible here; confirm they can
     *      reassign it.
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool True if the URI survived every stage
     */
    private function runGauntlet(&$uri, $config, $context)
    {
        // generic validation
        if (!$uri->validate($config, $context)) {
            return false;
        }

        // chained filtering
        $uri_def = $config->getDefinition('URI');
        if (!$uri_def->filter($uri, $config, $context)) {
            return false;
        }

        // scheme-specific validation
        $scheme_obj = $uri->getSchemeObj($config, $context);
        if (!$scheme_obj) {
            return false;
        }
        if ($this->embedsResource && !$scheme_obj->browsable) {
            return false;
        }
        if (!$scheme_obj->validate($uri, $config, $context)) {
            return false;
        }

        // Post chained filtering
        if (!$uri_def->postFilter($uri, $config, $context)) {
            return false;
        }

        // survived gauntlet
        return true;
    }
}





/**
 * Validates a number as defined by the CSS spec.
 */
class HTMLPurifier_AttrDef_CSS_Number extends HTMLPurifier_AttrDef
{

    /**
     * Indicates whether or not only positive values are allowed.
     * @type bool
     */
    protected $non_negative = false;

    /**
     * @param bool $non_negative indicates whether negatives are forbidden
     */
    public function __construct($non_negative = false)
    {
        $this->non_negative = $non_negative;
    }

    /**
     * @param string $number
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return string|bool Normalized number, or false if it is not one
     * @warning Some contexts do not pass $config, $context. These
     *          variables should not be used without checking HTMLPurifier_Length
     */
    public function validate($number, $config, $context)
    {
        $number = $this->parseCDATA($number);

        if ($number === '') {
            return false;
        }
        if ($number === '0') {
            return '0';
        }

        // Strip a leading sign; only a minus is remembered.
        $sign = '';
        $lead = $number[0];
        if ($lead === '-') {
            if ($this->non_negative) {
                return false;
            }
            $sign = '-';
        }
        if ($lead === '-' || $lead === '+') {
            $number = substr($number, 1);
        }

        // Plain integer: drop leading zeros.
        if (ctype_digit($number)) {
            $number = ltrim($number, '0');
            if (!$number) {
                return '0';
            }
            return $sign . $number;
        }

        // Period is the only non-numeric character allowed
        if (strpos($number, '.') === false) {
            return false;
        }

        $halves = explode('.', $number, 2);
        $left = $halves[0];
        $right = $halves[1];

        if ($left === '' && $right === '') {
            return false;
        }
        if ($left !== '' && !ctype_digit($left)) {
            return false;
        }

        // Zeros that carry no value are removed on both sides.
        $left = ltrim($left, '0');
        $right = rtrim($right, '0');

        if ($right === '') {
            // Nothing left after the point: the number is an integer.
            if (!$left) {
                return '0';
            }
            return $sign . $left;
        }
        if (!ctype_digit($right)) {
            return false;
        }
        return $sign . $left . '.' . $right;
    }
}





/**
 * Validates a CSS number and clamps it to the range 0..1.
 */
class HTMLPurifier_AttrDef_CSS_AlphaValue extends HTMLPurifier_AttrDef_CSS_Number
{

    public function __construct()
    {
        parent::__construct(false); // opacity is non-negative, but we will clamp it
    }

    /**
     * @param string $number
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return string|bool Number clamped to 0..1, or false if it is not a number
     */
    public function validate($number, $config, $context)
    {
        $result = parent::validate($number, $config, $context);
        if ($result === false) {
            return false;
        }
        $float = (float)$result;
        if ($float < 0.0) {
            return '0';
        }
        if ($float > 1.0) {
            return '1';
        }
        return $result;
    }
}





/**
 * Validates shorthand CSS property background.
 * @warning Does not support url tokens that have internal spaces.
 */
class HTMLPurifier_AttrDef_CSS_Background extends HTMLPurifier_AttrDef
{

    /**
     * Local copy of component validators.
     * @type HTMLPurifier_AttrDef[]
     * @note See HTMLPurifier_AttrDef_Font::$info for a similar impl.
     */
    protected $info;

    /**
     * @param HTMLPurifier_Config $config
     */
    public function __construct($config)
    {
        $def = $config->getCSSDefinition();
        $components = array('color', 'image', 'repeat', 'attachment', 'position');
        foreach ($components as $component) {
            $name = 'background-' . $component;
            $this->info[$name] = $def->info[$name];
        }
    }

    /**
     * @param string $string
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool|string
     */
    public function validate($string, $config, $context)
    {
        // regular pre-processing
        $string = $this->parseCDATA($string);
        if ($string === '') {
            return false;
        }

        // munge rgb() decl if necessary
        $string = $this->mungeRgb($string);

        // assumes URI doesn't have spaces in it
        $bits = explode(' ', $string); // bits to process

        // Key order is also the order of the output.
        $caught = array(
            'color' => false,
            'image' => false,
            'repeat' => false,
            'attachment' => false,
            'position' => false,
        );
        // Components that take exactly one word and may appear only once.
        $single = array('color', 'image', 'repeat', 'attachment');

        $i = 0; // number of catches

        foreach ($bits as $bit) {
            if ($bit === '') {
                continue;
            }

            // Give the word to the first still-empty component whose
            // validator accepts it.
            $taken = false;
            foreach ($single as $key) {
                if ($caught[$key] !== false) {
                    continue;
                }
                $r = $this->info['background-' . $key]->validate($bit, $config, $context);
                if ($r === false) {
                    continue;
                }
                $caught[$key] = $r;
                $taken = true;
                break;
            }

            // Anything unclaimed is collected as part of the position and
            // validated as a whole afterwards.
            if (!$taken) {
                if ($caught['position'] === false) {
                    $caught['position'] = '';
                }
                $caught['position'] .= $bit . ' ';
            }
            $i++;
        }

        if (!$i) {
            return false;
        }
        if ($caught['position'] !== false) {
            $caught['position'] = $this->info['background-position']->
                validate($caught['position'], $config, $context);
        }

        // Keep only the components that ended up with a value.
        $ret = array();
        foreach ($caught as $value) {
            if ($value !== false) {
                $ret[] = $value;
            }
        }

        if (empty($ret)) {
            return false;
        }
        return implode(' ', $ret);
    }
}
11149 */ 11150class HTMLPurifier_AttrDef_CSS_BackgroundPosition extends HTMLPurifier_AttrDef 11151{ 11152 11153 /** 11154 * @type HTMLPurifier_AttrDef_CSS_Length 11155 */ 11156 protected $length; 11157 11158 /** 11159 * @type HTMLPurifier_AttrDef_CSS_Percentage 11160 */ 11161 protected $percentage; 11162 11163 public function __construct() 11164 { 11165 $this->length = new HTMLPurifier_AttrDef_CSS_Length(); 11166 $this->percentage = new HTMLPurifier_AttrDef_CSS_Percentage(); 11167 } 11168 11169 /** 11170 * @param string $string 11171 * @param HTMLPurifier_Config $config 11172 * @param HTMLPurifier_Context $context 11173 * @return bool|string 11174 */ 11175 public function validate($string, $config, $context) 11176 { 11177 $string = $this->parseCDATA($string); 11178 $bits = explode(' ', $string); 11179 11180 $keywords = array(); 11181 $keywords['h'] = false; // left, right 11182 $keywords['v'] = false; // top, bottom 11183 $keywords['ch'] = false; // center (first word) 11184 $keywords['cv'] = false; // center (second word) 11185 $measures = array(); 11186 11187 $i = 0; 11188 11189 $lookup = array( 11190 'top' => 'v', 11191 'bottom' => 'v', 11192 'left' => 'h', 11193 'right' => 'h', 11194 'center' => 'c' 11195 ); 11196 11197 foreach ($bits as $bit) { 11198 if ($bit === '') { 11199 continue; 11200 } 11201 11202 // test for keyword 11203 $lbit = ctype_lower($bit) ? 
$bit : strtolower($bit); 11204 if (isset($lookup[$lbit])) { 11205 $status = $lookup[$lbit]; 11206 if ($status == 'c') { 11207 if ($i == 0) { 11208 $status = 'ch'; 11209 } else { 11210 $status = 'cv'; 11211 } 11212 } 11213 $keywords[$status] = $lbit; 11214 $i++; 11215 } 11216 11217 // test for length 11218 $r = $this->length->validate($bit, $config, $context); 11219 if ($r !== false) { 11220 $measures[] = $r; 11221 $i++; 11222 } 11223 11224 // test for percentage 11225 $r = $this->percentage->validate($bit, $config, $context); 11226 if ($r !== false) { 11227 $measures[] = $r; 11228 $i++; 11229 } 11230 } 11231 11232 if (!$i) { 11233 return false; 11234 } // no valid values were caught 11235 11236 $ret = array(); 11237 11238 // first keyword 11239 if ($keywords['h']) { 11240 $ret[] = $keywords['h']; 11241 } elseif ($keywords['ch']) { 11242 $ret[] = $keywords['ch']; 11243 $keywords['cv'] = false; // prevent re-use: center = center center 11244 } elseif (count($measures)) { 11245 $ret[] = array_shift($measures); 11246 } 11247 11248 if ($keywords['v']) { 11249 $ret[] = $keywords['v']; 11250 } elseif ($keywords['cv']) { 11251 $ret[] = $keywords['cv']; 11252 } elseif (count($measures)) { 11253 $ret[] = array_shift($measures); 11254 } 11255 11256 if (empty($ret)) { 11257 return false; 11258 } 11259 return implode(' ', $ret); 11260 } 11261} 11262 11263 11264 11265 11266 11267/** 11268 * Validates the border property as defined by CSS. 11269 */ 11270class HTMLPurifier_AttrDef_CSS_Border extends HTMLPurifier_AttrDef 11271{ 11272 11273 /** 11274 * Local copy of properties this property is shorthand for. 
11275 * @type HTMLPurifier_AttrDef[] 11276 */ 11277 protected $info = array(); 11278 11279 /** 11280 * @param HTMLPurifier_Config $config 11281 */ 11282 public function __construct($config) 11283 { 11284 $def = $config->getCSSDefinition(); 11285 $this->info['border-width'] = $def->info['border-width']; 11286 $this->info['border-style'] = $def->info['border-style']; 11287 $this->info['border-top-color'] = $def->info['border-top-color']; 11288 } 11289 11290 /** 11291 * @param string $string 11292 * @param HTMLPurifier_Config $config 11293 * @param HTMLPurifier_Context $context 11294 * @return bool|string 11295 */ 11296 public function validate($string, $config, $context) 11297 { 11298 $string = $this->parseCDATA($string); 11299 $string = $this->mungeRgb($string); 11300 $bits = explode(' ', $string); 11301 $done = array(); // segments we've finished 11302 $ret = ''; // return value 11303 foreach ($bits as $bit) { 11304 foreach ($this->info as $propname => $validator) { 11305 if (isset($done[$propname])) { 11306 continue; 11307 } 11308 $r = $validator->validate($bit, $config, $context); 11309 if ($r !== false) { 11310 $ret .= $r . ' '; 11311 $done[$propname] = true; 11312 break; 11313 } 11314 } 11315 } 11316 return rtrim($ret); 11317 } 11318} 11319 11320 11321 11322 11323 11324/** 11325 * Validates Color as defined by CSS. 
11326 */ 11327class HTMLPurifier_AttrDef_CSS_Color extends HTMLPurifier_AttrDef 11328{ 11329 11330 /** 11331 * @type HTMLPurifier_AttrDef_CSS_AlphaValue 11332 */ 11333 protected $alpha; 11334 11335 public function __construct() 11336 { 11337 $this->alpha = new HTMLPurifier_AttrDef_CSS_AlphaValue(); 11338 } 11339 11340 /** 11341 * @param string $color 11342 * @param HTMLPurifier_Config $config 11343 * @param HTMLPurifier_Context $context 11344 * @return bool|string 11345 */ 11346 public function validate($color, $config, $context) 11347 { 11348 static $colors = null; 11349 if ($colors === null) { 11350 $colors = $config->get('Core.ColorKeywords'); 11351 } 11352 11353 $color = trim($color); 11354 if ($color === '') { 11355 return false; 11356 } 11357 11358 $lower = strtolower($color); 11359 if (isset($colors[$lower])) { 11360 return $colors[$lower]; 11361 } 11362 11363 if (preg_match('#(rgb|rgba|hsl|hsla)\(#', $color, $matches) === 1) { 11364 $length = strlen($color); 11365 if (strpos($color, ')') !== $length - 1) { 11366 return false; 11367 } 11368 11369 // get used function : rgb, rgba, hsl or hsla 11370 $function = $matches[1]; 11371 11372 $parameters_size = 3; 11373 $alpha_channel = false; 11374 if (substr($function, -1) === 'a') { 11375 $parameters_size = 4; 11376 $alpha_channel = true; 11377 } 11378 11379 /* 11380 * Allowed types for values : 11381 * parameter_position => [type => max_value] 11382 */ 11383 $allowed_types = array( 11384 1 => array('percentage' => 100, 'integer' => 255), 11385 2 => array('percentage' => 100, 'integer' => 255), 11386 3 => array('percentage' => 100, 'integer' => 255), 11387 ); 11388 $allow_different_types = false; 11389 11390 if (strpos($function, 'hsl') !== false) { 11391 $allowed_types = array( 11392 1 => array('integer' => 360), 11393 2 => array('percentage' => 100), 11394 3 => array('percentage' => 100), 11395 ); 11396 $allow_different_types = true; 11397 } 11398 11399 $values = trim(str_replace($function, '', $color), ' ()'); 
11400 11401 $parts = explode(',', $values); 11402 if (count($parts) !== $parameters_size) { 11403 return false; 11404 } 11405 11406 $type = false; 11407 $new_parts = array(); 11408 $i = 0; 11409 11410 foreach ($parts as $part) { 11411 $i++; 11412 $part = trim($part); 11413 11414 if ($part === '') { 11415 return false; 11416 } 11417 11418 // different check for alpha channel 11419 if ($alpha_channel === true && $i === count($parts)) { 11420 $result = $this->alpha->validate($part, $config, $context); 11421 11422 if ($result === false) { 11423 return false; 11424 } 11425 11426 $new_parts[] = (string)$result; 11427 continue; 11428 } 11429 11430 if (substr($part, -1) === '%') { 11431 $current_type = 'percentage'; 11432 } else { 11433 $current_type = 'integer'; 11434 } 11435 11436 if (!array_key_exists($current_type, $allowed_types[$i])) { 11437 return false; 11438 } 11439 11440 if (!$type) { 11441 $type = $current_type; 11442 } 11443 11444 if ($allow_different_types === false && $type != $current_type) { 11445 return false; 11446 } 11447 11448 $max_value = $allowed_types[$i][$current_type]; 11449 11450 if ($current_type == 'integer') { 11451 // Return value between range 0 -> $max_value 11452 $new_parts[] = (int)max(min($part, $max_value), 0); 11453 } elseif ($current_type == 'percentage') { 11454 $new_parts[] = (float)max(min(rtrim($part, '%'), $max_value), 0) . '%'; 11455 } 11456 } 11457 11458 $new_values = implode(',', $new_parts); 11459 11460 $color = $function . '(' . $new_values . ')'; 11461 } else { 11462 // hexadecimal handling 11463 if ($color[0] === '#') { 11464 $hex = substr($color, 1); 11465 } else { 11466 $hex = $color; 11467 $color = '#' . 
$color; 11468 } 11469 $length = strlen($hex); 11470 if ($length !== 3 && $length !== 6) { 11471 return false; 11472 } 11473 if (!ctype_xdigit($hex)) { 11474 return false; 11475 } 11476 } 11477 return $color; 11478 } 11479 11480} 11481 11482 11483 11484 11485 11486/** 11487 * Allows multiple validators to attempt to validate attribute. 11488 * 11489 * Composite is just what it sounds like: a composite of many validators. 11490 * This means that multiple HTMLPurifier_AttrDef objects will have a whack 11491 * at the string. If one of them passes, that's what is returned. This is 11492 * especially useful for CSS values, which often are a choice between 11493 * an enumerated set of predefined values or a flexible data type. 11494 */ 11495class HTMLPurifier_AttrDef_CSS_Composite extends HTMLPurifier_AttrDef 11496{ 11497 11498 /** 11499 * List of objects that may process strings. 11500 * @type HTMLPurifier_AttrDef[] 11501 * @todo Make protected 11502 */ 11503 public $defs; 11504 11505 /** 11506 * @param HTMLPurifier_AttrDef[] $defs List of HTMLPurifier_AttrDef objects 11507 */ 11508 public function __construct($defs) 11509 { 11510 $this->defs = $defs; 11511 } 11512 11513 /** 11514 * @param string $string 11515 * @param HTMLPurifier_Config $config 11516 * @param HTMLPurifier_Context $context 11517 * @return bool|string 11518 */ 11519 public function validate($string, $config, $context) 11520 { 11521 foreach ($this->defs as $i => $def) { 11522 $result = $this->defs[$i]->validate($string, $config, $context); 11523 if ($result !== false) { 11524 return $result; 11525 } 11526 } 11527 return false; 11528 } 11529} 11530 11531 11532 11533 11534 11535/** 11536 * Decorator which enables CSS properties to be disabled for specific elements. 
11537 */ 11538class HTMLPurifier_AttrDef_CSS_DenyElementDecorator extends HTMLPurifier_AttrDef 11539{ 11540 /** 11541 * @type HTMLPurifier_AttrDef 11542 */ 11543 public $def; 11544 /** 11545 * @type string 11546 */ 11547 public $element; 11548 11549 /** 11550 * @param HTMLPurifier_AttrDef $def Definition to wrap 11551 * @param string $element Element to deny 11552 */ 11553 public function __construct($def, $element) 11554 { 11555 $this->def = $def; 11556 $this->element = $element; 11557 } 11558 11559 /** 11560 * Checks if CurrentToken is set and equal to $this->element 11561 * @param string $string 11562 * @param HTMLPurifier_Config $config 11563 * @param HTMLPurifier_Context $context 11564 * @return bool|string 11565 */ 11566 public function validate($string, $config, $context) 11567 { 11568 $token = $context->get('CurrentToken', true); 11569 if ($token && $token->name == $this->element) { 11570 return false; 11571 } 11572 return $this->def->validate($string, $config, $context); 11573 } 11574} 11575 11576 11577 11578 11579 11580/** 11581 * Microsoft's proprietary filter: CSS property 11582 * @note Currently supports the alpha filter. 
In the future, this will 11583 * probably need an extensible framework 11584 */ 11585class HTMLPurifier_AttrDef_CSS_Filter extends HTMLPurifier_AttrDef 11586{ 11587 /** 11588 * @type HTMLPurifier_AttrDef_Integer 11589 */ 11590 protected $intValidator; 11591 11592 public function __construct() 11593 { 11594 $this->intValidator = new HTMLPurifier_AttrDef_Integer(); 11595 } 11596 11597 /** 11598 * @param string $value 11599 * @param HTMLPurifier_Config $config 11600 * @param HTMLPurifier_Context $context 11601 * @return bool|string 11602 */ 11603 public function validate($value, $config, $context) 11604 { 11605 $value = $this->parseCDATA($value); 11606 if ($value === 'none') { 11607 return $value; 11608 } 11609 // if we looped this we could support multiple filters 11610 $function_length = strcspn($value, '('); 11611 $function = trim(substr($value, 0, $function_length)); 11612 if ($function !== 'alpha' && 11613 $function !== 'Alpha' && 11614 $function !== 'progid:DXImageTransform.Microsoft.Alpha' 11615 ) { 11616 return false; 11617 } 11618 $cursor = $function_length + 1; 11619 $parameters_length = strcspn($value, ')', $cursor); 11620 $parameters = substr($value, $cursor, $parameters_length); 11621 $params = explode(',', $parameters); 11622 $ret_params = array(); 11623 $lookup = array(); 11624 foreach ($params as $param) { 11625 list($key, $value) = explode('=', $param); 11626 $key = trim($key); 11627 $value = trim($value); 11628 if (isset($lookup[$key])) { 11629 continue; 11630 } 11631 if ($key !== 'opacity') { 11632 continue; 11633 } 11634 $value = $this->intValidator->validate($value, $config, $context); 11635 if ($value === false) { 11636 continue; 11637 } 11638 $int = (int)$value; 11639 if ($int > 100) { 11640 $value = '100'; 11641 } 11642 if ($int < 0) { 11643 $value = '0'; 11644 } 11645 $ret_params[] = "$key=$value"; 11646 $lookup[$key] = true; 11647 } 11648 $ret_parameters = implode(',', $ret_params); 11649 $ret_function = "$function($ret_parameters)"; 11650 
return $ret_function; 11651 } 11652} 11653 11654 11655 11656 11657 11658/** 11659 * Validates shorthand CSS property font. 11660 */ 11661class HTMLPurifier_AttrDef_CSS_Font extends HTMLPurifier_AttrDef 11662{ 11663 11664 /** 11665 * Local copy of validators 11666 * @type HTMLPurifier_AttrDef[] 11667 * @note If we moved specific CSS property definitions to their own 11668 * classes instead of having them be assembled at run time by 11669 * CSSDefinition, this wouldn't be necessary. We'd instantiate 11670 * our own copies. 11671 */ 11672 protected $info = array(); 11673 11674 /** 11675 * @param HTMLPurifier_Config $config 11676 */ 11677 public function __construct($config) 11678 { 11679 $def = $config->getCSSDefinition(); 11680 $this->info['font-style'] = $def->info['font-style']; 11681 $this->info['font-variant'] = $def->info['font-variant']; 11682 $this->info['font-weight'] = $def->info['font-weight']; 11683 $this->info['font-size'] = $def->info['font-size']; 11684 $this->info['line-height'] = $def->info['line-height']; 11685 $this->info['font-family'] = $def->info['font-family']; 11686 } 11687 11688 /** 11689 * @param string $string 11690 * @param HTMLPurifier_Config $config 11691 * @param HTMLPurifier_Context $context 11692 * @return bool|string 11693 */ 11694 public function validate($string, $config, $context) 11695 { 11696 static $system_fonts = array( 11697 'caption' => true, 11698 'icon' => true, 11699 'menu' => true, 11700 'message-box' => true, 11701 'small-caption' => true, 11702 'status-bar' => true 11703 ); 11704 11705 // regular pre-processing 11706 $string = $this->parseCDATA($string); 11707 if ($string === '') { 11708 return false; 11709 } 11710 11711 // check if it's one of the keywords 11712 $lowercase_string = strtolower($string); 11713 if (isset($system_fonts[$lowercase_string])) { 11714 return $lowercase_string; 11715 } 11716 11717 $bits = explode(' ', $string); // bits to process 11718 $stage = 0; // this indicates what we're looking for 11719 
$caught = array(); // which stage 0 properties have we caught? 11720 $stage_1 = array('font-style', 'font-variant', 'font-weight'); 11721 $final = ''; // output 11722 11723 for ($i = 0, $size = count($bits); $i < $size; $i++) { 11724 if ($bits[$i] === '') { 11725 continue; 11726 } 11727 switch ($stage) { 11728 case 0: // attempting to catch font-style, font-variant or font-weight 11729 foreach ($stage_1 as $validator_name) { 11730 if (isset($caught[$validator_name])) { 11731 continue; 11732 } 11733 $r = $this->info[$validator_name]->validate( 11734 $bits[$i], 11735 $config, 11736 $context 11737 ); 11738 if ($r !== false) { 11739 $final .= $r . ' '; 11740 $caught[$validator_name] = true; 11741 break; 11742 } 11743 } 11744 // all three caught, continue on 11745 if (count($caught) >= 3) { 11746 $stage = 1; 11747 } 11748 if ($r !== false) { 11749 break; 11750 } 11751 case 1: // attempting to catch font-size and perhaps line-height 11752 $found_slash = false; 11753 if (strpos($bits[$i], '/') !== false) { 11754 list($font_size, $line_height) = 11755 explode('/', $bits[$i]); 11756 if ($line_height === '') { 11757 // ooh, there's a space after the slash! 
11758 $line_height = false; 11759 $found_slash = true; 11760 } 11761 } else { 11762 $font_size = $bits[$i]; 11763 $line_height = false; 11764 } 11765 $r = $this->info['font-size']->validate( 11766 $font_size, 11767 $config, 11768 $context 11769 ); 11770 if ($r !== false) { 11771 $final .= $r; 11772 // attempt to catch line-height 11773 if ($line_height === false) { 11774 // we need to scroll forward 11775 for ($j = $i + 1; $j < $size; $j++) { 11776 if ($bits[$j] === '') { 11777 continue; 11778 } 11779 if ($bits[$j] === '/') { 11780 if ($found_slash) { 11781 return false; 11782 } else { 11783 $found_slash = true; 11784 continue; 11785 } 11786 } 11787 $line_height = $bits[$j]; 11788 break; 11789 } 11790 } else { 11791 // slash already found 11792 $found_slash = true; 11793 $j = $i; 11794 } 11795 if ($found_slash) { 11796 $i = $j; 11797 $r = $this->info['line-height']->validate( 11798 $line_height, 11799 $config, 11800 $context 11801 ); 11802 if ($r !== false) { 11803 $final .= '/' . $r; 11804 } 11805 } 11806 $final .= ' '; 11807 $stage = 2; 11808 break; 11809 } 11810 return false; 11811 case 2: // attempting to catch font-family 11812 $font_family = 11813 implode(' ', array_slice($bits, $i, $size - $i)); 11814 $r = $this->info['font-family']->validate( 11815 $font_family, 11816 $config, 11817 $context 11818 ); 11819 if ($r !== false) { 11820 $final .= $r . 
' '; 11821 // processing completed successfully 11822 return rtrim($final); 11823 } 11824 return false; 11825 } 11826 } 11827 return false; 11828 } 11829} 11830 11831 11832 11833 11834 11835/** 11836 * Validates a font family list according to CSS spec 11837 */ 11838class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef 11839{ 11840 11841 protected $mask = null; 11842 11843 public function __construct() 11844 { 11845 $this->mask = '_- '; 11846 for ($c = 'a'; $c <= 'z'; $c++) { 11847 $this->mask .= $c; 11848 } 11849 for ($c = 'A'; $c <= 'Z'; $c++) { 11850 $this->mask .= $c; 11851 } 11852 for ($c = '0'; $c <= '9'; $c++) { 11853 $this->mask .= $c; 11854 } // cast-y, but should be fine 11855 // special bytes used by UTF-8 11856 for ($i = 0x80; $i <= 0xFF; $i++) { 11857 // We don't bother excluding invalid bytes in this range, 11858 // because the our restriction of well-formed UTF-8 will 11859 // prevent these from ever occurring. 11860 $this->mask .= chr($i); 11861 } 11862 11863 /* 11864 PHP's internal strcspn implementation is 11865 O(length of string * length of mask), making it inefficient 11866 for large masks. However, it's still faster than 11867 preg_match 8) 11868 for (p = s1;;) { 11869 spanp = s2; 11870 do { 11871 if (*spanp == c || p == s1_end) { 11872 return p - s1; 11873 } 11874 } while (spanp++ < (s2_end - 1)); 11875 c = *++p; 11876 } 11877 */ 11878 // possible optimization: invert the mask. 
11879 } 11880 11881 /** 11882 * @param string $string 11883 * @param HTMLPurifier_Config $config 11884 * @param HTMLPurifier_Context $context 11885 * @return bool|string 11886 */ 11887 public function validate($string, $config, $context) 11888 { 11889 static $generic_names = array( 11890 'serif' => true, 11891 'sans-serif' => true, 11892 'monospace' => true, 11893 'fantasy' => true, 11894 'cursive' => true 11895 ); 11896 $allowed_fonts = $config->get('CSS.AllowedFonts'); 11897 11898 // assume that no font names contain commas in them 11899 $fonts = explode(',', $string); 11900 $final = ''; 11901 foreach ($fonts as $font) { 11902 $font = trim($font); 11903 if ($font === '') { 11904 continue; 11905 } 11906 // match a generic name 11907 if (isset($generic_names[$font])) { 11908 if ($allowed_fonts === null || isset($allowed_fonts[$font])) { 11909 $final .= $font . ', '; 11910 } 11911 continue; 11912 } 11913 // match a quoted name 11914 if ($font[0] === '"' || $font[0] === "'") { 11915 $length = strlen($font); 11916 if ($length <= 2) { 11917 continue; 11918 } 11919 $quote = $font[0]; 11920 if ($font[$length - 1] !== $quote) { 11921 continue; 11922 } 11923 $font = substr($font, 1, $length - 2); 11924 } 11925 11926 $font = $this->expandCSSEscape($font); 11927 11928 // $font is a pure representation of the font name 11929 11930 if ($allowed_fonts !== null && !isset($allowed_fonts[$font])) { 11931 continue; 11932 } 11933 11934 if (ctype_alnum($font) && $font !== '') { 11935 // very simple font, allow it in unharmed 11936 $final .= $font . ', '; 11937 continue; 11938 } 11939 11940 // bugger out on whitespace. form feed (0C) really 11941 // shouldn't show up regardless 11942 $font = str_replace(array("\n", "\t", "\r", "\x0C"), ' ', $font); 11943 11944 // Here, there are various classes of characters which need 11945 // to be treated differently: 11946 // - Alphanumeric characters are essentially safe. We 11947 // handled these above. 
11948 // - Spaces require quoting, though most parsers will do 11949 // the right thing if there aren't any characters that 11950 // can be misinterpreted 11951 // - Dashes rarely occur, but they fairly unproblematic 11952 // for parsing/rendering purposes. 11953 // The above characters cover the majority of Western font 11954 // names. 11955 // - Arbitrary Unicode characters not in ASCII. Because 11956 // most parsers give little thought to Unicode, treatment 11957 // of these codepoints is basically uniform, even for 11958 // punctuation-like codepoints. These characters can 11959 // show up in non-Western pages and are supported by most 11960 // major browsers, for example: "MS 明朝" is a 11961 // legitimate font-name 11962 // <http://ja.wikipedia.org/wiki/MS_明朝>. See 11963 // the CSS3 spec for more examples: 11964 // <http://www.w3.org/TR/2011/WD-css3-fonts-20110324/localizedfamilynames.png> 11965 // You can see live samples of these on the Internet: 11966 // <http://www.google.co.jp/search?q=font-family+MS+明朝|ゴシック> 11967 // However, most of these fonts have ASCII equivalents: 11968 // for example, 'MS Mincho', and it's considered 11969 // professional to use ASCII font names instead of 11970 // Unicode font names. Thanks Takeshi Terada for 11971 // providing this information. 11972 // The following characters, to my knowledge, have not been 11973 // used to name font names. 11974 // - Single quote. While theoretically you might find a 11975 // font name that has a single quote in its name (serving 11976 // as an apostrophe, e.g. Dave's Scribble), I haven't 11977 // been able to find any actual examples of this. 11978 // Internet Explorer's cssText translation (which I 11979 // believe is invoked by innerHTML) normalizes any 11980 // quoting to single quotes, and fails to escape single 11981 // quotes. (Note that this is not IE's behavior for all 11982 // CSS properties, just some sort of special casing for 11983 // font-family). 
So a single quote *cannot* be used 11984 // safely in the font-family context if there will be an 11985 // innerHTML/cssText translation. Note that Firefox 3.x 11986 // does this too. 11987 // - Double quote. In IE, these get normalized to 11988 // single-quotes, no matter what the encoding. (Fun 11989 // fact, in IE8, the 'content' CSS property gained 11990 // support, where they special cased to preserve encoded 11991 // double quotes, but still translate unadorned double 11992 // quotes into single quotes.) So, because their 11993 // fixpoint behavior is identical to single quotes, they 11994 // cannot be allowed either. Firefox 3.x displays 11995 // single-quote style behavior. 11996 // - Backslashes are reduced by one (so \\ -> \) every 11997 // iteration, so they cannot be used safely. This shows 11998 // up in IE7, IE8 and FF3 11999 // - Semicolons, commas and backticks are handled properly. 12000 // - The rest of the ASCII punctuation is handled properly. 12001 // We haven't checked what browsers do to unadorned 12002 // versions, but this is not important as long as the 12003 // browser doesn't /remove/ surrounding quotes (as IE does 12004 // for HTML). 12005 // 12006 // With these results in hand, we conclude that there are 12007 // various levels of safety: 12008 // - Paranoid: alphanumeric, spaces and dashes(?) 12009 // - International: Paranoid + non-ASCII Unicode 12010 // - Edgy: Everything except quotes, backslashes 12011 // - NoJS: Standards compliance, e.g. sod IE. Note that 12012 // with some judicious character escaping (since certain 12013 // types of escaping doesn't work) this is theoretically 12014 // OK as long as innerHTML/cssText is not called. 12015 // We believe that international is a reasonable default 12016 // (that we will implement now), and once we do more 12017 // extensive research, we may feel comfortable with dropping 12018 // it down to edgy. 12019 12020 // Edgy: alphanumeric, spaces, dashes, underscores and Unicode. 
Use of 12021 // str(c)spn assumes that the string was already well formed 12022 // Unicode (which of course it is). 12023 if (strspn($font, $this->mask) !== strlen($font)) { 12024 continue; 12025 } 12026 12027 // Historical: 12028 // In the absence of innerHTML/cssText, these ugly 12029 // transforms don't pose a security risk (as \\ and \" 12030 // might--these escapes are not supported by most browsers). 12031 // We could try to be clever and use single-quote wrapping 12032 // when there is a double quote present, but I have choosen 12033 // not to implement that. (NOTE: you can reduce the amount 12034 // of escapes by one depending on what quoting style you use) 12035 // $font = str_replace('\\', '\\5C ', $font); 12036 // $font = str_replace('"', '\\22 ', $font); 12037 // $font = str_replace("'", '\\27 ', $font); 12038 12039 // font possibly with spaces, requires quoting 12040 $final .= "'$font', "; 12041 } 12042 $final = rtrim($final, ', '); 12043 if ($final === '') { 12044 return false; 12045 } 12046 return $final; 12047 } 12048 12049} 12050 12051 12052 12053 12054 12055/** 12056 * Validates based on {ident} CSS grammar production 12057 */ 12058class HTMLPurifier_AttrDef_CSS_Ident extends HTMLPurifier_AttrDef 12059{ 12060 12061 /** 12062 * @param string $string 12063 * @param HTMLPurifier_Config $config 12064 * @param HTMLPurifier_Context $context 12065 * @return bool|string 12066 */ 12067 public function validate($string, $config, $context) 12068 { 12069 $string = trim($string); 12070 12071 // early abort: '' and '0' (strings that convert to false) are invalid 12072 if (!$string) { 12073 return false; 12074 } 12075 12076 $pattern = '/^(-?[A-Za-z_][A-Za-z_\-0-9]*)$/'; 12077 if (!preg_match($pattern, $string)) { 12078 return false; 12079 } 12080 return $string; 12081 } 12082} 12083 12084 12085 12086 12087 12088/** 12089 * Decorator which enables !important to be used in CSS values. 
12090 */ 12091class HTMLPurifier_AttrDef_CSS_ImportantDecorator extends HTMLPurifier_AttrDef 12092{ 12093 /** 12094 * @type HTMLPurifier_AttrDef 12095 */ 12096 public $def; 12097 /** 12098 * @type bool 12099 */ 12100 public $allow; 12101 12102 /** 12103 * @param HTMLPurifier_AttrDef $def Definition to wrap 12104 * @param bool $allow Whether or not to allow !important 12105 */ 12106 public function __construct($def, $allow = false) 12107 { 12108 $this->def = $def; 12109 $this->allow = $allow; 12110 } 12111 12112 /** 12113 * Intercepts and removes !important if necessary 12114 * @param string $string 12115 * @param HTMLPurifier_Config $config 12116 * @param HTMLPurifier_Context $context 12117 * @return bool|string 12118 */ 12119 public function validate($string, $config, $context) 12120 { 12121 // test for ! and important tokens 12122 $string = trim($string); 12123 $is_important = false; 12124 // :TODO: optimization: test directly for !important and ! important 12125 if (strlen($string) >= 9 && substr($string, -9) === 'important') { 12126 $temp = rtrim(substr($string, 0, -9)); 12127 // use a temp, because we might want to restore important 12128 if (strlen($temp) >= 1 && substr($temp, -1) === '!') { 12129 $string = rtrim(substr($temp, 0, -1)); 12130 $is_important = true; 12131 } 12132 } 12133 $string = $this->def->validate($string, $config, $context); 12134 if ($this->allow && $is_important) { 12135 $string .= ' !important'; 12136 } 12137 return $string; 12138 } 12139} 12140 12141 12142 12143 12144 12145/** 12146 * Represents a Length as defined by CSS. 12147 */ 12148class HTMLPurifier_AttrDef_CSS_Length extends HTMLPurifier_AttrDef 12149{ 12150 12151 /** 12152 * @type HTMLPurifier_Length|string 12153 */ 12154 protected $min; 12155 12156 /** 12157 * @type HTMLPurifier_Length|string 12158 */ 12159 protected $max; 12160 12161 /** 12162 * @param HTMLPurifier_Length|string $min Minimum length, or null for no bound. String is also acceptable. 
12163 * @param HTMLPurifier_Length|string $max Maximum length, or null for no bound. String is also acceptable. 12164 */ 12165 public function __construct($min = null, $max = null) 12166 { 12167 $this->min = $min !== null ? HTMLPurifier_Length::make($min) : null; 12168 $this->max = $max !== null ? HTMLPurifier_Length::make($max) : null; 12169 } 12170 12171 /** 12172 * @param string $string 12173 * @param HTMLPurifier_Config $config 12174 * @param HTMLPurifier_Context $context 12175 * @return bool|string 12176 */ 12177 public function validate($string, $config, $context) 12178 { 12179 $string = $this->parseCDATA($string); 12180 12181 // Optimizations 12182 if ($string === '') { 12183 return false; 12184 } 12185 if ($string === '0') { 12186 return '0'; 12187 } 12188 if (strlen($string) === 1) { 12189 return false; 12190 } 12191 12192 $length = HTMLPurifier_Length::make($string); 12193 if (!$length->isValid()) { 12194 return false; 12195 } 12196 12197 if ($this->min) { 12198 $c = $length->compareTo($this->min); 12199 if ($c === false) { 12200 return false; 12201 } 12202 if ($c < 0) { 12203 return false; 12204 } 12205 } 12206 if ($this->max) { 12207 $c = $length->compareTo($this->max); 12208 if ($c === false) { 12209 return false; 12210 } 12211 if ($c > 0) { 12212 return false; 12213 } 12214 } 12215 return $length->toString(); 12216 } 12217} 12218 12219 12220 12221 12222 12223/** 12224 * Validates shorthand CSS property list-style. 12225 * @warning Does not support url tokens that have internal spaces. 12226 */ 12227class HTMLPurifier_AttrDef_CSS_ListStyle extends HTMLPurifier_AttrDef 12228{ 12229 12230 /** 12231 * Local copy of validators. 12232 * @type HTMLPurifier_AttrDef[] 12233 * @note See HTMLPurifier_AttrDef_CSS_Font::$info for a similar impl. 
12234 */ 12235 protected $info; 12236 12237 /** 12238 * @param HTMLPurifier_Config $config 12239 */ 12240 public function __construct($config) 12241 { 12242 $def = $config->getCSSDefinition(); 12243 $this->info['list-style-type'] = $def->info['list-style-type']; 12244 $this->info['list-style-position'] = $def->info['list-style-position']; 12245 $this->info['list-style-image'] = $def->info['list-style-image']; 12246 } 12247 12248 /** 12249 * @param string $string 12250 * @param HTMLPurifier_Config $config 12251 * @param HTMLPurifier_Context $context 12252 * @return bool|string 12253 */ 12254 public function validate($string, $config, $context) 12255 { 12256 // regular pre-processing 12257 $string = $this->parseCDATA($string); 12258 if ($string === '') { 12259 return false; 12260 } 12261 12262 // assumes URI doesn't have spaces in it 12263 $bits = explode(' ', strtolower($string)); // bits to process 12264 12265 $caught = array(); 12266 $caught['type'] = false; 12267 $caught['position'] = false; 12268 $caught['image'] = false; 12269 12270 $i = 0; // number of catches 12271 $none = false; 12272 12273 foreach ($bits as $bit) { 12274 if ($i >= 3) { 12275 return; 12276 } // optimization bit 12277 if ($bit === '') { 12278 continue; 12279 } 12280 foreach ($caught as $key => $status) { 12281 if ($status !== false) { 12282 continue; 12283 } 12284 $r = $this->info['list-style-' . 
/**
 * Framework class for strings that involve multiple values.
 *
 * Certain CSS properties such as border-width and margin allow multiple
 * lengths to be specified. This class takes the definition of a single
 * value and applies it to each space-separated value, usually up to four.
 *
 * @note Even though the CSS specification isn't clear about it, inherit
 *       can only be used alone: it will never manifest as part of a multi
 *       shorthand declaration. Thus, this class does not allow inherit.
 */
class HTMLPurifier_AttrDef_CSS_Multiple extends HTMLPurifier_AttrDef
{
    /**
     * Definition every individual value is validated against.
     * @type HTMLPurifier_AttrDef
     * @todo Make protected
     */
    public $single;

    /**
     * Upper bound on how many validated values are kept.
     * @todo Make protected
     */
    public $max;

    /**
     * @param HTMLPurifier_AttrDef $single Definition to apply to each value
     * @param int $max Max number of values allowed (usually four)
     */
    public function __construct($single, $max = 4)
    {
        $this->max = $max;
        $this->single = $single;
    }

    /**
     * Validates each space-separated piece with $single and joins the
     * survivors with single spaces; stops once $max pieces were accepted.
     *
     * @param string $string
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool|string false when no piece validated
     */
    public function validate($string, $config, $context)
    {
        $cleaned = $this->mungeRgb($this->parseCDATA($string));
        if ($cleaned === '') {
            return false;
        }

        $accepted = array();
        // splitting on ' ' only: per the original note, parseCDATA has
        // already replaced \r, \t and \n
        foreach (explode(' ', $cleaned) as $piece) {
            if (count($accepted) >= $this->max) {
                break;
            }
            if (ctype_space($piece)) {
                continue;
            }
            $valid = $this->single->validate($piece, $config, $context);
            if ($valid !== false) {
                $accepted[] = $valid;
            }
        }

        if (!$accepted) {
            return false;
        }
        return rtrim(implode(' ', $accepted));
    }
}

/**
 * Validates a Percentage as defined by the CSS spec.
 */
class HTMLPurifier_AttrDef_CSS_Percentage extends HTMLPurifier_AttrDef
{

    /**
     * Validator the numeric part (everything before '%') is handed to.
     * @type HTMLPurifier_AttrDef_CSS_Number
     */
    protected $number_def;

    /**
     * @param bool $non_negative Whether to forbid negative values
     */
    public function __construct($non_negative = false)
    {
        $this->number_def = new HTMLPurifier_AttrDef_CSS_Number($non_negative);
    }

    /**
     * Accepts "<number>%" and returns it with the number part normalized by
     * the number validator.
     *
     * @param string $string
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool|string
     */
    public function validate($string, $config, $context)
    {
        $text = $this->parseCDATA($string);

        // there must be at least one character in front of a trailing '%'
        if (strlen($text) < 2 || substr($text, -1) !== '%') {
            return false;
        }

        $digits = substr($text, 0, -1);
        $valid = $this->number_def->validate($digits, $config, $context);

        return ($valid === false) ? false : $valid . '%';
    }
}
/**
 * Validates the value for the CSS property text-decoration.
 * @note This class could be generalized into a version that acts sort of
 *       like Enum except you can compound the allowed values.
 */
class HTMLPurifier_AttrDef_CSS_TextDecoration extends HTMLPurifier_AttrDef
{

    /**
     * Returns 'none' unchanged when it is the whole value; otherwise keeps
     * only the recognised decoration keywords, in input order, joined by
     * single spaces.
     *
     * @param string $string
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool|string false when no recognised keyword remains
     */
    public function validate($string, $config, $context)
    {
        static $keywords = array('line-through', 'overline', 'underline');

        $value = strtolower($this->parseCDATA($string));
        if ($value === 'none') {
            return $value;
        }

        $kept = array();
        foreach (explode(' ', $value) as $word) {
            if (in_array($word, $keywords, true)) {
                $kept[] = $word;
            }
        }

        return $kept ? implode(' ', $kept) : false;
    }
}
/**
 * Validates a URI in CSS syntax, which uses url('http://example.com')
 * @note While theoretically speaking a URI in a CSS document could
 *       be non-embedded, as of CSS2 there is no such usage so we're
 *       generalizing it. This may need to be changed in the future.
 * @warning Since HTMLPurifier_AttrDef_CSS blindly uses semicolons as
 *          the separator, you cannot put a literal semicolon in
 *          the URI. Try percent encoding it, in that case.
 */
class HTMLPurifier_AttrDef_CSS_URI extends HTMLPurifier_AttrDef_URI
{

    public function __construct()
    {
        parent::__construct(true); // always embedded
    }

    /**
     * Extracts the URI from a url(...) wrapper, validates it with the
     * parent URI definition and re-wraps it as url("...").
     *
     * @param string $uri_string
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool|string
     */
    public function validate($uri_string, $config, $context)
    {
        // parse the URI out of the string and then pass it onto
        // the parent object

        $uri_string = $this->parseCDATA($uri_string);
        if (strpos($uri_string, 'url(') !== 0) {
            return false;
        }
        $uri_string = substr($uri_string, 4);
        if (strlen($uri_string) == 0) {
            return false;
        }
        $new_length = strlen($uri_string) - 1;
        if ($uri_string[$new_length] != ')') {
            return false;
        }
        $uri = trim(substr($uri_string, 0, $new_length));

        if (!empty($uri) && ($uri[0] == "'" || $uri[0] == '"')) {
            $quote = $uri[0];
            $new_length = strlen($uri) - 1;
            // $new_length < 1 means the value is a lone quote character,
            // which must not be accepted as its own closing quote
            if ($new_length < 1 || $uri[$new_length] !== $quote) {
                return false;
            }
            $uri = substr($uri, 1, $new_length - 1);
        }

        $uri = $this->expandCSSEscape($uri);

        $result = parent::validate($uri, $config, $context);

        if ($result === false) {
            return false;
        }

        // extra sanity check; should have been done by URI
        $result = str_replace(array('"', "\\", "\n", "\x0c", "\r"), "", $result);

        // suspicious characters are ()'; we're going to percent encode
        // them for safety.
        $result = str_replace(array('(', ')', "'"), array('%28', '%29', '%27'), $result);

        // there's an extra bug where ampersands lose their escaping on
        // an innerHTML cycle, so a very unlucky query parameter could
        // then change the meaning of the URL.  Unfortunately, there's
        // not much we can do about that...
        return "url(\"$result\")";
    }
}

/**
 * Validates a boolean attribute
 */
class HTMLPurifier_AttrDef_HTML_Bool extends HTMLPurifier_AttrDef
{

    /**
     * Value returned by validate(): the attribute name when created through
     * make(), or false when constructed without arguments.
     * @type string|bool
     */
    protected $name;

    /**
     * @type bool
     */
    public $minimized = true;

    /**
     * @param string|bool $name Attribute name, or false
     */
    public function __construct($name = false)
    {
        $this->name = $name;
    }

    /**
     * Ignores the supplied value and always returns the configured name.
     *
     * @param string $string
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool|string
     */
    public function validate($string, $config, $context)
    {
        return $this->name;
    }

    /**
     * @param string $string Name of attribute
     * @return HTMLPurifier_AttrDef_HTML_Bool
     */
    public function make($string)
    {
        return new HTMLPurifier_AttrDef_HTML_Bool($string);
    }
}

/**
 * Validates contents based on NMTOKENS attribute type.
 */
class HTMLPurifier_AttrDef_HTML_Nmtokens extends HTMLPurifier_AttrDef
{

    /**
     * Splits the value into tokens, filters them and re-joins the survivors
     * with single spaces.
     *
     * @param string $string
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool|string false when no token survives
     */
    public function validate($string, $config, $context)
    {
        $string = trim($string);

        // early abort: '' and '0' (strings that convert to false) are invalid
        if (!$string) {
            return false;
        }

        $tokens = $this->split($string, $config, $context);
        $tokens = $this->filter($tokens, $config, $context);
        if (empty($tokens)) {
            return false;
        }
        return implode(' ', $tokens);
    }

    /**
     * Splits a space separated list of tokens into its constituent parts.
     * Only whitespace-delimited runs matching the name pattern below are
     * returned; anything else is dropped.
     *
     * @param string $string
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return array
     */
    protected function split($string, $config, $context)
    {
        // OPTIMIZABLE!
        // do the preg_match, capture all subpatterns for reformulation

        // we don't support U+00A1 and up codepoints or
        // escaping because I don't know how to do that with regexps
        // and plus it would complicate optimization efforts (you never
        // see that anyway).
        $pattern = '/(?:(?<=\s)|\A)' . // look behind for space or string start
            '((?:--|-?[A-Za-z_])[A-Za-z_\-0-9]*)' .
            '(?:(?=\s)|\z)/'; // look ahead for space or string end
        preg_match_all($pattern, $string, $matches);
        return $matches[1];
    }

    /**
     * Template method for removing certain tokens based on arbitrary criteria.
     * The base implementation keeps every token.
     *
     * @note If we wanted to be really functional, we'd do an array_filter
     *       with a callback. But... we're not.
     * @param array $tokens
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return array
     */
    protected function filter($tokens, $config, $context)
    {
        return $tokens;
    }
}

/**
 * Implements special behavior for class attribute (normally NMTOKENS)
 */
class HTMLPurifier_AttrDef_HTML_Class extends HTMLPurifier_AttrDef_HTML_Nmtokens
{
    /**
     * Uses the strict NMTOKENS split for the "XHTML 1.1" and "XHTML 2.0"
     * doctypes and a plain whitespace split for every other doctype.
     *
     * @param string $string
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return array
     */
    protected function split($string, $config, $context)
    {
        // really, this twiddle should be lazy loaded
        $name = $config->getDefinition('HTML')->doctype->name;
        if ($name == "XHTML 1.1" || $name == "XHTML 2.0") {
            return parent::split($string, $config, $context);
        } else {
            return preg_split('/\s+/', $string);
        }
    }

    /**
     * Keeps tokens permitted by Attr.AllowedClasses (null allows all), not
     * listed in Attr.ForbiddenClasses, and not already kept.
     *
     * @param array $tokens
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return array
     */
    protected function filter($tokens, $config, $context)
    {
        $allowed = $config->get('Attr.AllowedClasses');
        $forbidden = $config->get('Attr.ForbiddenClasses');
        $ret = array();
        foreach ($tokens as $token) {
            if (($allowed === null || isset($allowed[$token])) &&
                !isset($forbidden[$token]) &&
                // We need this O(n) check because of PHP's array
                // implementation that casts -0 to 0.
                !in_array($token, $ret, true)
            ) {
                $ret[] = $token;
            }
        }
        return $ret;
    }
}
/**
 * Validates a color according to the HTML spec.
 */
class HTMLPurifier_AttrDef_HTML_Color extends HTMLPurifier_AttrDef
{

    /**
     * Maps a configured color keyword to its stored value, or normalizes a
     * 3- or 6-digit hex color (with or without '#') to "#rrggbb" form.
     *
     * @param string $string
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool|string
     */
    public function validate($string, $config, $context)
    {
        // Read from the config on every call: a function-static cache
        // would keep serving the first config's keywords to later configs.
        $colors = $config->get('Core.ColorKeywords');

        $string = trim($string);

        if (empty($string)) {
            return false;
        }
        $lower = strtolower($string);
        if (isset($colors[$lower])) {
            return $colors[$lower];
        }
        if ($string[0] === '#') {
            $hex = substr($string, 1);
        } else {
            $hex = $string;
        }

        $length = strlen($hex);
        if ($length !== 3 && $length !== 6) {
            return false;
        }
        if (!ctype_xdigit($hex)) {
            return false;
        }
        if ($length === 3) {
            // expand shorthand: "abc" becomes "aabbcc"
            $hex = $hex[0] . $hex[0] . $hex[1] . $hex[1] . $hex[2] . $hex[2];
        }
        return "#$hex";
    }
}

/**
 * Special-case enum attribute definition that lazy loads allowed frame targets
 */
class HTMLPurifier_AttrDef_HTML_FrameTarget extends HTMLPurifier_AttrDef_Enum
{

    /**
     * Filled from Attr.AllowedFrameTargets on the first validate() call.
     * @type array
     */
    public $valid_values = false; // uninitialized value

    /**
     * @type bool
     */
    protected $case_sensitive = false;

    public function __construct()
    {
    }

    /**
     * @param string $string
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool|string
     */
    public function validate($string, $config, $context)
    {
        if ($this->valid_values === false) {
            $this->valid_values = $config->get('Attr.AllowedFrameTargets');
        }
        return parent::validate($string, $config, $context);
    }
}

/**
 * Validates the HTML attribute ID.
 * @warning Even though this is the id processor, it
 *          will ignore the directive Attr:IDBlacklist, since it will only
 *          go according to the ID accumulator. Since the accumulator is
 *          automatically generated, it will have already absorbed the
 *          blacklist. If you're hacking around, make sure you use load()!
 */

class HTMLPurifier_AttrDef_HTML_ID extends HTMLPurifier_AttrDef
{

    // selector is NOT a valid thing to use for IDREFs, because IDREFs
    // *must* target IDs that exist, whereas selector #ids do not.

    /**
     * Determines whether or not we're validating an ID in a CSS
     * selector context.
     * @type bool
     */
    protected $selector;

    /**
     * @param bool $selector
     */
    public function __construct($selector = false)
    {
        $this->selector = $selector;
    }

    /**
     * Trims the ID, applies the configured prefix, rejects IDs already in
     * the accumulator (outside selector context), checks the character set
     * and the blacklist regexp, then records the ID in the accumulator.
     *
     * @param string $id
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool|string The (possibly prefixed) ID, or false if invalid
     */
    public function validate($id, $config, $context)
    {
        if (!$this->selector && !$config->get('Attr.EnableID')) {
            return false;
        }

        $id = trim($id); // trim it first

        if ($id === '') {
            return false;
        }

        $prefix = $config->get('Attr.IDPrefix');
        if ($prefix !== '') {
            $prefix .= $config->get('Attr.IDPrefixLocal');
            // prevent re-appending the prefix
            if (strpos($id, $prefix) !== 0) {
                $id = $prefix . $id;
            }
        } elseif ($config->get('Attr.IDPrefixLocal') !== '') {
            trigger_error(
                '%Attr.IDPrefixLocal cannot be used unless ' .
                '%Attr.IDPrefix is set',
                E_USER_WARNING
            );
        }

        if (!$this->selector) {
            $id_accumulator =& $context->get('IDAccumulator');
            if (isset($id_accumulator->ids[$id])) {
                return false;
            }
        }

        if ($config->get('Attr.ID.HTML5') === true) {
            // HTML5 only forbids whitespace inside an ID; \r is included
            // because carriage return is also ASCII whitespace
            if (preg_match('/[\t\n\x0b\x0c\r ]/', $id)) {
                return false;
            }
        } else {
            // legacy rules are checked with ctype/trim rather than a regex,
            // hopefully this is faster
            if (ctype_alpha($id)) {
                // OK
            } else {
                // $id is non-empty here (empty values returned above), so
                // the first character can be read without suppression
                if (!ctype_alpha($id[0])) {
                    return false;
                }
                // primitive style of regexps, I suppose
                $trim = trim(
                    $id,
                    'A..Za..z0..9:-._'
                );
                if ($trim !== '') {
                    return false;
                }
            }
        }

        $regexp = $config->get('Attr.IDBlacklistRegexp');
        if ($regexp && preg_match($regexp, $id)) {
            return false;
        }

        if (!$this->selector) {
            $id_accumulator->add($id);
        }

        // if no change was made to the ID, return the result
        // else, return the new id if stripping whitespace made it
        // valid, or return false.
        return $id;
    }
}
/**
 * Validates an integer representation of pixels according to the HTML spec.
 */
class HTMLPurifier_AttrDef_HTML_Pixels extends HTMLPurifier_AttrDef
{

    /**
     * Optional upper bound; null means unbounded.
     * @type int
     */
    protected $max;

    /**
     * @param int $max
     */
    public function __construct($max = null)
    {
        $this->max = $max;
    }

    /**
     * Strips an optional trailing "px", requires the rest to be numeric and
     * returns it as an integer string clamped to the range 0..$max.
     *
     * @param string $string
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool|string
     */
    public function validate($string, $config, $context)
    {
        $value = trim($string);
        if ($value === '') {
            return false;
        }
        if ($value === '0') {
            return $value;
        }

        $len = strlen($value);
        $suffix = substr($value, $len - 2);
        if ($suffix == 'px') {
            $value = substr($value, 0, $len - 2);
        }
        if (!is_numeric($value)) {
            return false;
        }

        $pixels = (int)$value;
        if ($pixels < 0) {
            return '0';
        }

        // upper-bound value, extremely high values can
        // crash operating systems, see <http://ha.ckers.org/imagecrash.html>
        // WARNING, above link WILL crash you if you're using Windows
        $limit = $this->max;
        if ($limit !== null && $pixels > $limit) {
            return (string)$limit;
        }
        return (string)$pixels;
    }

    /**
     * Builds a new instance of the same class; an empty string means no
     * maximum, anything else is cast to int and used as the maximum.
     *
     * @param string $string
     * @return HTMLPurifier_AttrDef
     */
    public function make($string)
    {
        $max = ($string === '') ? null : (int)$string;
        $class = get_class($this);
        return new $class($max);
    }
}
/**
 * Validates the HTML type length (not to be confused with CSS's length).
 *
 * This accepts integer pixels or percentages as lengths for certain
 * HTML attributes.
 */

class HTMLPurifier_AttrDef_HTML_Length extends HTMLPurifier_AttrDef_HTML_Pixels
{

    /**
     * Tries the pixel validation first; otherwise accepts "<number>%" and
     * returns it as an integer percentage clamped to 0%..100%.
     *
     * @param string $string
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool|string
     */
    public function validate($string, $config, $context)
    {
        $value = trim($string);
        if ($value === '') {
            return false;
        }

        $as_pixels = parent::validate($value, $config, $context);
        if ($as_pixels !== false) {
            return $as_pixels;
        }

        if (substr($value, -1) !== '%') {
            return false;
        }

        $digits = substr($value, 0, strlen($value) - 1);
        if (!is_numeric($digits)) {
            return false;
        }

        $percent = max(0, min(100, (int)$digits));
        return $percent . '%';
    }
}

/**
 * Validates a rel/rev link attribute against a directive of allowed values
 * @note We cannot use Enum because link types allow multiple
 *       values.
 * @note Assumes link types are ASCII text
 */
class HTMLPurifier_AttrDef_HTML_LinkTypes extends HTMLPurifier_AttrDef
{

    /**
     * Name config attribute to pull ('AllowedRel' or 'AllowedRev').
     * @type string
     */
    protected $name;

    /**
     * @param string $name Attribute being validated: 'rel' or 'rev'.
     *                     Any other value raises an E_USER_ERROR.
     */
    public function __construct($name)
    {
        $directives = array(
            'rel' => 'AllowedRel',
            'rev' => 'AllowedRev'
        );
        if (isset($directives[$name])) {
            $this->name = $directives[$name];
            return;
        }
        trigger_error(
            'Unrecognized attribute name for link ' .
            'relationship.',
            E_USER_ERROR
        );
    }

    /**
     * Lowercases each space-separated link type and keeps those present in
     * the allowed lookup, dropping duplicates.
     *
     * @param string $string
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool|string false when nothing is allowed or nothing survives
     */
    public function validate($string, $config, $context)
    {
        $permitted = $config->get('Attr.' . $this->name);
        if (empty($permitted)) {
            return false;
        }

        // keyed by link type so repeated values collapse into one entry
        $seen = array();
        foreach (explode(' ', $this->parseCDATA($string)) as $token) {
            $token = strtolower(trim($token));
            if (isset($permitted[$token])) {
                $seen[$token] = true;
            }
        }

        if (empty($seen)) {
            return false;
        }
        return implode(' ', array_keys($seen));
    }
}
/**
 * Validates a MultiLength as defined by the HTML spec.
 *
 * A multilength is either a integer (pixel count), a percentage, or
 * a relative number.
 */
class HTMLPurifier_AttrDef_HTML_MultiLength extends HTMLPurifier_AttrDef_HTML_Length
{

    /**
     * Tries the Length validation first; otherwise accepts a relative
     * length of the form "<number>*" (or a bare "*").
     *
     * @param string $string
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool|string
     */
    public function validate($string, $config, $context)
    {
        $string = trim($string);
        if ($string === '') {
            return false;
        }

        $parent_result = parent::validate($string, $config, $context);
        if ($parent_result !== false) {
            return $parent_result;
        }

        $length = strlen($string);
        $last_char = $string[$length - 1];

        if ($last_char !== '*') {
            return false;
        }

        $int = substr($string, 0, $length - 1);

        if ($int == '') {
            return '*';
        }
        if (!is_numeric($int)) {
            return false;
        }

        $int = (int)$int;
        if ($int < 0) {
            return false;
        }
        if ($int == 0) {
            return '0';
        }
        if ($int == 1) {
            // "1*" is written in its short form
            return '*';
        }
        return ((string)$int) . '*';
    }
}

abstract class HTMLPurifier_AttrDef_URI_Email extends HTMLPurifier_AttrDef
{

    /**
     * Unpacks a mailbox into its display-name and address
     * @param string $string
     * @return mixed
     */
    public function unpack($string)
    {
        // needs to be implemented
    }

}

// sub-implementations

/**
 * Validates a host according to the IPv4, IPv6 and DNS (future) specifications.
 */
class HTMLPurifier_AttrDef_URI_Host extends HTMLPurifier_AttrDef
{

    /**
     * IPv4 sub-validator.
     * @type HTMLPurifier_AttrDef_URI_IPv4
     */
    protected $ipv4;

    /**
     * IPv6 sub-validator.
     * @type HTMLPurifier_AttrDef_URI_IPv6
     */
    protected $ipv6;

    public function __construct()
    {
        $this->ipv4 = new HTMLPurifier_AttrDef_URI_IPv4();
        $this->ipv6 = new HTMLPurifier_AttrDef_URI_IPv6();
    }

    /**
     * Accepts an empty host, a bracketed IPv6 literal, an IPv4 address or a
     * domain name. Names that fail the ASCII pattern are converted with
     * idn_to_ascii() or Net_IDNA2 when available and then checked again.
     *
     * @param string $string
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool|string
     */
    public function validate($string, $config, $context)
    {
        $length = strlen($string);
        // empty hostname is OK; it's usually semantically equivalent:
        // the default host as defined by a URI scheme is used:
        //
        //      If the URI scheme defines a default for host, then that
        //      default applies when the host subcomponent is undefined
        //      or when the registered name is empty (zero length).
        if ($string === '') {
            return '';
        }
        if ($length > 1 && $string[0] === '[' && $string[$length - 1] === ']') {
            //IPv6
            $ip = substr($string, 1, $length - 2);
            $valid = $this->ipv6->validate($ip, $config, $context);
            if ($valid === false) {
                return false;
            }
            return '[' . $valid . ']';
        }

        // need to do checks on unusual encodings too
        $ipv4 = $this->ipv4->validate($string, $config, $context);
        if ($ipv4 !== false) {
            return $ipv4;
        }

        // A regular domain name.

        // This doesn't match I18N domain names, but we don't have proper IRI support,
        // so force users to insert Punycode.

        // There is not a good sense in which underscores should be
        // allowed, since it's technically not! (And if you go as
        // far to allow everything as specified by the DNS spec...
        // well, that's literally everything, modulo some space limits
        // for the components and the overall name (which, by the way,
        // we are NOT checking!).  So we (arbitrarily) decide this:
        // let's allow underscores wherever we would have allowed
        // hyphens, if they are enabled.  This is a pretty good match
        // for browser behavior, for example, a large number of browsers
        // cannot handle foo_.example.com, but foo_bar.example.com is
        // fairly well supported.
        $underscore = $config->get('Core.AllowHostnameUnderscore') ? '_' : '';

        // Based off of RFC 1738, but amended so that
        // as per RFC 3696, the top label need only not be all numeric.
        // The productions describing this are:
        $an = '[a-z0-9]'; // alphanum
        $and = "[a-z0-9-$underscore]"; // alphanum | "-"
        // domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum
        $domainlabel = "$an(?:$and*$an)?";
        // AMENDED as per RFC 3696
        // toplabel    = alphanum | alphanum *( alphanum | "-" ) alphanum
        //      side condition: not all numeric
        $toplabel = "$an(?:$and*$an)?";
        // hostname    = *( domainlabel "." ) toplabel [ "." ]
        // The D modifier makes "$" match only at the very end, so a host
        // followed by a newline is no longer accepted.
        if (preg_match("/^(?:$domainlabel\.)*($toplabel)\.?$/iD", $string, $matches)) {
            if (!ctype_digit($matches[1])) {
                return $string;
            }
        }

        // PHP 5.3 and later support this functionality natively
        if (function_exists('idn_to_ascii')) {
            if (defined('IDNA_NONTRANSITIONAL_TO_ASCII') && defined('INTL_IDNA_VARIANT_UTS46')) {
                $string = idn_to_ascii($string, IDNA_NONTRANSITIONAL_TO_ASCII, INTL_IDNA_VARIANT_UTS46);
            } else {
                $string = idn_to_ascii($string);
            }
            // conversion failure: nothing left to re-check
            if ($string === false) {
                return false;
            }

            // If we have Net_IDNA2 support, we can support IRIs by
            // punycoding them. (This is the most portable thing to do,
            // since otherwise we have to assume browsers support
            // The class_exists() guard avoids a fatal error when the
            // directive is enabled but the package is not installed.
        } elseif ($config->get('Core.EnableIDNA') && class_exists('Net_IDNA2')) {
            $idna = new Net_IDNA2(array('encoding' => 'utf8', 'overlong' => false, 'strict' => true));
            // we need to encode each period separately
            $parts = explode('.', $string);
            try {
                $new_parts = array();
                foreach ($parts as $part) {
                    $encodable = false;
                    for ($i = 0, $c = strlen($part); $i < $c; $i++) {
                        if (ord($part[$i]) > 0x7a) {
                            $encodable = true;
                            break;
                        }
                    }
                    if (!$encodable) {
                        $new_parts[] = $part;
                    } else {
                        $new_parts[] = $idna->encode($part);
                    }
                }
                $string = implode('.', $new_parts);
            } catch (Exception $e) {
                // XXX error reporting
            }
        }
        // Try again
        if (preg_match("/^($domainlabel\.)*$toplabel\.?$/iD", $string)) {
            return $string;
        }
        return false;
    }
}

/**
 * Validates an IPv4 address
 * @author Feyd @ forums.devnetwork.net (public domain)
 */
class HTMLPurifier_AttrDef_URI_IPv4 extends HTMLPurifier_AttrDef
{

    /**
     * IPv4 regex, protected so that IPv6 can reuse it.
     * @type string
     */
    protected $ip4;

    /**
     * @param string $aIP
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool|string The address unchanged, or false if invalid
     */
    public function validate($aIP, $config, $context)
    {
        if (!$this->ip4) {
            $this->_loadRegex();
        }

        // D: "$" must not match in front of a trailing newline
        if (preg_match('#^' . $this->ip4 . '$#sD', $aIP)) {
            return $aIP;
        }
        return false;
    }

    /**
     * Lazy load function to prevent regex from being stuffed in
     * cache.
     */
    protected function _loadRegex()
    {
        $oct = '(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])'; // 0-255
        $this->ip4 = "(?:{$oct}\\.{$oct}\\.{$oct}\\.{$oct})";
    }
}

/**
 * Validates an IPv6 address.
 * @author Feyd @ forums.devnetwork.net (public domain)
 * @note This function requires brackets to have been removed from address
 *       in URI.
 */
class HTMLPurifier_AttrDef_URI_IPv6 extends HTMLPurifier_AttrDef_URI_IPv4
{

    /**
     * @param string $aIP
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool|string The address as supplied, or false if invalid
     */
    public function validate($aIP, $config, $context)
    {
        if (!$this->ip4) {
            $this->_loadRegex();
        }

        $original = $aIP;

        $pre = '(?:/(?:12[0-8]|1[0-1][0-9]|[1-9][0-9]|[0-9]))'; // /0 - /128

        // All "$" anchors below carry the D modifier so that they cannot
        // match in front of a trailing newline.

        //      prefix check
        if (strpos($aIP, '/') !== false) {
            if (preg_match('#' . $pre . '$#sD', $aIP, $find)) {
                $aIP = substr($aIP, 0, 0 - strlen($find[0]));
                unset($find);
            } else {
                return false;
            }
        }

        //      IPv4-compatibility check: a trailing dotted quad is rewritten
        //      as two hex groups so the checks below only see ':' pieces
        if (preg_match('#(?<=:' . ')' . $this->ip4 . '$#sD', $aIP, $find)) {
            $aIP = substr($aIP, 0, 0 - strlen($find[0]));
            $ip = explode('.', $find[0]);
            $ip = array_map('dechex', $ip);
            $aIP .= $ip[0] . $ip[1] . ':' . $ip[2] . $ip[3];
            unset($find, $ip);
        }

        //      compression check
        $aIP = explode('::', $aIP);
        $c = count($aIP);
        if ($c > 2) {
            return false;
        } elseif ($c == 2) {
            list($first, $second) = $aIP;
            $first = explode(':', $first);
            $second = explode(':', $second);

            if (count($first) + count($second) > 8) {
                return false;
            }

            // expand "::" by zero-filling to eight groups, then overwrite
            // the tail with the groups that followed the "::"
            while (count($first) < 8) {
                array_push($first, '0');
            }

            array_splice($first, 8 - count($second), 8, $second);
            $aIP = $first;
            unset($first, $second);
        } else {
            $aIP = explode(':', $aIP[0]);
        }
        $c = count($aIP);

        if ($c != 8) {
            return false;
        }

        //      All the pieces should be 16-bit hex strings. Are they?
        foreach ($aIP as $piece) {
            if (!preg_match('#^[0-9a-fA-F]{4}$#sD', sprintf('%04s', $piece))) {
                return false;
            }
        }
        return $original;
    }
}

/**
 * Primitive email validation class based on the regexp found at
 * http://www.regular-expressions.info/email.html
 */
class HTMLPurifier_AttrDef_URI_Email_SimpleCheck extends HTMLPurifier_AttrDef_URI_Email
{

    /**
     * @param string $string
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool|string The trimmed address, or false if it does not match
     */
    public function validate($string, $config, $context)
    {
        // no support for named mailboxes i.e. "Bob <bob@example.com>"
        // that needs more percent encoding to be done
        if ($string == '') {
            return false;
        }
        $string = trim($string);
        // NOTE(review): the pattern limits the last label to 2-4 letters,
        // so longer top-level domains are rejected
        $result = preg_match('/^[A-Z0-9._%-]+@[A-Z0-9.-]+\.[A-Z]{2,4}$/i', $string);
        return $result ? $string : false;
    }
}
/**
 * Pre-transform that changes proprietary background attribute to CSS.
 */
class HTMLPurifier_AttrTransform_Background extends HTMLPurifier_AttrTransform
{
    /**
     * Moves the value of a 'background' attribute, if present, into a
     * background-image declaration prepended to the element's CSS.
     *
     * @param array $attr
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return array
     */
    public function transform($attr, $config, $context)
    {
        if (isset($attr['background'])) {
            $image = $this->confiscateAttr($attr, 'background');
            // some validation should happen here
            $this->prependCSS($attr, 'background-image:url(' . $image . ');');
        }
        return $attr;
    }
}

// this MUST be placed in post, as it assumes that any value in dir is valid

/**
 * Post-transform that ensures that bdo tags have the dir attribute set.
 */
class HTMLPurifier_AttrTransform_BdoDir extends HTMLPurifier_AttrTransform
{

    /**
     * Fills in 'dir' from Attr.DefaultTextDir when the attribute is absent.
     *
     * @param array $attr
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return array
     */
    public function transform($attr, $config, $context)
    {
        if (!isset($attr['dir'])) {
            $attr['dir'] = $config->get('Attr.DefaultTextDir');
        }
        return $attr;
    }
}
13619 */ 13620class HTMLPurifier_AttrTransform_BgColor extends HTMLPurifier_AttrTransform 13621{ 13622 /** 13623 * @param array $attr 13624 * @param HTMLPurifier_Config $config 13625 * @param HTMLPurifier_Context $context 13626 * @return array 13627 */ 13628 public function transform($attr, $config, $context) 13629 { 13630 if (!isset($attr['bgcolor'])) { 13631 return $attr; 13632 } 13633 13634 $bgcolor = $this->confiscateAttr($attr, 'bgcolor'); 13635 // some validation should happen here 13636 13637 $this->prependCSS($attr, "background-color:$bgcolor;"); 13638 return $attr; 13639 } 13640} 13641 13642 13643 13644 13645 13646/** 13647 * Pre-transform that changes converts a boolean attribute to fixed CSS 13648 */ 13649class HTMLPurifier_AttrTransform_BoolToCSS extends HTMLPurifier_AttrTransform 13650{ 13651 /** 13652 * Name of boolean attribute that is trigger. 13653 * @type string 13654 */ 13655 protected $attr; 13656 13657 /** 13658 * CSS declarations to add to style, needs trailing semicolon. 13659 * @type string 13660 */ 13661 protected $css; 13662 13663 /** 13664 * @param string $attr attribute name to convert from 13665 * @param string $css CSS declarations to add to style (needs semicolon) 13666 */ 13667 public function __construct($attr, $css) 13668 { 13669 $this->attr = $attr; 13670 $this->css = $css; 13671 } 13672 13673 /** 13674 * @param array $attr 13675 * @param HTMLPurifier_Config $config 13676 * @param HTMLPurifier_Context $context 13677 * @return array 13678 */ 13679 public function transform($attr, $config, $context) 13680 { 13681 if (!isset($attr[$this->attr])) { 13682 return $attr; 13683 } 13684 unset($attr[$this->attr]); 13685 $this->prependCSS($attr, $this->css); 13686 return $attr; 13687 } 13688} 13689 13690 13691 13692 13693 13694/** 13695 * Pre-transform that changes deprecated border attribute to CSS. 
13696 */ 13697class HTMLPurifier_AttrTransform_Border extends HTMLPurifier_AttrTransform 13698{ 13699 /** 13700 * @param array $attr 13701 * @param HTMLPurifier_Config $config 13702 * @param HTMLPurifier_Context $context 13703 * @return array 13704 */ 13705 public function transform($attr, $config, $context) 13706 { 13707 if (!isset($attr['border'])) { 13708 return $attr; 13709 } 13710 $border_width = $this->confiscateAttr($attr, 'border'); 13711 // some validation should happen here 13712 $this->prependCSS($attr, "border:{$border_width}px solid;"); 13713 return $attr; 13714 } 13715} 13716 13717 13718 13719 13720 13721/** 13722 * Generic pre-transform that converts an attribute with a fixed number of 13723 * values (enumerated) to CSS. 13724 */ 13725class HTMLPurifier_AttrTransform_EnumToCSS extends HTMLPurifier_AttrTransform 13726{ 13727 /** 13728 * Name of attribute to transform from. 13729 * @type string 13730 */ 13731 protected $attr; 13732 13733 /** 13734 * Lookup array of attribute values to CSS. 13735 * @type array 13736 */ 13737 protected $enumToCSS = array(); 13738 13739 /** 13740 * Case sensitivity of the matching. 13741 * @type bool 13742 * @warning Currently can only be guaranteed to work with ASCII 13743 * values. 
13744 */ 13745 protected $caseSensitive = false; 13746 13747 /** 13748 * @param string $attr Attribute name to transform from 13749 * @param array $enum_to_css Lookup array of attribute values to CSS 13750 * @param bool $case_sensitive Case sensitivity indicator, default false 13751 */ 13752 public function __construct($attr, $enum_to_css, $case_sensitive = false) 13753 { 13754 $this->attr = $attr; 13755 $this->enumToCSS = $enum_to_css; 13756 $this->caseSensitive = (bool)$case_sensitive; 13757 } 13758 13759 /** 13760 * @param array $attr 13761 * @param HTMLPurifier_Config $config 13762 * @param HTMLPurifier_Context $context 13763 * @return array 13764 */ 13765 public function transform($attr, $config, $context) 13766 { 13767 if (!isset($attr[$this->attr])) { 13768 return $attr; 13769 } 13770 13771 $value = trim($attr[$this->attr]); 13772 unset($attr[$this->attr]); 13773 13774 if (!$this->caseSensitive) { 13775 $value = strtolower($value); 13776 } 13777 13778 if (!isset($this->enumToCSS[$value])) { 13779 return $attr; 13780 } 13781 $this->prependCSS($attr, $this->enumToCSS[$value]); 13782 return $attr; 13783 } 13784} 13785 13786 13787 13788 13789 13790// must be called POST validation 13791 13792/** 13793 * Transform that supplies default values for the src and alt attributes 13794 * in img tags, as well as prevents the img tag from being removed 13795 * because of a missing alt tag. This needs to be registered as both 13796 * a pre and post attribute transform. 
13797 */ 13798class HTMLPurifier_AttrTransform_ImgRequired extends HTMLPurifier_AttrTransform 13799{ 13800 13801 /** 13802 * @param array $attr 13803 * @param HTMLPurifier_Config $config 13804 * @param HTMLPurifier_Context $context 13805 * @return array 13806 */ 13807 public function transform($attr, $config, $context) 13808 { 13809 $src = true; 13810 if (!isset($attr['src'])) { 13811 if ($config->get('Core.RemoveInvalidImg')) { 13812 return $attr; 13813 } 13814 $attr['src'] = $config->get('Attr.DefaultInvalidImage'); 13815 $src = false; 13816 } 13817 13818 if (!isset($attr['alt'])) { 13819 if ($src) { 13820 $alt = $config->get('Attr.DefaultImageAlt'); 13821 if ($alt === null) { 13822 $attr['alt'] = basename($attr['src']); 13823 } else { 13824 $attr['alt'] = $alt; 13825 } 13826 } else { 13827 $attr['alt'] = $config->get('Attr.DefaultInvalidImageAlt'); 13828 } 13829 } 13830 return $attr; 13831 } 13832} 13833 13834 13835 13836 13837 13838/** 13839 * Pre-transform that changes deprecated hspace and vspace attributes to CSS 13840 */ 13841class HTMLPurifier_AttrTransform_ImgSpace extends HTMLPurifier_AttrTransform 13842{ 13843 /** 13844 * @type string 13845 */ 13846 protected $attr; 13847 13848 /** 13849 * @type array 13850 */ 13851 protected $css = array( 13852 'hspace' => array('left', 'right'), 13853 'vspace' => array('top', 'bottom') 13854 ); 13855 13856 /** 13857 * @param string $attr 13858 */ 13859 public function __construct($attr) 13860 { 13861 $this->attr = $attr; 13862 if (!isset($this->css[$attr])) { 13863 trigger_error(htmlspecialchars($attr) . 
' is not valid space attribute'); 13864 } 13865 } 13866 13867 /** 13868 * @param array $attr 13869 * @param HTMLPurifier_Config $config 13870 * @param HTMLPurifier_Context $context 13871 * @return array 13872 */ 13873 public function transform($attr, $config, $context) 13874 { 13875 if (!isset($attr[$this->attr])) { 13876 return $attr; 13877 } 13878 13879 $width = $this->confiscateAttr($attr, $this->attr); 13880 // some validation could happen here 13881 13882 if (!isset($this->css[$this->attr])) { 13883 return $attr; 13884 } 13885 13886 $style = ''; 13887 foreach ($this->css[$this->attr] as $suffix) { 13888 $property = "margin-$suffix"; 13889 $style .= "$property:{$width}px;"; 13890 } 13891 $this->prependCSS($attr, $style); 13892 return $attr; 13893 } 13894} 13895 13896 13897 13898 13899 13900/** 13901 * Performs miscellaneous cross attribute validation and filtering for 13902 * input elements. This is meant to be a post-transform. 13903 */ 13904class HTMLPurifier_AttrTransform_Input extends HTMLPurifier_AttrTransform 13905{ 13906 /** 13907 * @type HTMLPurifier_AttrDef_HTML_Pixels 13908 */ 13909 protected $pixels; 13910 13911 public function __construct() 13912 { 13913 $this->pixels = new HTMLPurifier_AttrDef_HTML_Pixels(); 13914 } 13915 13916 /** 13917 * @param array $attr 13918 * @param HTMLPurifier_Config $config 13919 * @param HTMLPurifier_Context $context 13920 * @return array 13921 */ 13922 public function transform($attr, $config, $context) 13923 { 13924 if (!isset($attr['type'])) { 13925 $t = 'text'; 13926 } else { 13927 $t = strtolower($attr['type']); 13928 } 13929 if (isset($attr['checked']) && $t !== 'radio' && $t !== 'checkbox') { 13930 unset($attr['checked']); 13931 } 13932 if (isset($attr['maxlength']) && $t !== 'text' && $t !== 'password') { 13933 unset($attr['maxlength']); 13934 } 13935 if (isset($attr['size']) && $t !== 'text' && $t !== 'password') { 13936 $result = $this->pixels->validate($attr['size'], $config, $context); 13937 if ($result === 
false) { 13938 unset($attr['size']); 13939 } else { 13940 $attr['size'] = $result; 13941 } 13942 } 13943 if (isset($attr['src']) && $t !== 'image') { 13944 unset($attr['src']); 13945 } 13946 if (!isset($attr['value']) && ($t === 'radio' || $t === 'checkbox')) { 13947 $attr['value'] = ''; 13948 } 13949 return $attr; 13950 } 13951} 13952 13953 13954 13955 13956 13957/** 13958 * Post-transform that copies lang's value to xml:lang (and vice-versa) 13959 * @note Theoretically speaking, this could be a pre-transform, but putting 13960 * post is more efficient. 13961 */ 13962class HTMLPurifier_AttrTransform_Lang extends HTMLPurifier_AttrTransform 13963{ 13964 13965 /** 13966 * @param array $attr 13967 * @param HTMLPurifier_Config $config 13968 * @param HTMLPurifier_Context $context 13969 * @return array 13970 */ 13971 public function transform($attr, $config, $context) 13972 { 13973 $lang = isset($attr['lang']) ? $attr['lang'] : false; 13974 $xml_lang = isset($attr['xml:lang']) ? $attr['xml:lang'] : false; 13975 13976 if ($lang !== false && $xml_lang === false) { 13977 $attr['xml:lang'] = $lang; 13978 } elseif ($xml_lang !== false) { 13979 $attr['lang'] = $xml_lang; 13980 } 13981 return $attr; 13982 } 13983} 13984 13985 13986 13987 13988 13989/** 13990 * Class for handling width/height length attribute transformations to CSS 13991 */ 13992class HTMLPurifier_AttrTransform_Length extends HTMLPurifier_AttrTransform 13993{ 13994 13995 /** 13996 * @type string 13997 */ 13998 protected $name; 13999 14000 /** 14001 * @type string 14002 */ 14003 protected $cssName; 14004 14005 public function __construct($name, $css_name = null) 14006 { 14007 $this->name = $name; 14008 $this->cssName = $css_name ? 
$css_name : $name; 14009 } 14010 14011 /** 14012 * @param array $attr 14013 * @param HTMLPurifier_Config $config 14014 * @param HTMLPurifier_Context $context 14015 * @return array 14016 */ 14017 public function transform($attr, $config, $context) 14018 { 14019 if (!isset($attr[$this->name])) { 14020 return $attr; 14021 } 14022 $length = $this->confiscateAttr($attr, $this->name); 14023 if (ctype_digit($length)) { 14024 $length .= 'px'; 14025 } 14026 $this->prependCSS($attr, $this->cssName . ":$length;"); 14027 return $attr; 14028 } 14029} 14030 14031 14032 14033 14034 14035/** 14036 * Pre-transform that changes deprecated name attribute to ID if necessary 14037 */ 14038class HTMLPurifier_AttrTransform_Name extends HTMLPurifier_AttrTransform 14039{ 14040 14041 /** 14042 * @param array $attr 14043 * @param HTMLPurifier_Config $config 14044 * @param HTMLPurifier_Context $context 14045 * @return array 14046 */ 14047 public function transform($attr, $config, $context) 14048 { 14049 // Abort early if we're using relaxed definition of name 14050 if ($config->get('HTML.Attr.Name.UseCDATA')) { 14051 return $attr; 14052 } 14053 if (!isset($attr['name'])) { 14054 return $attr; 14055 } 14056 $id = $this->confiscateAttr($attr, 'name'); 14057 if (isset($attr['id'])) { 14058 return $attr; 14059 } 14060 $attr['id'] = $id; 14061 return $attr; 14062 } 14063} 14064 14065 14066 14067 14068 14069/** 14070 * Post-transform that performs validation to the name attribute; if 14071 * it is present with an equivalent id attribute, it is passed through; 14072 * otherwise validation is performed. 
14073 */ 14074class HTMLPurifier_AttrTransform_NameSync extends HTMLPurifier_AttrTransform 14075{ 14076 14077 public function __construct() 14078 { 14079 $this->idDef = new HTMLPurifier_AttrDef_HTML_ID(); 14080 } 14081 14082 /** 14083 * @param array $attr 14084 * @param HTMLPurifier_Config $config 14085 * @param HTMLPurifier_Context $context 14086 * @return array 14087 */ 14088 public function transform($attr, $config, $context) 14089 { 14090 if (!isset($attr['name'])) { 14091 return $attr; 14092 } 14093 $name = $attr['name']; 14094 if (isset($attr['id']) && $attr['id'] === $name) { 14095 return $attr; 14096 } 14097 $result = $this->idDef->validate($name, $config, $context); 14098 if ($result === false) { 14099 unset($attr['name']); 14100 } else { 14101 $attr['name'] = $result; 14102 } 14103 return $attr; 14104 } 14105} 14106 14107 14108 14109 14110 14111// must be called POST validation 14112 14113/** 14114 * Adds rel="nofollow" to all outbound links. This transform is 14115 * only attached if Attr.Nofollow is TRUE. 
14116 */ 14117class HTMLPurifier_AttrTransform_Nofollow extends HTMLPurifier_AttrTransform 14118{ 14119 /** 14120 * @type HTMLPurifier_URIParser 14121 */ 14122 private $parser; 14123 14124 public function __construct() 14125 { 14126 $this->parser = new HTMLPurifier_URIParser(); 14127 } 14128 14129 /** 14130 * @param array $attr 14131 * @param HTMLPurifier_Config $config 14132 * @param HTMLPurifier_Context $context 14133 * @return array 14134 */ 14135 public function transform($attr, $config, $context) 14136 { 14137 if (!isset($attr['href'])) { 14138 return $attr; 14139 } 14140 14141 // XXX Kind of inefficient 14142 $url = $this->parser->parse($attr['href']); 14143 $scheme = $url->getSchemeObj($config, $context); 14144 14145 if ($scheme->browsable && !$url->isLocal($config, $context)) { 14146 if (isset($attr['rel'])) { 14147 $rels = explode(' ', $attr['rel']); 14148 if (!in_array('nofollow', $rels)) { 14149 $rels[] = 'nofollow'; 14150 } 14151 $attr['rel'] = implode(' ', $rels); 14152 } else { 14153 $attr['rel'] = 'nofollow'; 14154 } 14155 } 14156 return $attr; 14157 } 14158} 14159 14160 14161 14162 14163 14164class HTMLPurifier_AttrTransform_SafeEmbed extends HTMLPurifier_AttrTransform 14165{ 14166 /** 14167 * @type string 14168 */ 14169 public $name = "SafeEmbed"; 14170 14171 /** 14172 * @param array $attr 14173 * @param HTMLPurifier_Config $config 14174 * @param HTMLPurifier_Context $context 14175 * @return array 14176 */ 14177 public function transform($attr, $config, $context) 14178 { 14179 $attr['allowscriptaccess'] = 'never'; 14180 $attr['allownetworking'] = 'internal'; 14181 $attr['type'] = 'application/x-shockwave-flash'; 14182 return $attr; 14183 } 14184} 14185 14186 14187 14188 14189 14190/** 14191 * Writes default type for all objects. Currently only supports flash. 
14192 */ 14193class HTMLPurifier_AttrTransform_SafeObject extends HTMLPurifier_AttrTransform 14194{ 14195 /** 14196 * @type string 14197 */ 14198 public $name = "SafeObject"; 14199 14200 /** 14201 * @param array $attr 14202 * @param HTMLPurifier_Config $config 14203 * @param HTMLPurifier_Context $context 14204 * @return array 14205 */ 14206 public function transform($attr, $config, $context) 14207 { 14208 if (!isset($attr['type'])) { 14209 $attr['type'] = 'application/x-shockwave-flash'; 14210 } 14211 return $attr; 14212 } 14213} 14214 14215 14216 14217 14218 14219/** 14220 * Validates name/value pairs in param tags to be used in safe objects. This 14221 * will only allow name values it recognizes, and pre-fill certain attributes 14222 * with required values. 14223 * 14224 * @note 14225 * This class only supports Flash. In the future, Quicktime support 14226 * may be added. 14227 * 14228 * @warning 14229 * This class expects an injector to add the necessary parameters tags. 14230 */ 14231class HTMLPurifier_AttrTransform_SafeParam extends HTMLPurifier_AttrTransform 14232{ 14233 /** 14234 * @type string 14235 */ 14236 public $name = "SafeParam"; 14237 14238 /** 14239 * @type HTMLPurifier_AttrDef_URI 14240 */ 14241 private $uri; 14242 14243 public function __construct() 14244 { 14245 $this->uri = new HTMLPurifier_AttrDef_URI(true); // embedded 14246 $this->wmode = new HTMLPurifier_AttrDef_Enum(array('window', 'opaque', 'transparent')); 14247 } 14248 14249 /** 14250 * @param array $attr 14251 * @param HTMLPurifier_Config $config 14252 * @param HTMLPurifier_Context $context 14253 * @return array 14254 */ 14255 public function transform($attr, $config, $context) 14256 { 14257 // If we add support for other objects, we'll need to alter the 14258 // transforms. 
14259 switch ($attr['name']) { 14260 // application/x-shockwave-flash 14261 // Keep this synchronized with Injector/SafeObject.php 14262 case 'allowScriptAccess': 14263 $attr['value'] = 'never'; 14264 break; 14265 case 'allowNetworking': 14266 $attr['value'] = 'internal'; 14267 break; 14268 case 'allowFullScreen': 14269 if ($config->get('HTML.FlashAllowFullScreen')) { 14270 $attr['value'] = ($attr['value'] == 'true') ? 'true' : 'false'; 14271 } else { 14272 $attr['value'] = 'false'; 14273 } 14274 break; 14275 case 'wmode': 14276 $attr['value'] = $this->wmode->validate($attr['value'], $config, $context); 14277 break; 14278 case 'movie': 14279 case 'src': 14280 $attr['name'] = "movie"; 14281 $attr['value'] = $this->uri->validate($attr['value'], $config, $context); 14282 break; 14283 case 'flashvars': 14284 // we're going to allow arbitrary inputs to the SWF, on 14285 // the reasoning that it could only hack the SWF, not us. 14286 break; 14287 // add other cases to support other param name/value pairs 14288 default: 14289 $attr['name'] = $attr['value'] = null; 14290 } 14291 return $attr; 14292 } 14293} 14294 14295 14296 14297 14298 14299/** 14300 * Implements required attribute stipulation for <script> 14301 */ 14302class HTMLPurifier_AttrTransform_ScriptRequired extends HTMLPurifier_AttrTransform 14303{ 14304 /** 14305 * @param array $attr 14306 * @param HTMLPurifier_Config $config 14307 * @param HTMLPurifier_Context $context 14308 * @return array 14309 */ 14310 public function transform($attr, $config, $context) 14311 { 14312 if (!isset($attr['type'])) { 14313 $attr['type'] = 'text/javascript'; 14314 } 14315 return $attr; 14316 } 14317} 14318 14319 14320 14321 14322 14323// must be called POST validation 14324 14325/** 14326 * Adds target="blank" to all outbound links. This transform is 14327 * only attached if Attr.TargetBlank is TRUE. 
This works regardless 14328 * of whether or not Attr.AllowedFrameTargets 14329 */ 14330class HTMLPurifier_AttrTransform_TargetBlank extends HTMLPurifier_AttrTransform 14331{ 14332 /** 14333 * @type HTMLPurifier_URIParser 14334 */ 14335 private $parser; 14336 14337 public function __construct() 14338 { 14339 $this->parser = new HTMLPurifier_URIParser(); 14340 } 14341 14342 /** 14343 * @param array $attr 14344 * @param HTMLPurifier_Config $config 14345 * @param HTMLPurifier_Context $context 14346 * @return array 14347 */ 14348 public function transform($attr, $config, $context) 14349 { 14350 if (!isset($attr['href'])) { 14351 return $attr; 14352 } 14353 14354 // XXX Kind of inefficient 14355 $url = $this->parser->parse($attr['href']); 14356 $scheme = $url->getSchemeObj($config, $context); 14357 14358 if ($scheme->browsable && !$url->isBenign($config, $context)) { 14359 $attr['target'] = '_blank'; 14360 } 14361 return $attr; 14362 } 14363} 14364 14365 14366 14367 14368 14369// must be called POST validation 14370 14371/** 14372 * Adds rel="noopener" to any links which target a different window 14373 * than the current one. This is used to prevent malicious websites 14374 * from silently replacing the original window, which could be used 14375 * to do phishing. 14376 * This transform is controlled by %HTML.TargetNoopener. 
14377 */ 14378class HTMLPurifier_AttrTransform_TargetNoopener extends HTMLPurifier_AttrTransform 14379{ 14380 /** 14381 * @param array $attr 14382 * @param HTMLPurifier_Config $config 14383 * @param HTMLPurifier_Context $context 14384 * @return array 14385 */ 14386 public function transform($attr, $config, $context) 14387 { 14388 if (isset($attr['rel'])) { 14389 $rels = explode(' ', $attr['rel']); 14390 } else { 14391 $rels = array(); 14392 } 14393 if (isset($attr['target']) && !in_array('noopener', $rels)) { 14394 $rels[] = 'noopener'; 14395 } 14396 if (!empty($rels) || isset($attr['rel'])) { 14397 $attr['rel'] = implode(' ', $rels); 14398 } 14399 14400 return $attr; 14401 } 14402} 14403 14404 14405 14406 14407// must be called POST validation 14408 14409/** 14410 * Adds rel="noreferrer" to any links which target a different window 14411 * than the current one. This is used to prevent malicious websites 14412 * from silently replacing the original window, which could be used 14413 * to do phishing. 14414 * This transform is controlled by %HTML.TargetNoreferrer. 
14415 */ 14416class HTMLPurifier_AttrTransform_TargetNoreferrer extends HTMLPurifier_AttrTransform 14417{ 14418 /** 14419 * @param array $attr 14420 * @param HTMLPurifier_Config $config 14421 * @param HTMLPurifier_Context $context 14422 * @return array 14423 */ 14424 public function transform($attr, $config, $context) 14425 { 14426 if (isset($attr['rel'])) { 14427 $rels = explode(' ', $attr['rel']); 14428 } else { 14429 $rels = array(); 14430 } 14431 if (isset($attr['target']) && !in_array('noreferrer', $rels)) { 14432 $rels[] = 'noreferrer'; 14433 } 14434 if (!empty($rels) || isset($attr['rel'])) { 14435 $attr['rel'] = implode(' ', $rels); 14436 } 14437 14438 return $attr; 14439 } 14440} 14441 14442 14443 14444 14445/** 14446 * Sets height/width defaults for <textarea> 14447 */ 14448class HTMLPurifier_AttrTransform_Textarea extends HTMLPurifier_AttrTransform 14449{ 14450 /** 14451 * @param array $attr 14452 * @param HTMLPurifier_Config $config 14453 * @param HTMLPurifier_Context $context 14454 * @return array 14455 */ 14456 public function transform($attr, $config, $context) 14457 { 14458 // Calculated from Firefox 14459 if (!isset($attr['cols'])) { 14460 $attr['cols'] = '22'; 14461 } 14462 if (!isset($attr['rows'])) { 14463 $attr['rows'] = '3'; 14464 } 14465 return $attr; 14466 } 14467} 14468 14469 14470 14471 14472 14473/** 14474 * Definition that uses different definitions depending on context. 14475 * 14476 * The del and ins tags are notable because they allow different types of 14477 * elements depending on whether or not they're in a block or inline context. 14478 * Chameleon allows this behavior to happen by using two different 14479 * definitions depending on context. While this somewhat generalized, 14480 * it is specifically intended for those two tags. 14481 */ 14482class HTMLPurifier_ChildDef_Chameleon extends HTMLPurifier_ChildDef 14483{ 14484 14485 /** 14486 * Instance of the definition object to use when inline. Usually stricter. 
14487 * @type HTMLPurifier_ChildDef_Optional 14488 */ 14489 public $inline; 14490 14491 /** 14492 * Instance of the definition object to use when block. 14493 * @type HTMLPurifier_ChildDef_Optional 14494 */ 14495 public $block; 14496 14497 /** 14498 * @type string 14499 */ 14500 public $type = 'chameleon'; 14501 14502 /** 14503 * @param array $inline List of elements to allow when inline. 14504 * @param array $block List of elements to allow when block. 14505 */ 14506 public function __construct($inline, $block) 14507 { 14508 $this->inline = new HTMLPurifier_ChildDef_Optional($inline); 14509 $this->block = new HTMLPurifier_ChildDef_Optional($block); 14510 $this->elements = $this->block->elements; 14511 } 14512 14513 /** 14514 * @param HTMLPurifier_Node[] $children 14515 * @param HTMLPurifier_Config $config 14516 * @param HTMLPurifier_Context $context 14517 * @return bool 14518 */ 14519 public function validateChildren($children, $config, $context) 14520 { 14521 if ($context->get('IsInline') === false) { 14522 return $this->block->validateChildren( 14523 $children, 14524 $config, 14525 $context 14526 ); 14527 } else { 14528 return $this->inline->validateChildren( 14529 $children, 14530 $config, 14531 $context 14532 ); 14533 } 14534 } 14535} 14536 14537 14538 14539 14540 14541/** 14542 * Custom validation class, accepts DTD child definitions 14543 * 14544 * @warning Currently this class is an all or nothing proposition, that is, 14545 * it will only give a bool return value. 14546 */ 14547class HTMLPurifier_ChildDef_Custom extends HTMLPurifier_ChildDef 14548{ 14549 /** 14550 * @type string 14551 */ 14552 public $type = 'custom'; 14553 14554 /** 14555 * @type bool 14556 */ 14557 public $allow_empty = false; 14558 14559 /** 14560 * Allowed child pattern as defined by the DTD. 14561 * @type string 14562 */ 14563 public $dtd_regex; 14564 14565 /** 14566 * PCRE regex derived from $dtd_regex. 
14567 * @type string 14568 */ 14569 private $_pcre_regex; 14570 14571 /** 14572 * @param $dtd_regex Allowed child pattern from the DTD 14573 */ 14574 public function __construct($dtd_regex) 14575 { 14576 $this->dtd_regex = $dtd_regex; 14577 $this->_compileRegex(); 14578 } 14579 14580 /** 14581 * Compiles the PCRE regex from a DTD regex ($dtd_regex to $_pcre_regex) 14582 */ 14583 protected function _compileRegex() 14584 { 14585 $raw = str_replace(' ', '', $this->dtd_regex); 14586 if ($raw[0] != '(') { 14587 $raw = "($raw)"; 14588 } 14589 $el = '[#a-zA-Z0-9_.-]+'; 14590 $reg = $raw; 14591 14592 // COMPLICATED! AND MIGHT BE BUGGY! I HAVE NO CLUE WHAT I'M 14593 // DOING! Seriously: if there's problems, please report them. 14594 14595 // collect all elements into the $elements array 14596 preg_match_all("/$el/", $reg, $matches); 14597 foreach ($matches[0] as $match) { 14598 $this->elements[$match] = true; 14599 } 14600 14601 // setup all elements as parentheticals with leading commas 14602 $reg = preg_replace("/$el/", '(,\\0)', $reg); 14603 14604 // remove commas when they were not solicited 14605 $reg = preg_replace("/([^,(|]\(+),/", '\\1', $reg); 14606 14607 // remove all non-paranthetical commas: they are handled by first regex 14608 $reg = preg_replace("/,\(/", '(', $reg); 14609 14610 $this->_pcre_regex = $reg; 14611 } 14612 14613 /** 14614 * @param HTMLPurifier_Node[] $children 14615 * @param HTMLPurifier_Config $config 14616 * @param HTMLPurifier_Context $context 14617 * @return bool 14618 */ 14619 public function validateChildren($children, $config, $context) 14620 { 14621 $list_of_children = ''; 14622 $nesting = 0; // depth into the nest 14623 foreach ($children as $node) { 14624 if (!empty($node->is_whitespace)) { 14625 continue; 14626 } 14627 $list_of_children .= $node->name . ','; 14628 } 14629 // add leading comma to deal with stray comma declarations 14630 $list_of_children = ',' . rtrim($list_of_children, ','); 14631 $okay = 14632 preg_match( 14633 '/^,?' 
. $this->_pcre_regex . '$/', 14634 $list_of_children 14635 ); 14636 return (bool)$okay; 14637 } 14638} 14639 14640 14641 14642 14643 14644/** 14645 * Definition that disallows all elements. 14646 * @warning validateChildren() in this class is actually never called, because 14647 * empty elements are corrected in HTMLPurifier_Strategy_MakeWellFormed 14648 * before child definitions are parsed in earnest by 14649 * HTMLPurifier_Strategy_FixNesting. 14650 */ 14651class HTMLPurifier_ChildDef_Empty extends HTMLPurifier_ChildDef 14652{ 14653 /** 14654 * @type bool 14655 */ 14656 public $allow_empty = true; 14657 14658 /** 14659 * @type string 14660 */ 14661 public $type = 'empty'; 14662 14663 public function __construct() 14664 { 14665 } 14666 14667 /** 14668 * @param HTMLPurifier_Node[] $children 14669 * @param HTMLPurifier_Config $config 14670 * @param HTMLPurifier_Context $context 14671 * @return array 14672 */ 14673 public function validateChildren($children, $config, $context) 14674 { 14675 return array(); 14676 } 14677} 14678 14679 14680 14681 14682 14683/** 14684 * Definition for list containers ul and ol. 14685 * 14686 * What does this do? The big thing is to handle ol/ul at the top 14687 * level of list nodes, which should be handled specially by /folding/ 14688 * them into the previous list node. We generally shouldn't ever 14689 * see other disallowed elements, because the autoclose behavior 14690 * in MakeWellFormed handles it. 
14691 */ 14692class HTMLPurifier_ChildDef_List extends HTMLPurifier_ChildDef 14693{ 14694 /** 14695 * @type string 14696 */ 14697 public $type = 'list'; 14698 /** 14699 * @type array 14700 */ 14701 // lying a little bit, so that we can handle ul and ol ourselves 14702 // XXX: This whole business with 'wrap' is all a bit unsatisfactory 14703 public $elements = array('li' => true, 'ul' => true, 'ol' => true); 14704 14705 /** 14706 * @param array $children 14707 * @param HTMLPurifier_Config $config 14708 * @param HTMLPurifier_Context $context 14709 * @return array 14710 */ 14711 public function validateChildren($children, $config, $context) 14712 { 14713 // Flag for subclasses 14714 $this->whitespace = false; 14715 14716 // if there are no tokens, delete parent node 14717 if (empty($children)) { 14718 return false; 14719 } 14720 14721 // if li is not allowed, delete parent node 14722 if (!isset($config->getHTMLDefinition()->info['li'])) { 14723 trigger_error("Cannot allow ul/ol without allowing li", E_USER_WARNING); 14724 return false; 14725 } 14726 14727 // the new set of children 14728 $result = array(); 14729 14730 // a little sanity check to make sure it's not ALL whitespace 14731 $all_whitespace = true; 14732 14733 $current_li = null; 14734 14735 foreach ($children as $node) { 14736 if (!empty($node->is_whitespace)) { 14737 $result[] = $node; 14738 continue; 14739 } 14740 $all_whitespace = false; // phew, we're not talking about whitespace 14741 14742 if ($node->name === 'li') { 14743 // good 14744 $current_li = $node; 14745 $result[] = $node; 14746 } else { 14747 // we want to tuck this into the previous li 14748 // Invariant: we expect the node to be ol/ul 14749 // ToDo: Make this more robust in the case of not ol/ul 14750 // by distinguishing between existing li and li created 14751 // to handle non-list elements; non-list elements should 14752 // not be appended to an existing li; only li created 14753 // for non-list. This distinction is not currently made. 
14754 if ($current_li === null) { 14755 $current_li = new HTMLPurifier_Node_Element('li'); 14756 $result[] = $current_li; 14757 } 14758 $current_li->children[] = $node; 14759 $current_li->empty = false; // XXX fascinating! Check for this error elsewhere ToDo 14760 } 14761 } 14762 if (empty($result)) { 14763 return false; 14764 } 14765 if ($all_whitespace) { 14766 return false; 14767 } 14768 return $result; 14769 } 14770} 14771 14772 14773 14774 14775 14776/** 14777 * Definition that allows a set of elements, but disallows empty children. 14778 */ 14779class HTMLPurifier_ChildDef_Required extends HTMLPurifier_ChildDef 14780{ 14781 /** 14782 * Lookup table of allowed elements. 14783 * @type array 14784 */ 14785 public $elements = array(); 14786 14787 /** 14788 * Whether or not the last passed node was all whitespace. 14789 * @type bool 14790 */ 14791 protected $whitespace = false; 14792 14793 /** 14794 * @param array|string $elements List of allowed element names (lowercase). 14795 */ 14796 public function __construct($elements) 14797 { 14798 if (is_string($elements)) { 14799 $elements = str_replace(' ', '', $elements); 14800 $elements = explode('|', $elements); 14801 } 14802 $keys = array_keys($elements); 14803 if ($keys == array_keys($keys)) { 14804 $elements = array_flip($elements); 14805 foreach ($elements as $i => $x) { 14806 $elements[$i] = true; 14807 if (empty($i)) { 14808 unset($elements[$i]); 14809 } // remove blank 14810 } 14811 } 14812 $this->elements = $elements; 14813 } 14814 14815 /** 14816 * @type bool 14817 */ 14818 public $allow_empty = false; 14819 14820 /** 14821 * @type string 14822 */ 14823 public $type = 'required'; 14824 14825 /** 14826 * @param array $children 14827 * @param HTMLPurifier_Config $config 14828 * @param HTMLPurifier_Context $context 14829 * @return array 14830 */ 14831 public function validateChildren($children, $config, $context) 14832 { 14833 // Flag for subclasses 14834 $this->whitespace = false; 14835 14836 // if there are 
no tokens, delete parent node 14837 if (empty($children)) { 14838 return false; 14839 } 14840 14841 // the new set of children 14842 $result = array(); 14843 14844 // whether or not parsed character data is allowed 14845 // this controls whether or not we silently drop a tag 14846 // or generate escaped HTML from it 14847 $pcdata_allowed = isset($this->elements['#PCDATA']); 14848 14849 // a little sanity check to make sure it's not ALL whitespace 14850 $all_whitespace = true; 14851 14852 $stack = array_reverse($children); 14853 while (!empty($stack)) { 14854 $node = array_pop($stack); 14855 if (!empty($node->is_whitespace)) { 14856 $result[] = $node; 14857 continue; 14858 } 14859 $all_whitespace = false; // phew, we're not talking about whitespace 14860 14861 if (!isset($this->elements[$node->name])) { 14862 // special case text 14863 // XXX One of these ought to be redundant or something 14864 if ($pcdata_allowed && $node instanceof HTMLPurifier_Node_Text) { 14865 $result[] = $node; 14866 continue; 14867 } 14868 // spill the child contents in 14869 // ToDo: Make configurable 14870 if ($node instanceof HTMLPurifier_Node_Element) { 14871 for ($i = count($node->children) - 1; $i >= 0; $i--) { 14872 $stack[] = $node->children[$i]; 14873 } 14874 continue; 14875 } 14876 continue; 14877 } 14878 $result[] = $node; 14879 } 14880 if (empty($result)) { 14881 return false; 14882 } 14883 if ($all_whitespace) { 14884 $this->whitespace = true; 14885 return false; 14886 } 14887 return $result; 14888 } 14889} 14890 14891 14892 14893 14894 14895/** 14896 * Definition that allows a set of elements, and allows no children. 14897 * @note This is a hack to reuse code from HTMLPurifier_ChildDef_Required, 14898 * really, one shouldn't inherit from the other. Only altered behavior 14899 * is to overload a returned false with an array. Thus, it will never 14900 * return false. 
*/
class HTMLPurifier_ChildDef_Optional extends HTMLPurifier_ChildDef_Required
{
    /**
     * @type bool
     */
    public $allow_empty = true;

    /**
     * @type string
     */
    public $type = 'optional';

    /**
     * Runs the parent validation and replaces its false result with a
     * non-false value.
     *
     * @param array $children
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return array|bool The parent's result when it is not false; otherwise
     *         true if there were no children, the original children if they
     *         were all whitespace, or an empty array.
     */
    public function validateChildren($children, $config, $context)
    {
        $result = parent::validateChildren($children, $config, $context);
        if ($result !== false) {
            return $result;
        }
        // we assume that $children is not modified
        if (empty($children)) {
            return true;
        }
        // $this->whitespace is set by the parent when only whitespace was seen
        return $this->whitespace ? $children : array();
    }
}



/**
 * Takes the contents of blockquote when in strict and reformats for validation.
 */
class HTMLPurifier_ChildDef_StrictBlockquote extends HTMLPurifier_ChildDef_Required
{
    /**
     * The element lookup this definition was constructed with.
     * @type array
     */
    protected $real_elements;

    /**
     * Widened lookup (the 'Flow' content set plus #PCDATA) that is swapped
     * in while the parent validation runs.
     * @type array
     */
    protected $fake_elements;

    /**
     * @type bool
     */
    public $allow_empty = true;

    /**
     * @type string
     */
    public $type = 'strictblockquote';

    /**
     * Whether init() has already populated the real/fake lookups.
     * @type bool
     */
    protected $init = false;

    /**
     * @param HTMLPurifier_Config $config
     * @return array
     * @note We don't want MakeWellFormed to auto-close inline elements since
     *       they might be allowed.
     */
    public function getAllowedElements($config)
    {
        $this->init($config);
        return $this->fake_elements;
    }

    /**
     * Validates the children against the widened element set, then groups
     * runs of nodes that are not directly allowed (non-whitespace text and
     * elements outside the real lookup) inside a block wrapper element.
     *
     * @param array $children
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return array
     */
    public function validateChildren($children, $config, $context)
    {
        $this->init($config);

        // trick the parent class into thinking it allows more
        $this->elements = $this->fake_elements;
        $result = parent::validateChildren($children, $config, $context);
        $this->elements = $this->real_elements;

        if ($result === false) {
            return array();
        }
        if ($result === true) {
            $result = $children;
        }

        $def = $config->getHTMLDefinition();
        // Name of the element used to wrap stray content; read once and
        // reused for every wrapper created below.
        $block_wrap_name = $def->info_block_wrapper;
        $block_wrap = false; // current open wrapper element, or false
        $ret = array();

        foreach ($result as $node) {
            if ($block_wrap === false) {
                // Not inside a wrapper: open one when we meet content that
                // is not allowed directly.
                if (($node instanceof HTMLPurifier_Node_Text && !$node->is_whitespace) ||
                    ($node instanceof HTMLPurifier_Node_Element && !isset($this->elements[$node->name]))) {
                    $block_wrap = new HTMLPurifier_Node_Element($block_wrap_name);
                    $ret[] = $block_wrap;
                }
            } else {
                // Inside a wrapper: a directly allowed element closes it.
                if ($node instanceof HTMLPurifier_Node_Element && isset($this->elements[$node->name])) {
                    $block_wrap = false;
                }
            }
            if ($block_wrap) {
                $block_wrap->children[] = $node;
            } else {
                $ret[] = $node;
            }
        }
        return $ret;
    }

    /**
     * Lazily builds the real and fake element lookups on first use.
     *
     * @param HTMLPurifier_Config $config
     */
    private function init($config)
    {
        if (!$this->init) {
            $def = $config->getHTMLDefinition();
            // allow all inline elements
            $this->real_elements = $this->elements;
            $this->fake_elements = $def->info_content_sets['Flow'];
            $this->fake_elements['#PCDATA'] = true;
            $this->init = true;
        }
    }
}



/**
 * Definition for tables.  The general idea is to extract out all of the
 * essential bits, and then reconstruct it later.
 *
 * This is a bit confusing, because the DTDs and the W3C
 * validators seem to disagree on the appropriate definition. The
 * DTD claims:
 *
 *      (CAPTION?, (COL*|COLGROUP*), THEAD?, TFOOT?, TBODY+)
 *
 * But actually, the HTML4 spec then has this to say:
 *
 *      The TBODY start tag is always required except when the table
 *      contains only one table body and no table head or foot sections.
 *      The TBODY end tag may always be safely omitted.
 *
 * So the DTD is kind of wrong.  The validator is, unfortunately, kind
 * of on crack.
 *
 * The definition changed again in XHTML1.1; and in my opinion, this
 * formulation makes the most sense.
 *
 *      caption?, ( col* | colgroup* ), (( thead?, tfoot?, tbody+ ) | ( tr+ ))
 *
 * Essentially, we have two modes: thead/tfoot/tbody mode, and tr mode.
 * If we encounter a thead, tfoot or tbody, we are placed in the former
 * mode, and we *must* wrap any stray tr segments with a tbody. But if
 * we don't run into any of them, just having tr tags is OK.
*/
class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
{
    /**
     * @type bool
     */
    public $allow_empty = false;

    /**
     * @type string
     */
    public $type = 'table';

    /**
     * @type array
     */
    public $elements = array(
        'tr' => true,
        'tbody' => true,
        'thead' => true,
        'tfoot' => true,
        'caption' => true,
        'colgroup' => true,
        'col' => true
    );

    public function __construct()
    {
    }

    /**
     * Sorts the table's children into caption / columns / head / foot /
     * body content and reassembles them in canonical order. Whitespace and
     * comment nodes stay attached to the end of the element they follow.
     *
     * @param array $children
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return array|bool The reordered children, or false when there are no
     *         children or no tr/tbody content was found.
     */
    public function validateChildren($children, $config, $context)
    {
        if (empty($children)) {
            return false;
        }

        // only one of these elements is allowed in a table
        $caption = false;
        $thead = false;
        $tfoot = false;

        // whitespace
        $initial_ws = array();
        $after_caption_ws = array();
        $after_thead_ws = array();
        $after_tfoot_ws = array();

        // as many of these as you want
        $cols = array();
        $content = array();

        $tbody_mode = false; // if true, then we need to wrap any stray
                             // <tr>s with a <tbody>.

        // Reference to whichever bucket trailing whitespace/comments
        // should currently be appended to.
        $ws_accum =& $initial_ws;

        foreach ($children as $node) {
            if ($node instanceof HTMLPurifier_Node_Comment) {
                $ws_accum[] = $node;
                continue;
            }
            switch ($node->name) {
            case 'tbody':
                $tbody_mode = true;
                // fall through
            case 'tr':
                $content[] = $node;
                $ws_accum =& $content;
                break;
            case 'caption':
                // there can only be one caption!
                if ($caption !== false) break;
                $caption = $node;
                $ws_accum =& $after_caption_ws;
                break;
            case 'thead':
                $tbody_mode = true;
                // XXX This breaks rendering properties with
                // Firefox, which never floats a <thead> to
                // the top. Ever.  (Our scheme will float the
                // first <thead> to the top.)  So maybe
                // <thead>s that are not first should be
                // turned into <tbody>? Very tricky, indeed.
                if ($thead === false) {
                    $thead = $node;
                    $ws_accum =& $after_thead_ws;
                } else {
                    // Oops, there's a second one! What
                    // should we do?  Current behavior is to
                    // transmutate the first and last entries into
                    // tbody tags, and then put into content.
                    // Maybe a better idea is to *attach
                    // it* to the existing thead or tfoot?
                    // We don't do this, because Firefox
                    // doesn't float an extra tfoot to the
                    // bottom like it does for the first one.
                    $node->name = 'tbody';
                    $content[] = $node;
                    $ws_accum =& $content;
                }
                break;
            case 'tfoot':
                // see above for some caveats
                $tbody_mode = true;
                if ($tfoot === false) {
                    $tfoot = $node;
                    $ws_accum =& $after_tfoot_ws;
                } else {
                    $node->name = 'tbody';
                    $content[] = $node;
                    $ws_accum =& $content;
                }
                break;
            case 'colgroup':
            case 'col':
                $cols[] = $node;
                $ws_accum =& $cols;
                break;
            case '#PCDATA':
                // How is whitespace handled? We treat it as sticky to
                // the *end* of the previous element. So all of the
                // nonsense we have worked on is to keep things
                // together.
                if (!empty($node->is_whitespace)) {
                    $ws_accum[] = $node;
                }
                break;
            }
        }

        if (empty($content)) {
            return false;
        }

        $ret = $initial_ws;
        if ($caption !== false) {
            $ret[] = $caption;
            $ret = array_merge($ret, $after_caption_ws);
        }
        // $cols is always an array (possibly empty), so merge unconditionally.
        $ret = array_merge($ret, $cols);
        if ($thead !== false) {
            $ret[] = $thead;
            $ret = array_merge($ret, $after_thead_ws);
        }
        if ($tfoot !== false) {
            $ret[] = $tfoot;
            $ret = array_merge($ret, $after_tfoot_ws);
        }

        if ($tbody_mode) {
            // we have to shuffle tr into tbody
            $current_tr_tbody = null;

            foreach ($content as $node) {
                // Comment nodes were accumulated into $content by the first
                // pass; keep them next to their rows, exactly as tr mode
                // does, instead of letting them fall through the switch
                // and vanish.
                if ($node instanceof HTMLPurifier_Node_Comment) {
                    if ($current_tr_tbody === null) {
                        $ret[] = $node;
                    } else {
                        $current_tr_tbody->children[] = $node;
                    }
                    continue;
                }
                switch ($node->name) {
                case 'tbody':
                    $current_tr_tbody = null;
                    $ret[] = $node;
                    break;
                case 'tr':
                    if ($current_tr_tbody === null) {
                        $current_tr_tbody = new HTMLPurifier_Node_Element('tbody');
                        $ret[] = $current_tr_tbody;
                    }
                    $current_tr_tbody->children[] = $node;
                    break;
                case '#PCDATA':
                    //assert($node->is_whitespace);
                    if ($current_tr_tbody === null) {
                        $ret[] = $node;
                    } else {
                        $current_tr_tbody->children[] = $node;
                    }
                    break;
                }
            }
        } else {
            $ret = array_merge($ret, $content);
        }

        return $ret;

    }
}



/**
 * Base decorator for definition caches: every cache operation is forwarded
 * unchanged to the wrapped cache object.
 */
class HTMLPurifier_DefinitionCache_Decorator extends HTMLPurifier_DefinitionCache
{

    /**
     * Cache object we are decorating
     * @type HTMLPurifier_DefinitionCache
     */
    public $cache;

    /**
     * The name of the decorator
     * @var string
     */
    public $name;

    public function __construct()
    {
    }

    /**
     * Lazy decorator function
     * @param HTMLPurifier_DefinitionCache $cache Reference to cache object to decorate
     * @return HTMLPurifier_DefinitionCache_Decorator A fresh copy of this
     *         decorator wrapping $cache and sharing its type.
     */
    public function decorate(&$cache)
    {
        $decorator = $this->copy();
        // reference is necessary for mocks in PHP 4
        $decorator->cache =& $cache;
        $decorator->type = $cache->type;
        return $decorator;
    }

    /**
     * Cross-compatible clone substitute
     * @return HTMLPurifier_DefinitionCache_Decorator
     */
    public function copy()
    {
        return new HTMLPurifier_DefinitionCache_Decorator();
    }

    /**
     * @param HTMLPurifier_Definition $def
     * @param HTMLPurifier_Config $config
     * @return mixed
     */
    public function add($def, $config)
    {
        return $this->cache->add($def, $config);
    }

    /**
     * @param HTMLPurifier_Definition $def
     * @param HTMLPurifier_Config $config
     * @return mixed
     */
    public function set($def, $config)
    {
        return $this->cache->set($def, $config);
    }

    /**
     * @param HTMLPurifier_Definition $def
     * @param HTMLPurifier_Config $config
     * @return mixed
     */
    public function replace($def, $config)
    {
        return $this->cache->replace($def, $config);
    }

    /**
     * @param HTMLPurifier_Config $config
     * @return mixed
     */
    public function get($config)
    {
        return $this->cache->get($config);
    }

    /**
     * @param HTMLPurifier_Config $config
     * @return mixed
     */
    public function remove($config)
    {
        return $this->cache->remove($config);
    }

    /**
     * @param HTMLPurifier_Config $config
     * @return mixed
     */
    public function flush($config)
    {
        return $this->cache->flush($config);
    }

    /**
     * @param HTMLPurifier_Config $config
     * @return mixed
     */
    public function cleanup($config)
    {
        return $this->cache->cleanup($config);
    }
}

/**
 * Null cache object to use when no caching is on.
 * Every operation is a no-op that reports failure (false).
 */
class HTMLPurifier_DefinitionCache_Null extends HTMLPurifier_DefinitionCache
{

    /**
     * @param HTMLPurifier_Definition $def
     * @param HTMLPurifier_Config $config
     * @return bool
     */
    public function add($def, $config)
    {
        return false;
    }

    /**
     * @param HTMLPurifier_Definition $def
     * @param HTMLPurifier_Config $config
     * @return bool
     */
    public function set($def, $config)
    {
        return false;
    }

    /**
     * @param HTMLPurifier_Definition $def
     * @param HTMLPurifier_Config $config
     * @return bool
     */
    public function replace($def, $config)
    {
        return false;
    }

    /**
     * @param HTMLPurifier_Config $config
     * @return bool
     */
    public function remove($config)
    {
        return false;
    }

    /**
     * @param HTMLPurifier_Config $config
     * @return bool
     */
    public function get($config)
    {
        return false;
    }

    /**
     * @param HTMLPurifier_Config $config
     * @return bool
     */
    public function flush($config)
    {
        return false;
    }

    /**
     * @param HTMLPurifier_Config $config
     * @return bool
     */
    public function cleanup($config)
    {
        return false;
    }
}



/**
 * Definition cache that stores each definition as a serialized ".ser"
 * file inside a per-type directory.
 */
class HTMLPurifier_DefinitionCache_Serializer extends HTMLPurifier_DefinitionCache
{

    /**
     * Writes the definition only if no cache file exists for it yet.
     *
     * @param HTMLPurifier_Definition $def
     * @param HTMLPurifier_Config $config
     * @return int|bool Bytes written, or false if the definition type is
     *         wrong, the file already exists, or the write failed.
     */
    public function add($def, $config)
    {
        if (!$this->checkDefType($def)) {
            // explicit false, matching the documented int|bool contract
            return false;
        }
        $file = $this->generateFilePath($config);
        if (file_exists($file)) {
            return false;
        }
        if (!$this->_prepareDir($config)) {
            return false;
        }
        return $this->_write($file, serialize($def), $config);
    }

    /**
     * Writes the definition, overwriting any existing cache file.
     *
     * @param HTMLPurifier_Definition $def
     * @param HTMLPurifier_Config $config
     * @return int|bool Bytes written, or false on failure.
     */
    public function set($def, $config)
    {
        if (!$this->checkDefType($def)) {
            // explicit false, matching the documented int|bool contract
            return false;
        }
        $file = $this->generateFilePath($config);
        if (!$this->_prepareDir($config)) {
            return false;
        }
        return $this->_write($file, serialize($def), $config);
    }

    /**
     * Writes the definition only if a cache file already exists for it.
     *
     * @param HTMLPurifier_Definition $def
     * @param HTMLPurifier_Config $config
     * @return int|bool Bytes written, or false if the definition type is
     *         wrong, no file exists yet, or the write failed.
     */
    public function replace($def, $config)
    {
        if (!$this->checkDefType($def)) {
            // explicit false, matching the documented int|bool contract
            return false;
        }
        $file = $this->generateFilePath($config);
        if (!file_exists($file)) {
            return false;
        }
        if (!$this->_prepareDir($config)) {
            return false;
        }
        return $this->_write($file, serialize($def), $config);
    }

    /**
     * Reads the cached value back from disk.
     *
     * @param HTMLPurifier_Config $config
     * @return mixed The unserialized contents of the cache file, or false
     *         if the file does not exist.
     */
    public function get($config)
    {
        $file = $this->generateFilePath($config);
        if (!file_exists($file)) {
            return false;
        }
        // NOTE(review): unserialize() instantiates whatever objects the file
        // describes, so the cache directory must not be writable by
        // untrusted parties.
        return unserialize(file_get_contents($file));
    }

    /**
     * Deletes the cache file for this configuration.
     *
     * @param HTMLPurifier_Config $config
     * @return bool False if the file does not exist, else unlink()'s result.
     */
    public function remove($config)
    {
        $file = $this->generateFilePath($config);
        if (!file_exists($file)) {
            return false;
        }
        return unlink($file);
    }

    /**
     * Deletes every non-dot entry in this type's cache directory.
     *
     * @param HTMLPurifier_Config $config
     * @return bool
     */
    public function flush($config)
    {
        if (!$this->_prepareDir($config)) {
            return false;
        }
        $dir = $this->generateDirectoryPath($config);
        $dh = opendir($dir);
        // Apparently, on some versions of PHP, readdir will return
        // an empty string if you pass an invalid argument to readdir.
        // So you need this test.  See #49.
        if (false === $dh) {
            return false;
        }
        while (false !== ($filename = readdir($dh))) {
            if (empty($filename)) {
                continue;
            }
            if ($filename[0] === '.') {
                continue;
            }
            unlink($dir . '/' . $filename);
        }
        closedir($dh);
        return true;
    }

    /**
     * Deletes the cache files whose key isOld() reports as outdated.
     *
     * @param HTMLPurifier_Config $config
     * @return bool
     */
    public function cleanup($config)
    {
        if (!$this->_prepareDir($config)) {
            return false;
        }
        $dir = $this->generateDirectoryPath($config);
        $dh = opendir($dir);
        // See #49 (and above).
        if (false === $dh) {
            return false;
        }
        while (false !== ($filename = readdir($dh))) {
            if (empty($filename)) {
                continue;
            }
            if ($filename[0] === '.') {
                continue;
            }
            // drop the last four characters (the ".ser" extension added by
            // generateFilePath()) to recover the key
            $key = substr($filename, 0, strlen($filename) - 4);
            if ($this->isOld($key, $config)) {
                unlink($dir . '/' . $filename);
            }
        }
        closedir($dh);
        return true;
    }

    /**
     * Generates the file path to the serial file corresponding to
     * the configuration and definition name
     * @param HTMLPurifier_Config $config
     * @return string
     * @todo Make protected
     */
    public function generateFilePath($config)
    {
        $key = $this->generateKey($config);
        return $this->generateDirectoryPath($config) . '/' . $key . '.ser';
    }

    /**
     * Generates the path to the directory that contains this cache's serial files
     * @param HTMLPurifier_Config $config
     * @return string
     * @note No trailing slash
     * @todo Make protected
     */
    public function generateDirectoryPath($config)
    {
        $base = $this->generateBaseDirectoryPath($config);
        return $base . '/' . $this->type;
    }

    /**
     * Generates path to base directory that contains all definition type
     * serials
     * @param HTMLPurifier_Config $config
     * @return mixed|string %Cache.SerializerPath if set, otherwise the
     *         bundled DefinitionCache/Serializer directory.
     * @todo Make protected
     */
    public function generateBaseDirectoryPath($config)
    {
        $base = $config->get('Cache.SerializerPath');
        $base = is_null($base) ? HTMLPURIFIER_PREFIX . '/HTMLPurifier/DefinitionCache/Serializer' : $base;
        return $base;
    }

    /**
     * Convenience wrapper function for file_put_contents
     * @param string $file File name to write to
     * @param string $data Data to write into file
     * @param HTMLPurifier_Config $config
     * @return int|bool Number of bytes written if success, or false if failure.
     */
    private function _write($file, $data, $config)
    {
        $result = file_put_contents($file, $data);
        if ($result !== false) {
            // set permissions of the new file (no execute)
            $chmod = $config->get('Cache.SerializerPermissions');
            if ($chmod !== null) {
                chmod($file, $chmod & 0666);
            }
        }
        return $result;
    }

    /**
     * Prepares the directory that this type stores the serials in
     * @param HTMLPurifier_Config $config
     * @return bool True if successful
     */
    private function _prepareDir($config)
    {
        $directory = $this->generateDirectoryPath($config);
        $chmod = $config->get('Cache.SerializerPermissions');
        if ($chmod === null) {
            // No explicit permissions configured: just try to create the
            // directory; an already-existing directory counts as success.
            if (!@mkdir($directory) && !is_dir($directory)) {
                trigger_error(
                    'Could not create directory ' . $directory . '',
                    E_USER_WARNING
                );
                return false;
            }
            return true;
        }
        if (!is_dir($directory)) {
            $base = $this->generateBaseDirectoryPath($config);
            if (!is_dir($base)) {
                trigger_error(
                    'Base directory ' . $base . ' does not exist,
                    please create or change using %Cache.SerializerPath',
                    E_USER_WARNING
                );
                return false;
            } elseif (!$this->_testPermissions($base, $chmod)) {
                return false;
            }
            if (!@mkdir($directory, $chmod) && !is_dir($directory)) {
                trigger_error(
                    'Could not create directory ' . $directory . '',
                    E_USER_WARNING
                );
                return false;
            }
            if (!$this->_testPermissions($directory, $chmod)) {
                return false;
            }
        } elseif (!$this->_testPermissions($directory, $chmod)) {
            return false;
        }
        return true;
    }

    /**
     * Tests permissions on a directory and throws out friendly
     * error messages and attempts to chmod it itself if possible
     * @param string $dir Directory path
     * @param int $chmod Permissions
     * @return bool True if directory is writable
     */
    private function _testPermissions($dir, $chmod)
    {
        // early abort, if it is writable, everything is hunky-dory
        if (is_writable($dir)) {
            return true;
        }
        if (!is_dir($dir)) {
            // generally, you'll want to handle this beforehand
            // so a more specific error message can be given
            trigger_error(
                'Directory ' . $dir . ' does not exist',
                E_USER_WARNING
            );
            return false;
        }
        if (function_exists('posix_getuid') && $chmod !== null) {
            // POSIX system, we can give more specific advice
            if (fileowner($dir) === posix_getuid()) {
                // we can chmod it ourselves
                $chmod = $chmod | 0700;
                if (chmod($dir, $chmod)) {
                    return true;
                }
            } elseif (filegroup($dir) === posix_getgid()) {
                $chmod = $chmod | 0070;
            } else {
                // PHP's probably running as nobody, so we'll
                // need to give global permissions
                $chmod = $chmod | 0777;
            }
            trigger_error(
                'Directory ' . $dir . ' not writable, ' .
                'please chmod to ' . decoct($chmod),
                E_USER_WARNING
            );
        } else {
            // generic error message
            trigger_error(
                'Directory ' . $dir . ' not writable, ' .
                'please alter file permissions',
                E_USER_WARNING
            );
        }
        return false;
    }
}



/**
 * Definition cache decorator class that cleans up the cache
 * whenever there is a cache miss.
 */
class HTMLPurifier_DefinitionCache_Decorator_Cleanup extends HTMLPurifier_DefinitionCache_Decorator
{
    /**
     * @type string
     */
    public $name = 'Cleanup';

    /**
     * @return HTMLPurifier_DefinitionCache_Decorator_Cleanup
     */
    public function copy()
    {
        return new HTMLPurifier_DefinitionCache_Decorator_Cleanup();
    }

    /**
     * @param HTMLPurifier_Definition $def
     * @param HTMLPurifier_Config $config
     * @return mixed The wrapped cache's result; a falsy result also
     *         triggers cleanup().
     */
    public function add($def, $config)
    {
        $status = parent::add($def, $config);
        if (!$status) {
            parent::cleanup($config);
        }
        return $status;
    }

    /**
     * @param HTMLPurifier_Definition $def
     * @param HTMLPurifier_Config $config
     * @return mixed The wrapped cache's result; a falsy result also
     *         triggers cleanup().
     */
    public function set($def, $config)
    {
        $status = parent::set($def, $config);
        if (!$status) {
            parent::cleanup($config);
        }
        return $status;
    }

    /**
     * @param HTMLPurifier_Definition $def
     * @param HTMLPurifier_Config $config
     * @return mixed The wrapped cache's result; a falsy result also
     *         triggers cleanup().
     */
    public function replace($def, $config)
    {
        $status = parent::replace($def, $config);
        if (!$status) {
            parent::cleanup($config);
        }
        return $status;
    }

    /**
     * @param HTMLPurifier_Config $config
     * @return mixed The wrapped cache's result; a falsy result also
     *         triggers cleanup().
     */
    public function get($config)
    {
        $ret = parent::get($config);
        if (!$ret) {
            parent::cleanup($config);
        }
        return $ret;
    }
}



/**
 * Definition cache decorator class that saves all cache retrievals
 * to PHP's memory; good for unit tests or circumstances where
 * there are lots of configuration objects floating around.
 */
class HTMLPurifier_DefinitionCache_Decorator_Memory extends HTMLPurifier_DefinitionCache_Decorator
{
    /**
     * In-memory copies, indexed by generateKey($config).
     * @type array
     */
    protected $definitions;

    /**
     * @type string
     */
    public $name = 'Memory';

    /**
     * @return HTMLPurifier_DefinitionCache_Decorator_Memory
     */
    public function copy()
    {
        return new HTMLPurifier_DefinitionCache_Decorator_Memory();
    }

    /**
     * @param HTMLPurifier_Definition $def
     * @param HTMLPurifier_Config $config
     * @return mixed The wrapped cache's result; on a truthy result the
     *         definition is also remembered in memory.
     */
    public function add($def, $config)
    {
        $status = parent::add($def, $config);
        if ($status) {
            $this->definitions[$this->generateKey($config)] = $def;
        }
        return $status;
    }

    /**
     * @param HTMLPurifier_Definition $def
     * @param HTMLPurifier_Config $config
     * @return mixed The wrapped cache's result; on a truthy result the
     *         definition is also remembered in memory.
     */
    public function set($def, $config)
    {
        $status = parent::set($def, $config);
        if ($status) {
            $this->definitions[$this->generateKey($config)] = $def;
        }
        return $status;
    }

    /**
     * @param HTMLPurifier_Definition $def
     * @param HTMLPurifier_Config $config
     * @return mixed The wrapped cache's result; on a truthy result the
     *         definition is also remembered in memory.
     */
    public function replace($def, $config)
    {
        $status = parent::replace($def, $config);
        if ($status) {
            $this->definitions[$this->generateKey($config)] = $def;
        }
        return $status;
    }

    /**
     * Returns the in-memory copy if one is set; otherwise asks the wrapped
     * cache and remembers whatever it returned.
     *
     * @param HTMLPurifier_Config $config
     * @return mixed
     */
    public function get($config)
    {
        $key = $this->generateKey($config);
        if (isset($this->definitions[$key])) {
            return $this->definitions[$key];
        }
        $this->definitions[$key] = parent::get($config);
        return $this->definitions[$key];
    }
}



/**
 * XHTML 1.1 Bi-directional Text Module, defines elements that
 * declare directionality of content. Text Extension Module.
 */
class HTMLPurifier_HTMLModule_Bdo extends HTMLPurifier_HTMLModule
{

    /**
     * @type string
     */
    public $name = 'Bdo';

    /**
     * @type array
     */
    public $attr_collections = array(
        'I18N' => array('dir' => false)
    );

    /**
     * Registers the bdo element and the I18N "dir" attribute.
     *
     * @param HTMLPurifier_Config $config
     */
    public function setup($config)
    {
        $bdo = $this->addElement(
            'bdo',
            'Inline',
            'Inline',
            array('Core', 'Lang'),
            array(
                'dir' => 'Enum#ltr,rtl', // required
                // The Abstract Module specification has the attribute
                // inclusions wrong for bdo: bdo allows Lang
            )
        );
        $bdo->attr_transform_post[] = new HTMLPurifier_AttrTransform_BdoDir();

        $this->attr_collections['I18N']['dir'] = 'Enum#ltr,rtl';
    }
}



/**
 * Module that only declares the shared attribute collections
 * (Core, Lang, I18N and Common).
 */
class HTMLPurifier_HTMLModule_CommonAttributes extends HTMLPurifier_HTMLModule
{
    /**
     * @type string
     */
    public $name = 'CommonAttributes';

    /**
     * @type array
     */
    public $attr_collections = array(
        'Core' => array(
            0 => array('Style'),
            // 'xml:space' => false,
            'class' => 'Class',
            'id' => 'ID',
            'title' => 'CDATA',
        ),
        'Lang' => array(),
        'I18N' => array(
            0 => array('Lang'), // proprietary, for xml:lang/lang
        ),
        'Common' => array(
            0 => array('Core', 'I18N')
        )
    );
}



/**
 * XHTML 1.1 Edit Module, defines editing-related elements.
* Text Extension Module.
*/
class HTMLPurifier_HTMLModule_Edit extends HTMLPurifier_HTMLModule
{

    /**
     * @type string
     */
    public $name = 'Edit';

    /**
     * Registers del and ins with an identical chameleon content model
     * and attribute set.
     *
     * @param HTMLPurifier_Config $config
     */
    public function setup($config)
    {
        // "inline model ! block model"; split apart again in getChildDef()
        $content_model = 'Chameleon: #PCDATA | Inline ! #PCDATA | Flow';
        $attributes = array(
            'cite' => 'URI',
            // 'datetime' => 'Datetime', // not implemented
        );
        foreach (array('del', 'ins') as $element) {
            $this->addElement($element, 'Inline', $content_model, 'Common', $attributes);
        }
    }

    // HTML 4.01 specifies that ins/del must not contain block
    // elements when used in an inline context, chameleon is
    // a complicated workaround to achieve this effect

    // Inline context ! Block context (exclamation mark is
    // separator, see getChildDef for parsing)

    /**
     * @type bool
     */
    public $defines_child_def = true;

    /**
     * Builds the chameleon child definition from a content model of the
     * form "inline ! block".
     *
     * @param HTMLPurifier_ElementDef $def
     * @return HTMLPurifier_ChildDef_Chameleon|bool False when the element's
     *         content model type is not 'chameleon'.
     */
    public function getChildDef($def)
    {
        if ($def->content_model_type != 'chameleon') {
            return false;
        }
        $parts = explode('!', $def->content_model);
        $inline_model = $parts[0];
        $block_model = $parts[1];
        return new HTMLPurifier_ChildDef_Chameleon($inline_model, $block_model);
    }
}



/**
 * XHTML 1.1 Forms module, defines all form-related elements found in HTML 4.
*/
class HTMLPurifier_HTMLModule_Forms extends HTMLPurifier_HTMLModule
{
    /**
     * @type string
     */
    public $name = 'Forms';

    /**
     * @type bool
     */
    public $safe = false;

    /**
     * @type array
     */
    public $content_sets = array(
        'Block' => 'Form',
        'Inline' => 'Formctrl',
    );

    /**
     * Registers the form-related elements together with their content
     * models, attributes, exclusions and attribute transforms.
     *
     * @param HTMLPurifier_Config $config
     */
    public function setup($config)
    {
        // ---- form -------------------------------------------------------
        $form_attr = array(
            'accept' => 'ContentTypes',
            'accept-charset' => 'Charsets',
            'action*' => 'URI',
            'method' => 'Enum#get,post',
            // really ContentType, but these two are the only ones used today
            'enctype' => 'Enum#application/x-www-form-urlencoded,multipart/form-data',
        );
        $form_el = $this->addElement(
            'form',
            'Form',
            'Required: Heading | List | Block | fieldset',
            'Common',
            $form_attr
        );
        $form_el->excludes = array('form' => true);

        // ---- input ------------------------------------------------------
        $input_attr = array(
            'accept' => 'ContentTypes',
            'accesskey' => 'Character',
            'alt' => 'Text',
            'checked' => 'Bool#checked',
            'disabled' => 'Bool#disabled',
            'maxlength' => 'Number',
            'name' => 'CDATA',
            'readonly' => 'Bool#readonly',
            'size' => 'Number',
            'src' => 'URI#embedded',
            'tabindex' => 'Number',
            'type' => 'Enum#text,password,checkbox,button,radio,submit,reset,file,hidden,image',
            'value' => 'CDATA',
        );
        $input_el = $this->addElement('input', 'Formctrl', 'Empty', 'Common', $input_attr);
        $input_el->attr_transform_post[] = new HTMLPurifier_AttrTransform_Input();

        // ---- select -----------------------------------------------------
        $select_attr = array(
            'disabled' => 'Bool#disabled',
            'multiple' => 'Bool#multiple',
            'name' => 'CDATA',
            'size' => 'Number',
            'tabindex' => 'Number',
        );
        $this->addElement('select', 'Formctrl', 'Required: optgroup | option', 'Common', $select_attr);

        // ---- option -----------------------------------------------------
        $option_attr = array(
            'disabled' => 'Bool#disabled',
            'label' => 'Text',
            'selected' => 'Bool#selected',
            'value' => 'CDATA',
        );
        $this->addElement('option', false, 'Optional: #PCDATA', 'Common', $option_attr);
        // It's illegal for there to be more than one selected, but not
        // be multiple. Also, no selected means undefined behavior. This might
        // be difficult to implement; perhaps an injector, or a context variable.

        // ---- textarea ---------------------------------------------------
        $textarea_attr = array(
            'accesskey' => 'Character',
            'cols*' => 'Number',
            'disabled' => 'Bool#disabled',
            'name' => 'CDATA',
            'readonly' => 'Bool#readonly',
            'rows*' => 'Number',
            'tabindex' => 'Number',
        );
        $textarea_el = $this->addElement(
            'textarea',
            'Formctrl',
            'Optional: #PCDATA',
            'Common',
            $textarea_attr
        );
        $textarea_el->attr_transform_pre[] = new HTMLPurifier_AttrTransform_Textarea();

        // ---- button -----------------------------------------------------
        $button_attr = array(
            'accesskey' => 'Character',
            'disabled' => 'Bool#disabled',
            'name' => 'CDATA',
            'tabindex' => 'Number',
            'type' => 'Enum#button,submit,reset',
            'value' => 'CDATA',
        );
        $button_el = $this->addElement(
            'button',
            'Formctrl',
            'Optional: #PCDATA | Heading | List | Block | Inline',
            'Common',
            $button_attr
        );

        // For exclusions, ideally we'd specify content sets, not literal elements
        $button_el->excludes = $this->makeLookup(
            'form',
            'fieldset', // Form
            'input',
            'select',
            'textarea',
            'label',
            'button', // Formctrl
            'a', // as per HTML 4.01 spec, this is omitted by modularization
            'isindex',
            'iframe' // legacy items
        );

        // Extra exclusion: img usemap="" is not permitted within this element.
        // We'll omit this for now, since we don't have any good way of
        // indicating it yet.

        // ---- fieldset ---------------------------------------------------
        // This is HIGHLY user-unfriendly; we need a custom child-def for this
        $this->addElement('fieldset', 'Form', 'Custom: (#WS?,legend,(Flow|#PCDATA)*)', 'Common');

        // ---- label ------------------------------------------------------
        $label_attr = array(
            'accesskey' => 'Character',
            // 'for' => 'IDREF', // IDREF not implemented, cannot allow
        );
        $label_el = $this->addElement(
            'label',
            'Formctrl',
            'Optional: #PCDATA | Inline',
            'Common',
            $label_attr
        );
        $label_el->excludes = array('label' => true);

        // ---- legend -----------------------------------------------------
        $legend_attr = array(
            'accesskey' => 'Character',
        );
        $this->addElement('legend', false, 'Optional: #PCDATA | Inline', 'Common', $legend_attr);

        // ---- optgroup ---------------------------------------------------
        $optgroup_attr = array(
            'disabled' => 'Bool#disabled',
            'label*' => 'Text',
        );
        $this->addElement('optgroup', false, 'Required: option', 'Common', $optgroup_attr);
        // Don't forget an injector for <isindex>. This one's a little complex
        // because it maps to multiple elements.
    }
}



/**
 * XHTML 1.1 Hypertext Module, defines hypertext links. Core Module.
*/
class HTMLPurifier_HTMLModule_Hypertext extends HTMLPurifier_HTMLModule
{

    /**
     * Module identifier.
     * @type string
     */
    public $name = 'Hypertext';

    /**
     * Registers the a element as an inline formatting element that may
     * not contain another a element.
     * @param HTMLPurifier_Config $config
     */
    public function setup($config)
    {
        // Attributes left commented out are not enabled by this module.
        $linkAttr = array(
            // 'accesskey' => 'Character',
            // 'charset' => 'Charset',
            'href' => 'URI',
            // 'hreflang' => 'LanguageCode',
            'rel' => new HTMLPurifier_AttrDef_HTML_LinkTypes('rel'),
            'rev' => new HTMLPurifier_AttrDef_HTML_LinkTypes('rev'),
            // 'tabindex' => 'Number',
            // 'type' => 'ContentType',
        );
        $anchor = $this->addElement('a', 'Inline', 'Inline', 'Common', $linkAttr);
        $anchor->formatting = true;
        $anchor->excludes = array('a' => true);
    }
}





/**
 * XHTML 1.1 Iframe Module provides inline frames.
 *
 * @note This module is not considered safe unless an Iframe
 *       whitelisting mechanism is specified.
Currently, the only
 *       such mechanism is %URL.SafeIframeRegexp
 */
class HTMLPurifier_HTMLModule_Iframe extends HTMLPurifier_HTMLModule
{

    /**
     * Module identifier.
     * @type string
     */
    public $name = 'Iframe';

    /**
     * Not safe by default; setup() flips this when %HTML.SafeIframe is on.
     * @type bool
     */
    public $safe = false;

    /**
     * Marks the module safe if %HTML.SafeIframe is enabled, then registers
     * the iframe element.
     * @param HTMLPurifier_Config $config
     */
    public function setup($config)
    {
        $safeIframe = $config->get('HTML.SafeIframe');
        if ($safeIframe) {
            $this->safe = true;
        }

        $iframeAttr = array(
            'src' => 'URI#embedded',
            'width' => 'Length',
            'height' => 'Length',
            'name' => 'ID',
            'scrolling' => 'Enum#yes,no,auto',
            'frameborder' => 'Enum#0,1',
            'longdesc' => 'URI',
            'marginheight' => 'Pixels',
            'marginwidth' => 'Pixels',
        );
        $this->addElement('iframe', 'Inline', 'Flow', 'Common', $iframeAttr);
    }
}





/**
 * XHTML 1.1 Image Module provides basic image embedding.
 * @note There is specialized code for removing empty images in
 *       HTMLPurifier_Strategy_RemoveForeignElements
 */
class HTMLPurifier_HTMLModule_Image extends HTMLPurifier_HTMLModule
{

    /**
     * Module identifier.
     * @type string
     */
    public $name = 'Image';

    /**
     * Registers the img element. Width and height are limited to pixel
     * values capped by %HTML.MaxImgLength, unless that cap is null or
     * %HTML.Trusted is on, in which case they are plain Lengths.
     * @param HTMLPurifier_Config $config
     */
    public function setup($config)
    {
        $max = $config->get('HTML.MaxImgLength');
        // According to the spec, it's Length, but percents can
        // be abused, so we allow only Pixels.
        $cappedPixels = 'Pixels#' . $max;
        $imgAttr = array(
            'alt*' => 'Text',
            'height' => $cappedPixels,
            'width' => $cappedPixels,
            'longdesc' => 'URI',
            'src*' => new HTMLPurifier_AttrDef_URI(true), // embedded
        );
        $imgDef = $this->addElement('img', 'Inline', 'Empty', 'Common', $imgAttr);

        if ($max === null || $config->get('HTML.Trusted')) {
            $imgDef->attr['width'] = 'Length';
            $imgDef->attr['height'] = 'Length';
        }

        // kind of strange, but splitting things up would be inefficient:
        // the same transform instance runs both before and after validation
        $required = new HTMLPurifier_AttrTransform_ImgRequired();
        $imgDef->attr_transform_post[] = $required;
        $imgDef->attr_transform_pre[] = $required;
    }
}





/**
 * XHTML 1.1 Legacy module defines elements that were previously
 * deprecated.
 *
 * @note Not all legacy elements have been implemented yet, which
 *       is a bit of a reverse problem as compared to browsers! In
 *       addition, this legacy module may implement a bit more than
 *       mandated by XHTML 1.1.
 *
 * This module can be used in combination with TransformToStrict in order
 * to transform as many deprecated elements as possible, but retain
 * questionably deprecated elements that do not have good alternatives
 * as well as transform elements that don't have an implementation.
 * See docs/ref-strictness.txt for more details.
*/

class HTMLPurifier_HTMLModule_Legacy extends HTMLPurifier_HTMLModule
{
    /**
     * Module identifier.
     * @type string
     */
    public $name = 'Legacy';

    /**
     * Registers the legacy-only elements, then adds legacy attributes and
     * looser content models to elements owned by other modules.
     * @param HTMLPurifier_Config $config
     */
    public function setup($config)
    {
        // new elements ---------------------------------------------------

        $this->addElement(
            'basefont',
            'Inline',
            'Empty',
            null,
            array(
                'color' => 'Color',
                'face' => 'Text', // extremely broad, we should
                'size' => 'Text', // tighten it
                'id' => 'ID'
            )
        );
        $this->addElement('center', 'Block', 'Flow', 'Common');

        $compactOnly = array('compact' => 'Bool#compact');
        $this->addElement('dir', 'Block', 'Required: li', 'Common', $compactOnly);
        $this->addElement(
            'font',
            'Inline',
            'Inline',
            array('Core', 'I18N'),
            array(
                'color' => 'Color',
                'face' => 'Text', // extremely broad, we should
                'size' => 'Text', // tighten it
            )
        );
        $this->addElement('menu', 'Block', 'Required: li', 'Common', $compactOnly);

        // inline formatting elements
        foreach (array('s', 'strike', 'u') as $tag) {
            $formattingDef = $this->addElement($tag, 'Inline', 'Inline', 'Common');
            $formattingDef->formatting = true;
        }

        // setup modifications to old elements ----------------------------

        $align = 'Enum#left,right,center,justify';

        // address and blockquote get a looser, optional content model
        $looserContent = array(
            'address' => 'Inline | #PCDATA | p',
            'blockquote' => 'Flow | #PCDATA',
        );
        foreach ($looserContent as $tag => $model) {
            $looseDef = $this->addBlankElement($tag);
            $looseDef->content_model = $model;
            $looseDef->content_model_type = 'optional';
            $looseDef->child = false;
        }

        // th and td receive the same set of legacy attributes
        $cellAttr = array(
            'bgcolor' => 'Color',
            'height' => 'Length',
            'nowrap' => 'Bool#nowrap',
            'width' => 'Length',
        );

        // Extra attributes per element; elements are processed in the
        // order listed here.
        // (script omitted)
        $extraAttr = array(
            'br' => array('clear' => 'Enum#left,all,right,none'),
            'caption' => array('align' => 'Enum#top,bottom,left,right'),
            'div' => array('align' => $align),
            'dl' => array('compact' => 'Bool#compact'),
            'h1' => array('align' => $align),
            'h2' => array('align' => $align),
            'h3' => array('align' => $align),
            'h4' => array('align' => $align),
            'h5' => array('align' => $align),
            'h6' => array('align' => $align),
            'hr' => array(
                'align' => $align,
                'noshade' => 'Bool#noshade',
                'size' => 'Pixels',
                'width' => 'Length',
            ),
            'img' => array(
                'align' => 'IAlign',
                'border' => 'Pixels',
                'hspace' => 'Pixels',
                'vspace' => 'Pixels',
            ),
            // figure out this integer business
            'li' => array(
                'value' => new HTMLPurifier_AttrDef_Integer(),
                'type' => 'Enum#s:1,i,I,a,A,disc,square,circle',
            ),
            'ol' => array(
                'compact' => 'Bool#compact',
                'start' => new HTMLPurifier_AttrDef_Integer(),
                'type' => 'Enum#s:1,i,I,a,A',
            ),
            'p' => array('align' => $align),
            'pre' => array('width' => 'Number'),
            'table' => array(
                'align' => 'Enum#left,center,right',
                'bgcolor' => 'Color',
            ),
            'tr' => array('bgcolor' => 'Color'),
            'th' => $cellAttr,
            'td' => $cellAttr,
            'ul' => array(
                'compact' => 'Bool#compact',
                'type' => 'Enum#square,disc,circle',
            ),
        );
        foreach ($extraAttr as $tag => $attrs) {
            $blankDef = $this->addBlankElement($tag);
            foreach ($attrs as $attrName => $attrDef) {
                $blankDef->attr[$attrName] = $attrDef;
            }
        }

        // "safe" modifications to "unsafe" elements
        // WARNING: If you want to add support for an unsafe, legacy
        // attribute, make a new TrustedLegacy module with the trusted
        // bit set appropriately

        $formDef = $this->addBlankElement('form');
        $formDef->content_model = 'Flow | #PCDATA';
        $formDef->content_model_type = 'optional';
        $formDef->attr['target'] = 'FrameTarget';

        $inputDef = $this->addBlankElement('input');
        $inputDef->attr['align'] = 'IAlign';

        $legendDef = $this->addBlankElement('legend');
        $legendDef->attr['align'] = 'LAlign';
    }
}





/**
 * XHTML 1.1 List Module, defines list-oriented elements. Core Module.
 */
class HTMLPurifier_HTMLModule_List extends HTMLPurifier_HTMLModule
{
    /**
     * Module identifier.
     * @type string
     */
    public $name = 'List';

    // According to the abstract schema, the List content set is a fully formed
    // one or more expr, but it invariably occurs in an optional declaration
    // so we're not going to do that subtlety. It might cause trouble
    // if a user defines "List" and expects that multiple lists are
    // allowed to be specified, but then again, that's not very intuitive.
    // Furthermore, the actual XML Schema may disagree. Regardless,
    // we don't have support for such nested expressions without using
    // the incredibly inefficient and draconic Custom ChildDef.

    /**
     * Adds the List content set to Flow.
     * @type array
     */
    public $content_sets = array('Flow' => 'List');

    /**
     * Registers ol, ul and dl together with their item elements
     * (li, dd, dt).
     * @param HTMLPurifier_Config $config
     */
    public function setup($config)
    {
        // XXX The wrap attribute is handled by MakeWellFormed. This is all
        // quite unsatisfactory, because we generated this
        // *specifically* for lists, and now a big chunk of the handling
        // is done properly by the List ChildDef. So actually, we just
        // want enough information to make autoclosing work properly,
        // and then hand off the tricky stuff to the ChildDef.
        foreach (array('ol', 'ul') as $tag) {
            $listDef = $this->addElement($tag, 'List', new HTMLPurifier_ChildDef_List(), 'Common');
            $listDef->wrap = 'li';
        }
        $this->addElement('dl', 'List', 'Required: dt | dd', 'Common');

        $this->addElement('li', false, 'Flow', 'Common');

        $this->addElement('dd', false, 'Flow', 'Common');
        $this->addElement('dt', false, 'Inline', 'Common');
    }
}





/**
 * Module that adds a name attribute to a fixed list of elements.
 */
class HTMLPurifier_HTMLModule_Name extends HTMLPurifier_HTMLModule
{
    /**
     * Module identifier.
     * @type string
     */
    public $name = 'Name';

    /**
     * Gives each listed element a CDATA name attribute; unless
     * %HTML.Attr.Name.UseCDATA is set, a NameSync post-transform is
     * attached as well.
     * @param HTMLPurifier_Config $config
     */
    public function setup($config)
    {
        $tags = array('a', 'applet', 'form', 'frame', 'iframe', 'img', 'map');
        foreach ($tags as $tag) {
            $def = $this->addBlankElement($tag);
            $def->attr['name'] = 'CDATA';
            if ($config->get('HTML.Attr.Name.UseCDATA')) {
                continue;
            }
            $def->attr_transform_post[] = new HTMLPurifier_AttrTransform_NameSync();
        }
    }
}





/**
 * Module adds the nofollow attribute transformation to a tags.
It
 * is enabled by HTML.Nofollow
 */
class HTMLPurifier_HTMLModule_Nofollow extends HTMLPurifier_HTMLModule
{

    /**
     * Module identifier.
     * @type string
     */
    public $name = 'Nofollow';

    /**
     * Attaches the Nofollow post-transform to the a element.
     * @param HTMLPurifier_Config $config
     */
    public function setup($config)
    {
        $anchor = $this->addBlankElement('a');
        $anchor->attr_transform_post[] = new HTMLPurifier_AttrTransform_Nofollow();
    }
}





/**
 * Module that adds the lang attribute to the Lang attribute collection.
 */
class HTMLPurifier_HTMLModule_NonXMLCommonAttributes extends HTMLPurifier_HTMLModule
{
    /**
     * Module identifier.
     * @type string
     */
    public $name = 'NonXMLCommonAttributes';

    /**
     * Attribute collections contributed by this module.
     * @type array
     */
    public $attr_collections = array(
        'Lang' => array(
            'lang' => 'LanguageCode',
        )
    );
}





/**
 * XHTML 1.1 Object Module, defines elements for generic object inclusion
 * @warning Users will commonly use <embed> to cater to legacy browsers: this
 *      module does not allow this sort of behavior
 */
class HTMLPurifier_HTMLModule_Object extends HTMLPurifier_HTMLModule
{
    /**
     * Module identifier.
     * @type string
     */
    public $name = 'Object';

    /**
     * This module is flagged as not safe.
     * @type bool
     */
    public $safe = false;

    /**
     * Registers the object element and its param child.
     * @param HTMLPurifier_Config $config
     */
    public function setup($config)
    {
        $objectAttr = array(
            'archive' => 'URI',
            'classid' => 'URI',
            'codebase' => 'URI',
            'codetype' => 'Text',
            'data' => 'URI',
            'declare' => 'Bool#declare',
            'height' => 'Length',
            'name' => 'CDATA',
            'standby' => 'Text',
            'tabindex' => 'Number',
            'type' => 'ContentType',
            'width' => 'Length'
        );
        $this->addElement('object', 'Inline', 'Optional: #PCDATA | Flow | param', 'Common', $objectAttr);

        // param takes no attribute collections, only the attributes below
        $paramAttr = array(
            'id' => 'ID',
            'name*' => 'Text',
            'type' => 'Text',
            'value' => 'Text',
            'valuetype' => 'Enum#data,ref,object'
        );
        $this->addElement('param', false, 'Empty', null, $paramAttr);
    }
}





/**
 * XHTML 1.1 Presentation Module, defines simple presentation-related
 * markup. Text Extension Module.
 * @note The official XML Schema and DTD specs further divide this into
 *       two modules:
 *          - Block Presentation (hr)
 *          - Inline Presentation (b, big, i, small, sub, sup, tt)
 *       We have chosen not to heed this distinction, as content_sets
 *       provides satisfactory disambiguation.
 */
class HTMLPurifier_HTMLModule_Presentation extends HTMLPurifier_HTMLModule
{

    /**
     * Module identifier.
     * @type string
     */
    public $name = 'Presentation';

    /**
     * Registers hr, sub and sup, plus the inline elements b, big, i,
     * small and tt, which are additionally marked as formatting elements.
     * @param HTMLPurifier_Config $config
     */
    public function setup($config)
    {
        $this->addElement('hr', 'Block', 'Empty', 'Common');

        foreach (array('sub', 'sup') as $tag) {
            $this->addElement($tag, 'Inline', 'Inline', 'Common');
        }

        foreach (array('b', 'big', 'i', 'small', 'tt') as $tag) {
            $formattingDef = $this->addElement($tag, 'Inline', 'Inline', 'Common');
            $formattingDef->formatting = true;
        }
    }
}





/**
 * Module defines proprietary tags and attributes in HTML.
 * @warning If this module is enabled, standards-compliance is off!
*/
class HTMLPurifier_HTMLModule_Proprietary extends HTMLPurifier_HTMLModule
{
    /**
     * Module identifier.
     * @type string
     */
    public $name = 'Proprietary';

    /**
     * Registers the marquee element.
     * @param HTMLPurifier_Config $config
     */
    public function setup($config)
    {
        $marqueeAttr = array(
            'direction' => 'Enum#left,right,up,down',
            'behavior' => 'Enum#alternate',
            'width' => 'Length',
            'height' => 'Length',
            'scrolldelay' => 'Number',
            'scrollamount' => 'Number',
            'loop' => 'Number',
            'bgcolor' => 'Color',
            'hspace' => 'Pixels',
            'vspace' => 'Pixels',
        );
        $this->addElement('marquee', 'Inline', 'Flow', 'Common', $marqueeAttr);
    }
}





/**
 * XHTML 1.1 Ruby Annotation Module, defines elements that indicate
 * short runs of text alongside base text for annotation or pronunciation.
 */
class HTMLPurifier_HTMLModule_Ruby extends HTMLPurifier_HTMLModule
{

    /**
     * Module identifier.
     * @type string
     */
    public $name = 'Ruby';

    /**
     * Registers ruby and its component elements (rbc, rtc, rb, rt, rp).
     * rb and rt may not contain a nested ruby.
     * @param HTMLPurifier_Config $config
     */
    public function setup($config)
    {
        $this->addElement(
            'ruby',
            'Inline',
            'Custom: ((rb, (rt | (rp, rt, rp))) | (rbc, rtc, rtc?))',
            'Common'
        );
        $this->addElement('rbc', false, 'Required: rb', 'Common');
        $this->addElement('rtc', false, 'Required: rt', 'Common');

        $noNestedRuby = array('ruby' => true);

        $baseDef = $this->addElement('rb', false, 'Inline', 'Common');
        $baseDef->excludes = $noNestedRuby;

        $textDef = $this->addElement('rt', false, 'Inline', 'Common', array('rbspan' => 'Number'));
        $textDef->excludes = $noNestedRuby;

        $this->addElement('rp', false, 'Optional: #PCDATA', 'Common');
    }
}





/**
 * A "safe" embed module. See SafeObject. This is a proprietary element.
*/
class HTMLPurifier_HTMLModule_SafeEmbed extends HTMLPurifier_HTMLModule
{
    /**
     * Module identifier.
     * @type string
     */
    public $name = 'SafeEmbed';

    /**
     * Registers the embed element with a restricted attribute set and
     * attaches the SafeEmbed post-transform. Width and height use the
     * %HTML.MaxImgLength cap.
     * @param HTMLPurifier_Config $config
     */
    public function setup($config)
    {
        $max = $config->get('HTML.MaxImgLength');
        $cappedPixels = 'Pixels#' . $max;

        $embedAttr = array(
            'src*' => 'URI#embedded',
            'type' => 'Enum#application/x-shockwave-flash',
            'width' => $cappedPixels,
            'height' => $cappedPixels,
            'allowscriptaccess' => 'Enum#never',
            'allownetworking' => 'Enum#internal',
            'flashvars' => 'Text',
            'wmode' => 'Enum#window,transparent,opaque',
            'name' => 'ID',
        );
        $embedDef = $this->addElement('embed', 'Inline', 'Empty', 'Common', $embedAttr);
        $embedDef->attr_transform_post[] = new HTMLPurifier_AttrTransform_SafeEmbed();
    }
}





/**
 * A "safe" object module. In theory, objects permitted by this module will
 * be safe, and untrusted users can be allowed to embed arbitrary flash objects
 * (maybe other types too, but only Flash is supported as of right now).
 * Highly experimental.
 */
class HTMLPurifier_HTMLModule_SafeObject extends HTMLPurifier_HTMLModule
{
    /**
     * Module identifier.
     * @type string
     */
    public $name = 'SafeObject';

    /**
     * Registers restricted object and param elements, attaches their
     * post-transforms and requests the SafeObject injector.
     * @param HTMLPurifier_Config $config
     */
    public function setup($config)
    {
        // These definitions are not intrinsically safe: the attribute transforms
        // are a vital part of ensuring safety.

        $max = $config->get('HTML.MaxImgLength');
        $cappedPixels = 'Pixels#' . $max;

        $objectAttr = array(
            // While technically not required by the spec, we're forcing
            // it to this value.
            'type' => 'Enum#application/x-shockwave-flash',
            'width' => $cappedPixels,
            'height' => $cappedPixels,
            'data' => 'URI#embedded',
            'codebase' => new HTMLPurifier_AttrDef_Enum(
                array(
                    'http://download.macromedia.com/pub/shockwave/cabs/flash/swflash.cab#version=6,0,40,0'
                )
            ),
        );
        $objectDef = $this->addElement(
            'object',
            'Inline',
            'Optional: param | Flow | #PCDATA',
            'Common',
            $objectAttr
        );
        $objectDef->attr_transform_post[] = new HTMLPurifier_AttrTransform_SafeObject();

        $paramAttr = array(
            'id' => 'ID',
            'name*' => 'Text',
            'value' => 'Text'
        );
        $paramDef = $this->addElement('param', false, 'Empty', false, $paramAttr);
        $paramDef->attr_transform_post[] = new HTMLPurifier_AttrTransform_SafeParam();

        $this->info_injector[] = 'SafeObject';
    }
}





/**
 * A "safe" script module. No inline JS is allowed, and pointed to JS
 * files must match whitelist.
 */
class HTMLPurifier_HTMLModule_SafeScripting extends HTMLPurifier_HTMLModule
{
    /**
     * Module identifier.
     * @type string
     */
    public $name = 'SafeScripting';

    /**
     * Registers an empty script element whose src must be one of the keys
     * of %HTML.SafeScripting.
     * @param HTMLPurifier_Config $config
     */
    public function setup($config)
    {
        // These definitions are not intrinsically safe: the attribute transforms
        // are a vital part of ensuring safety.

        $allowed = $config->get('HTML.SafeScripting');
        $scriptAttr = array(
            // While technically not required by the spec, we're forcing
            // it to this value.
            'type' => 'Enum#text/javascript',
            'src*' => new HTMLPurifier_AttrDef_Enum(array_keys($allowed))
        );
        $scriptDef = $this->addElement('script', 'Inline', 'Empty', null, $scriptAttr);

        // one transform instance, run both before and after validation
        $required = new HTMLPurifier_AttrTransform_ScriptRequired();
        $scriptDef->attr_transform_post[] = $required;
        $scriptDef->attr_transform_pre[] = $required;
    }
}





/*

WARNING: THIS MODULE IS EXTREMELY DANGEROUS AS IT ENABLES INLINE SCRIPTING
INSIDE HTML PURIFIER DOCUMENTS. USE ONLY WITH TRUSTED USER INPUT!!!
*/

/**
 * XHTML 1.1 Scripting module, defines elements that are used to contain
 * information pertaining to executable scripts or the lack of support
 * for executable scripts.
 * @note The script element is declared here with optional #PCDATA
 *       content, i.e. script bodies are accepted; the module is flagged
 *       as not safe.
 */
class HTMLPurifier_HTMLModule_Scripting extends HTMLPurifier_HTMLModule
{
    /**
     * Module identifier.
     * @type string
     */
    public $name = 'Scripting';

    /**
     * Elements defined by this module.
     * @type array
     */
    public $elements = array('script', 'noscript');

    /**
     * Both elements are added to the Block and the Inline content sets.
     * @type array
     */
    public $content_sets = array('Block' => 'script | noscript', 'Inline' => 'script | noscript');

    /**
     * This module is flagged as not safe.
     * @type bool
     */
    public $safe = false;

    /**
     * Builds the noscript and script element definitions by hand and
     * stores them in $this->info.
     * @param HTMLPurifier_Config $config
     */
    public function setup($config)
    {
        // TODO: create custom child-definition for noscript that
        // auto-wraps stray #PCDATA in a similar manner to
        // blockquote's custom definition (we would use it but
        // blockquote's contents are optional while noscript's contents
        // are required)

        // TODO: convert this to new syntax, main problem is getting
        // both content sets working

        // In theory, this could be safe, but I don't see any reason to
        // allow it.
        $noscriptDef = new HTMLPurifier_ElementDef();
        $noscriptDef->attr = array(0 => array('Common'));
        $noscriptDef->content_model = 'Heading | List | Block';
        $noscriptDef->content_model_type = 'required';
        $this->info['noscript'] = $noscriptDef;

        $scriptDef = new HTMLPurifier_ElementDef();
        $scriptDef->attr = array(
            'defer' => new HTMLPurifier_AttrDef_Enum(array('defer')),
            'src' => new HTMLPurifier_AttrDef_URI(true),
            'type' => new HTMLPurifier_AttrDef_Enum(array('text/javascript'))
        );
        $scriptDef->content_model = '#PCDATA';
        $scriptDef->content_model_type = 'optional';

        // one transform instance, run both before and after validation
        $required = new HTMLPurifier_AttrTransform_ScriptRequired();
        $scriptDef->attr_transform_post[] = $required;
        $scriptDef->attr_transform_pre[] = $required;
        $this->info['script'] = $scriptDef;
    }
}





/**
 * Style Attribute Module, adds the style attribute to the Core attribute
 * collection by way of a dedicated Style collection.
 */
class HTMLPurifier_HTMLModule_StyleAttribute extends HTMLPurifier_HTMLModule
{
    /**
     * Module identifier.
     * @type string
     */
    public $name = 'StyleAttribute';

    /**
     * Attribute collections contributed by this module.
     * @type array
     */
    public $attr_collections = array(
        // The inclusion routine differs from the Abstract Modules but
        // is in line with the DTD and XML Schemas.
        'Style' => array('style' => false), // placeholder, filled in by setup()
        'Core' => array(0 => array('Style'))
    );

    /**
     * Replaces the style placeholder with a CSS attribute definition.
     * @param HTMLPurifier_Config $config
     */
    public function setup($config)
    {
        $cssDef = new HTMLPurifier_AttrDef_CSS();
        $this->attr_collections['Style']['style'] = $cssDef;
    }
}





/**
 * XHTML 1.1 Tables Module, fully defines accessible table elements.
*/
class HTMLPurifier_HTMLModule_Tables extends HTMLPurifier_HTMLModule
{
    /**
     * Module identifier.
     * @type string
     */
    public $name = 'Tables';

    /**
     * Registers caption, table, the cell and row elements, the column
     * elements and the row groups.
     * @param HTMLPurifier_Config $config
     */
    public function setup($config)
    {
        $this->addElement('caption', false, 'Inline', 'Common');

        $tableAttr = array(
            'border' => 'Pixels',
            'cellpadding' => 'Length',
            'cellspacing' => 'Length',
            'frame' => 'Enum#void,above,below,hsides,lhs,rhs,vsides,box,border',
            'rules' => 'Enum#none,groups,rows,cols,all',
            'summary' => 'Text',
            'width' => 'Length'
        );
        $this->addElement('table', 'Block', new HTMLPurifier_ChildDef_Table(), 'Common', $tableAttr);

        // alignment attributes shared by cells, rows, columns and row groups
        $alignAttr = array(
            'align' => 'Enum#left,center,right,justify,char',
            'charoff' => 'Length',
            'valign' => 'Enum#top,middle,bottom,baseline',
        );

        // cells: td and th
        $cellAttr = array_merge(
            array(
                'abbr' => 'Text',
                'colspan' => 'Number',
                'rowspan' => 'Number',
                // Apparently, as of HTML5 this attribute only applies
                // to 'th' elements.
                'scope' => 'Enum#row,col,rowgroup,colgroup',
            ),
            $alignAttr
        );
        foreach (array('td', 'th') as $tag) {
            $this->addElement($tag, false, 'Flow', 'Common', $cellAttr);
        }

        $this->addElement('tr', false, 'Required: td | th', 'Common', $alignAttr);

        // columns: col and colgroup
        $columnAttr = array_merge(
            array(
                'span' => 'Number',
                'width' => 'MultiLength',
            ),
            $alignAttr
        );
        $this->addElement('col', false, 'Empty', 'Common', $columnAttr);
        $this->addElement('colgroup', false, 'Optional: col', 'Common', $columnAttr);

        // row groups
        foreach (array('tbody', 'thead', 'tfoot') as $tag) {
            $this->addElement($tag, false, 'Required: tr', 'Common', $alignAttr);
        }
    }
}





/**
 * XHTML 1.1 Target Module, defines target attribute in link elements.
 */
class HTMLPurifier_HTMLModule_Target extends HTMLPurifier_HTMLModule
{
    /**
     * Module identifier.
     * @type string
     */
    public $name = 'Target';

    /**
     * Sets the attribute list of each listed element (currently only a)
     * to a single target attribute validated as a frame target.
     * @param HTMLPurifier_Config $config
     */
    public function setup($config)
    {
        $tags = array('a');
        foreach ($tags as $tag) {
            $def = $this->addBlankElement($tag);
            $def->attr = array(
                'target' => new HTMLPurifier_AttrDef_HTML_FrameTarget()
            );
        }
    }
}





/**
 * Module adds the target=blank attribute transformation to a tags.
It
 * is enabled by HTML.TargetBlank
 */
class HTMLPurifier_HTMLModule_TargetBlank extends HTMLPurifier_HTMLModule
{
    /**
     * Module identifier.
     * @type string
     */
    public $name = 'TargetBlank';

    /**
     * Attaches the TargetBlank post-transform to the a element.
     * @param HTMLPurifier_Config $config
     */
    public function setup($config)
    {
        $anchor = $this->addBlankElement('a');
        $anchor->attr_transform_post[] = new HTMLPurifier_AttrTransform_TargetBlank();
    }
}





/**
 * Module adds the target-based noopener attribute transformation to a tags.  It
 * is enabled by HTML.TargetNoopener
 */
class HTMLPurifier_HTMLModule_TargetNoopener extends HTMLPurifier_HTMLModule
{
    /**
     * Module identifier.
     * @type string
     */
    public $name = 'TargetNoopener';

    /**
     * Attaches the TargetNoopener post-transform to the a element.
     * @param HTMLPurifier_Config $config
     */
    public function setup($config)
    {
        $anchor = $this->addBlankElement('a');
        $anchor->attr_transform_post[] = new HTMLPurifier_AttrTransform_TargetNoopener();
    }
}



/**
 * Module adds the target-based noreferrer attribute transformation to a tags.  It
 * is enabled by HTML.TargetNoreferrer
 */
class HTMLPurifier_HTMLModule_TargetNoreferrer extends HTMLPurifier_HTMLModule
{
    /**
     * Module identifier.
     * @type string
     */
    public $name = 'TargetNoreferrer';

    /**
     * Attaches the TargetNoreferrer post-transform to the a element.
     * @param HTMLPurifier_Config $config
     */
    public function setup($config)
    {
        $anchor = $this->addBlankElement('a');
        $anchor->attr_transform_post[] = new HTMLPurifier_AttrTransform_TargetNoreferrer();
    }
}



/**
 * XHTML 1.1 Text Module, defines basic text containers. Core Module.
* @note In the normative XML Schema specification, this module
 *       is further abstracted into the following modules:
 *          - Block Phrasal (address, blockquote, pre, h1, h2, h3, h4, h5, h6)
 *          - Block Structural (div, p)
 *          - Inline Phrasal (abbr, acronym, cite, code, dfn, em, kbd, q, samp, strong, var)
 *          - Inline Structural (br, span)
 *       This module, functionally, does not distinguish between these
 *       sub-modules, but the code is internally structured to reflect
 *       these distinctions.
 */
class HTMLPurifier_HTMLModule_Text extends HTMLPurifier_HTMLModule
{
    /**
     * Module identifier.
     * @type string
     */
    public $name = 'Text';

    /**
     * Defines Flow as the union of Heading, Block and Inline.
     * @type array
     */
    public $content_sets = array(
        'Flow' => 'Heading | Block | Inline'
    );

    /**
     * Registers the inline and block text containers, grouped by the
     * sub-modules named in the class comment.
     * @param HTMLPurifier_Config $config
     */
    public function setup($config)
    {
        // Inline Phrasal -------------------------------------------------
        foreach (array('abbr', 'acronym', 'cite', 'dfn', 'kbd') as $tag) {
            $this->addElement($tag, 'Inline', 'Inline', 'Common');
        }
        $this->addElement('q', 'Inline', 'Inline', 'Common', array('cite' => 'URI'));
        foreach (array('samp', 'var') as $tag) {
            $this->addElement($tag, 'Inline', 'Inline', 'Common');
        }

        // these three are additionally marked as formatting elements
        foreach (array('em', 'strong', 'code') as $tag) {
            $formattingDef = $this->addElement($tag, 'Inline', 'Inline', 'Common');
            $formattingDef->formatting = true;
        }

        // Inline Structural ----------------------------------------------
        $this->addElement('span', 'Inline', 'Inline', 'Common');
        $this->addElement('br', 'Inline', 'Empty', 'Core');

        // Block Phrasal --------------------------------------------------
        $this->addElement('address', 'Block', 'Inline', 'Common');
        $this->addElement('blockquote', 'Block', 'Optional: Heading | Block | List', 'Common', array('cite' => 'URI'));

        $preDef = $this->addElement('pre', 'Block', 'Inline', 'Common');
        $preDef->excludes = $this->makeLookup(
            'img',
            'big',
            'small',
            'object',
            'applet',
            'font',
            'basefont'
        );

        foreach (array('h1', 'h2', 'h3', 'h4', 'h5', 'h6') as $heading) {
            $this->addElement($heading, 'Heading', 'Inline', 'Common');
        }

        // Block Structural -----------------------------------------------
        $paragraphDef = $this->addElement('p', 'Block', 'Inline', 'Common');
        $closers = array("address", "blockquote", "center", "dir", "div", "dl", "fieldset", "ol", "p", "ul");
        $paragraphDef->autoclose = array_flip($closers);

        $this->addElement('div', 'Block', 'Flow', 'Common');
    }
}





/**
 * Abstract class for a set of proprietary modules that clean up (tidy)
 * poorly written HTML.
 * @todo Figure out how to protect some of these methods/properties
 */
class HTMLPurifier_HTMLModule_Tidy extends HTMLPurifier_HTMLModule
{
    /**
     * List of supported levels.
     * Index zero is a special case "no fixes" level.
     * @type array
     */
    public $levels = array(0 => 'none', 'light', 'medium', 'heavy');

    /**
     * Default level to place all fixes in.
     * Disabled by default.
17440 * @type string 17441 */ 17442 public $defaultLevel = null; 17443 17444 /** 17445 * Lists of fixes used by getFixesForLevel(). 17446 * Format is: 17447 * HTMLModule_Tidy->fixesForLevel[$level] = array('fix-1', 'fix-2'); 17448 * @type array 17449 */ 17450 public $fixesForLevel = array( 17451 'light' => array(), 17452 'medium' => array(), 17453 'heavy' => array() 17454 ); 17455 17456 /** 17457 * Lazy load constructs the module by determining the necessary 17458 * fixes to create and then delegating to the populate() function. 17459 * @param HTMLPurifier_Config $config 17460 * @todo Wildcard matching and error reporting when an added or 17461 * subtracted fix has no effect. 17462 */ 17463 public function setup($config) 17464 { 17465 // create fixes, initialize fixesForLevel 17466 $fixes = $this->makeFixes(); 17467 $this->makeFixesForLevel($fixes); 17468 17469 // figure out which fixes to use 17470 $level = $config->get('HTML.TidyLevel'); 17471 $fixes_lookup = $this->getFixesForLevel($level); 17472 17473 // get custom fix declarations: these need namespace processing 17474 $add_fixes = $config->get('HTML.TidyAdd'); 17475 $remove_fixes = $config->get('HTML.TidyRemove'); 17476 17477 foreach ($fixes as $name => $fix) { 17478 // needs to be refactored a little to implement globbing 17479 if (isset($remove_fixes[$name]) || 17480 (!isset($add_fixes[$name]) && !isset($fixes_lookup[$name]))) { 17481 unset($fixes[$name]); 17482 } 17483 } 17484 17485 // populate this module with necessary fixes 17486 $this->populate($fixes); 17487 } 17488 17489 /** 17490 * Retrieves all fixes per a level, returning fixes for that specific 17491 * level as well as all levels below it. 
17492 * @param string $level level identifier, see $levels for valid values 17493 * @return array Lookup up table of fixes 17494 */ 17495 public function getFixesForLevel($level) 17496 { 17497 if ($level == $this->levels[0]) { 17498 return array(); 17499 } 17500 $activated_levels = array(); 17501 for ($i = 1, $c = count($this->levels); $i < $c; $i++) { 17502 $activated_levels[] = $this->levels[$i]; 17503 if ($this->levels[$i] == $level) { 17504 break; 17505 } 17506 } 17507 if ($i == $c) { 17508 trigger_error( 17509 'Tidy level ' . htmlspecialchars($level) . ' not recognized', 17510 E_USER_WARNING 17511 ); 17512 return array(); 17513 } 17514 $ret = array(); 17515 foreach ($activated_levels as $level) { 17516 foreach ($this->fixesForLevel[$level] as $fix) { 17517 $ret[$fix] = true; 17518 } 17519 } 17520 return $ret; 17521 } 17522 17523 /** 17524 * Dynamically populates the $fixesForLevel member variable using 17525 * the fixes array. It may be custom overloaded, used in conjunction 17526 * with $defaultLevel, or not used at all. 17527 * @param array $fixes 17528 */ 17529 public function makeFixesForLevel($fixes) 17530 { 17531 if (!isset($this->defaultLevel)) { 17532 return; 17533 } 17534 if (!isset($this->fixesForLevel[$this->defaultLevel])) { 17535 trigger_error( 17536 'Default level ' . $this->defaultLevel . 
' does not exist', 17537 E_USER_ERROR 17538 ); 17539 return; 17540 } 17541 $this->fixesForLevel[$this->defaultLevel] = array_keys($fixes); 17542 } 17543 17544 /** 17545 * Populates the module with transforms and other special-case code 17546 * based on a list of fixes passed to it 17547 * @param array $fixes Lookup table of fixes to activate 17548 */ 17549 public function populate($fixes) 17550 { 17551 foreach ($fixes as $name => $fix) { 17552 // determine what the fix is for 17553 list($type, $params) = $this->getFixType($name); 17554 switch ($type) { 17555 case 'attr_transform_pre': 17556 case 'attr_transform_post': 17557 $attr = $params['attr']; 17558 if (isset($params['element'])) { 17559 $element = $params['element']; 17560 if (empty($this->info[$element])) { 17561 $e = $this->addBlankElement($element); 17562 } else { 17563 $e = $this->info[$element]; 17564 } 17565 } else { 17566 $type = "info_$type"; 17567 $e = $this; 17568 } 17569 // PHP does some weird parsing when I do 17570 // $e->$type[$attr], so I have to assign a ref. 
17571 $f =& $e->$type; 17572 $f[$attr] = $fix; 17573 break; 17574 case 'tag_transform': 17575 $this->info_tag_transform[$params['element']] = $fix; 17576 break; 17577 case 'child': 17578 case 'content_model_type': 17579 $element = $params['element']; 17580 if (empty($this->info[$element])) { 17581 $e = $this->addBlankElement($element); 17582 } else { 17583 $e = $this->info[$element]; 17584 } 17585 $e->$type = $fix; 17586 break; 17587 default: 17588 trigger_error("Fix type $type not supported", E_USER_ERROR); 17589 break; 17590 } 17591 } 17592 } 17593 17594 /** 17595 * Parses a fix name and determines what kind of fix it is, as well 17596 * as other information defined by the fix 17597 * @param $name String name of fix 17598 * @return array(string $fix_type, array $fix_parameters) 17599 * @note $fix_parameters is type dependant, see populate() for usage 17600 * of these parameters 17601 */ 17602 public function getFixType($name) 17603 { 17604 // parse it 17605 $property = $attr = null; 17606 if (strpos($name, '#') !== false) { 17607 list($name, $property) = explode('#', $name); 17608 } 17609 if (strpos($name, '@') !== false) { 17610 list($name, $attr) = explode('@', $name); 17611 } 17612 17613 // figure out the parameters 17614 $params = array(); 17615 if ($name !== '') { 17616 $params['element'] = $name; 17617 } 17618 if (!is_null($attr)) { 17619 $params['attr'] = $attr; 17620 } 17621 17622 // special case: attribute transform 17623 if (!is_null($attr)) { 17624 if (is_null($property)) { 17625 $property = 'pre'; 17626 } 17627 $type = 'attr_transform_' . $property; 17628 return array($type, $params); 17629 } 17630 17631 // special case: tag transform 17632 if (is_null($property)) { 17633 return array('tag_transform', $params); 17634 } 17635 17636 return array($property, $params); 17637 17638 } 17639 17640 /** 17641 * Defines all fixes the module will perform in a compact 17642 * associative array of fix name to fix implementation. 
17643 * @return array 17644 */ 17645 public function makeFixes() 17646 { 17647 } 17648} 17649 17650 17651 17652 17653 17654class HTMLPurifier_HTMLModule_XMLCommonAttributes extends HTMLPurifier_HTMLModule 17655{ 17656 /** 17657 * @type string 17658 */ 17659 public $name = 'XMLCommonAttributes'; 17660 17661 /** 17662 * @type array 17663 */ 17664 public $attr_collections = array( 17665 'Lang' => array( 17666 'xml:lang' => 'LanguageCode', 17667 ) 17668 ); 17669} 17670 17671 17672 17673 17674 17675/** 17676 * Name is deprecated, but allowed in strict doctypes, so onl 17677 */ 17678class HTMLPurifier_HTMLModule_Tidy_Name extends HTMLPurifier_HTMLModule_Tidy 17679{ 17680 /** 17681 * @type string 17682 */ 17683 public $name = 'Tidy_Name'; 17684 17685 /** 17686 * @type string 17687 */ 17688 public $defaultLevel = 'heavy'; 17689 17690 /** 17691 * @return array 17692 */ 17693 public function makeFixes() 17694 { 17695 $r = array(); 17696 // @name for img, a ----------------------------------------------- 17697 // Technically, it's allowed even on strict, so we allow authors to use 17698 // it. However, it's deprecated in future versions of XHTML. 
17699 $r['img@name'] = 17700 $r['a@name'] = new HTMLPurifier_AttrTransform_Name(); 17701 return $r; 17702 } 17703} 17704 17705 17706 17707 17708 17709class HTMLPurifier_HTMLModule_Tidy_Proprietary extends HTMLPurifier_HTMLModule_Tidy 17710{ 17711 17712 /** 17713 * @type string 17714 */ 17715 public $name = 'Tidy_Proprietary'; 17716 17717 /** 17718 * @type string 17719 */ 17720 public $defaultLevel = 'light'; 17721 17722 /** 17723 * @return array 17724 */ 17725 public function makeFixes() 17726 { 17727 $r = array(); 17728 $r['table@background'] = new HTMLPurifier_AttrTransform_Background(); 17729 $r['td@background'] = new HTMLPurifier_AttrTransform_Background(); 17730 $r['th@background'] = new HTMLPurifier_AttrTransform_Background(); 17731 $r['tr@background'] = new HTMLPurifier_AttrTransform_Background(); 17732 $r['thead@background'] = new HTMLPurifier_AttrTransform_Background(); 17733 $r['tfoot@background'] = new HTMLPurifier_AttrTransform_Background(); 17734 $r['tbody@background'] = new HTMLPurifier_AttrTransform_Background(); 17735 $r['table@height'] = new HTMLPurifier_AttrTransform_Length('height'); 17736 return $r; 17737 } 17738} 17739 17740 17741 17742 17743 17744class HTMLPurifier_HTMLModule_Tidy_XHTMLAndHTML4 extends HTMLPurifier_HTMLModule_Tidy 17745{ 17746 17747 /** 17748 * @return array 17749 */ 17750 public function makeFixes() 17751 { 17752 $r = array(); 17753 17754 // == deprecated tag transforms =================================== 17755 17756 $r['font'] = new HTMLPurifier_TagTransform_Font(); 17757 $r['menu'] = new HTMLPurifier_TagTransform_Simple('ul'); 17758 $r['dir'] = new HTMLPurifier_TagTransform_Simple('ul'); 17759 $r['center'] = new HTMLPurifier_TagTransform_Simple('div', 'text-align:center;'); 17760 $r['u'] = new HTMLPurifier_TagTransform_Simple('span', 'text-decoration:underline;'); 17761 $r['s'] = new HTMLPurifier_TagTransform_Simple('span', 'text-decoration:line-through;'); 17762 $r['strike'] = new HTMLPurifier_TagTransform_Simple('span', 
'text-decoration:line-through;'); 17763 17764 // == deprecated attribute transforms ============================= 17765 17766 $r['caption@align'] = 17767 new HTMLPurifier_AttrTransform_EnumToCSS( 17768 'align', 17769 array( 17770 // we're following IE's behavior, not Firefox's, due 17771 // to the fact that no one supports caption-side:right, 17772 // W3C included (with CSS 2.1). This is a slightly 17773 // unreasonable attribute! 17774 'left' => 'text-align:left;', 17775 'right' => 'text-align:right;', 17776 'top' => 'caption-side:top;', 17777 'bottom' => 'caption-side:bottom;' // not supported by IE 17778 ) 17779 ); 17780 17781 // @align for img ------------------------------------------------- 17782 $r['img@align'] = 17783 new HTMLPurifier_AttrTransform_EnumToCSS( 17784 'align', 17785 array( 17786 'left' => 'float:left;', 17787 'right' => 'float:right;', 17788 'top' => 'vertical-align:top;', 17789 'middle' => 'vertical-align:middle;', 17790 'bottom' => 'vertical-align:baseline;', 17791 ) 17792 ); 17793 17794 // @align for table ----------------------------------------------- 17795 $r['table@align'] = 17796 new HTMLPurifier_AttrTransform_EnumToCSS( 17797 'align', 17798 array( 17799 'left' => 'float:left;', 17800 'center' => 'margin-left:auto;margin-right:auto;', 17801 'right' => 'float:right;' 17802 ) 17803 ); 17804 17805 // @align for hr ----------------------------------------------- 17806 $r['hr@align'] = 17807 new HTMLPurifier_AttrTransform_EnumToCSS( 17808 'align', 17809 array( 17810 // we use both text-align and margin because these work 17811 // for different browsers (IE and Firefox, respectively) 17812 // and the melange makes for a pretty cross-compatible 17813 // solution 17814 'left' => 'margin-left:0;margin-right:auto;text-align:left;', 17815 'center' => 'margin-left:auto;margin-right:auto;text-align:center;', 17816 'right' => 'margin-left:auto;margin-right:0;text-align:right;' 17817 ) 17818 ); 17819 17820 // @align for h1, h2, h3, h4, h5, h6, p, div 
---------------------- 17821 // {{{ 17822 $align_lookup = array(); 17823 $align_values = array('left', 'right', 'center', 'justify'); 17824 foreach ($align_values as $v) { 17825 $align_lookup[$v] = "text-align:$v;"; 17826 } 17827 // }}} 17828 $r['h1@align'] = 17829 $r['h2@align'] = 17830 $r['h3@align'] = 17831 $r['h4@align'] = 17832 $r['h5@align'] = 17833 $r['h6@align'] = 17834 $r['p@align'] = 17835 $r['div@align'] = 17836 new HTMLPurifier_AttrTransform_EnumToCSS('align', $align_lookup); 17837 17838 // @bgcolor for table, tr, td, th --------------------------------- 17839 $r['table@bgcolor'] = 17840 $r['td@bgcolor'] = 17841 $r['th@bgcolor'] = 17842 new HTMLPurifier_AttrTransform_BgColor(); 17843 17844 // @border for img ------------------------------------------------ 17845 $r['img@border'] = new HTMLPurifier_AttrTransform_Border(); 17846 17847 // @clear for br -------------------------------------------------- 17848 $r['br@clear'] = 17849 new HTMLPurifier_AttrTransform_EnumToCSS( 17850 'clear', 17851 array( 17852 'left' => 'clear:left;', 17853 'right' => 'clear:right;', 17854 'all' => 'clear:both;', 17855 'none' => 'clear:none;', 17856 ) 17857 ); 17858 17859 // @height for td, th --------------------------------------------- 17860 $r['td@height'] = 17861 $r['th@height'] = 17862 new HTMLPurifier_AttrTransform_Length('height'); 17863 17864 // @hspace for img ------------------------------------------------ 17865 $r['img@hspace'] = new HTMLPurifier_AttrTransform_ImgSpace('hspace'); 17866 17867 // @noshade for hr ------------------------------------------------ 17868 // this transformation is not precise but often good enough. 
17869 // different browsers use different styles to designate noshade 17870 $r['hr@noshade'] = 17871 new HTMLPurifier_AttrTransform_BoolToCSS( 17872 'noshade', 17873 'color:#808080;background-color:#808080;border:0;' 17874 ); 17875 17876 // @nowrap for td, th --------------------------------------------- 17877 $r['td@nowrap'] = 17878 $r['th@nowrap'] = 17879 new HTMLPurifier_AttrTransform_BoolToCSS( 17880 'nowrap', 17881 'white-space:nowrap;' 17882 ); 17883 17884 // @size for hr -------------------------------------------------- 17885 $r['hr@size'] = new HTMLPurifier_AttrTransform_Length('size', 'height'); 17886 17887 // @type for li, ol, ul ------------------------------------------- 17888 // {{{ 17889 $ul_types = array( 17890 'disc' => 'list-style-type:disc;', 17891 'square' => 'list-style-type:square;', 17892 'circle' => 'list-style-type:circle;' 17893 ); 17894 $ol_types = array( 17895 '1' => 'list-style-type:decimal;', 17896 'i' => 'list-style-type:lower-roman;', 17897 'I' => 'list-style-type:upper-roman;', 17898 'a' => 'list-style-type:lower-alpha;', 17899 'A' => 'list-style-type:upper-alpha;' 17900 ); 17901 $li_types = $ul_types + $ol_types; 17902 // }}} 17903 17904 $r['ul@type'] = new HTMLPurifier_AttrTransform_EnumToCSS('type', $ul_types); 17905 $r['ol@type'] = new HTMLPurifier_AttrTransform_EnumToCSS('type', $ol_types, true); 17906 $r['li@type'] = new HTMLPurifier_AttrTransform_EnumToCSS('type', $li_types, true); 17907 17908 // @vspace for img ------------------------------------------------ 17909 $r['img@vspace'] = new HTMLPurifier_AttrTransform_ImgSpace('vspace'); 17910 17911 // @width for hr, td, th ------------------------------------------ 17912 $r['td@width'] = 17913 $r['th@width'] = 17914 $r['hr@width'] = new HTMLPurifier_AttrTransform_Length('width'); 17915 17916 return $r; 17917 } 17918} 17919 17920 17921 17922 17923 17924class HTMLPurifier_HTMLModule_Tidy_Strict extends HTMLPurifier_HTMLModule_Tidy_XHTMLAndHTML4 17925{ 17926 /** 17927 * @type 
string 17928 */ 17929 public $name = 'Tidy_Strict'; 17930 17931 /** 17932 * @type string 17933 */ 17934 public $defaultLevel = 'light'; 17935 17936 /** 17937 * @return array 17938 */ 17939 public function makeFixes() 17940 { 17941 $r = parent::makeFixes(); 17942 $r['blockquote#content_model_type'] = 'strictblockquote'; 17943 return $r; 17944 } 17945 17946 /** 17947 * @type bool 17948 */ 17949 public $defines_child_def = true; 17950 17951 /** 17952 * @param HTMLPurifier_ElementDef $def 17953 * @return HTMLPurifier_ChildDef_StrictBlockquote 17954 */ 17955 public function getChildDef($def) 17956 { 17957 if ($def->content_model_type != 'strictblockquote') { 17958 return parent::getChildDef($def); 17959 } 17960 return new HTMLPurifier_ChildDef_StrictBlockquote($def->content_model); 17961 } 17962} 17963 17964 17965 17966 17967 17968class HTMLPurifier_HTMLModule_Tidy_Transitional extends HTMLPurifier_HTMLModule_Tidy_XHTMLAndHTML4 17969{ 17970 /** 17971 * @type string 17972 */ 17973 public $name = 'Tidy_Transitional'; 17974 17975 /** 17976 * @type string 17977 */ 17978 public $defaultLevel = 'heavy'; 17979} 17980 17981 17982 17983 17984 17985class HTMLPurifier_HTMLModule_Tidy_XHTML extends HTMLPurifier_HTMLModule_Tidy 17986{ 17987 /** 17988 * @type string 17989 */ 17990 public $name = 'Tidy_XHTML'; 17991 17992 /** 17993 * @type string 17994 */ 17995 public $defaultLevel = 'medium'; 17996 17997 /** 17998 * @return array 17999 */ 18000 public function makeFixes() 18001 { 18002 $r = array(); 18003 $r['@lang'] = new HTMLPurifier_AttrTransform_Lang(); 18004 return $r; 18005 } 18006} 18007 18008 18009 18010 18011 18012/** 18013 * Injector that auto paragraphs text in the root node based on 18014 * double-spacing. 18015 * @todo Ensure all states are unit tested, including variations as well. 18016 * @todo Make a graph of the flow control for this Injector. 
 */
class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
{
    /**
     * Name of this injector.
     * @type string
     */
    public $name = 'AutoParagraph';

    /**
     * Elements this injector needs (it emits <p> tokens).
     * @type array
     */
    public $needed = array('p');

    /**
     * Creates a <p> start token whose 'MakeWellFormed_TagClosedError'
     * armor flag is set.
     * @return HTMLPurifier_Token_Start
     */
    private function _pStart()
    {
        $par = new HTMLPurifier_Token_Start('p');
        $par->armor['MakeWellFormed_TagClosedError'] = true;
        return $par;
    }

    /**
     * Handles a text token. Depending on the state described in the inline
     * comments, $token is left alone or replaced (by reference) with an
     * array of tokens that adds paragraph start/end tokens around the text.
     * @param HTMLPurifier_Token_Text $token
     */
    public function handleText(&$token)
    {
        $text = $token->data;
        // Does the current parent allow <p> tags?
        if ($this->allowsElement('p')) {
            if (empty($this->currentNesting) || strpos($text, "\n\n") !== false) {
                // Note that we have differing behavior when dealing with text
                // in the anonymous root node, or a node inside the document.
                // If the text has a double-newline, the treatment is the same;
                // if it doesn't, see the next if-block if you're in the document.

                $i = $nesting = null;
                if (!$this->forwardUntilEndToken($i, $current, $nesting) && $token->is_whitespace) {
                    // State 1.1: ...    ^ (whitespace, then document end)
                    //               ----
                    // This is a degenerate case
                } else {
                    if (!$token->is_whitespace || $this->_isInline($current)) {
                        // State 1.2: PAR1
                        //            ----

                        // State 1.3: PAR1\n\nPAR2
                        //            ------------

                        // State 1.4: <div>PAR1\n\nPAR2 (see State 2)
                        //                 ------------
                        $token = array($this->_pStart());
                        $this->_splitText($text, $token);
                    } else {
                        // State 1.5: \n<hr />
                        //            --
                    }
                }
            } else {
                // State 2:   <div>PAR1... (similar to 1.4)
                //                 ----

                // We're in an element that allows paragraph tags, but we're not
                // sure if we're going to need them.
                if ($this->_pLookAhead()) {
                    // State 2.1: <div>PAR1<b>PAR1\n\nPAR2
                    //                 ----
                    // Note: This will always be the first child, since any
                    // previous inline element would have triggered this very
                    // same routine, and found the double newline. One possible
                    // exception would be a comment.
                    $token = array($this->_pStart(), $token);
                } else {
                    // State 2.2.1: <div>PAR1<div>
                    //                   ----

                    // State 2.2.2: <div>PAR1<b>PAR1</b></div>
                    //                   ----
                }
            }
            // Is the current parent a <p> tag?
        } elseif (!empty($this->currentNesting) &&
            $this->currentNesting[count($this->currentNesting) - 1]->name == 'p') {
            // State 3.1: ...<p>PAR1
            //                  ----

            // State 3.2: ...<p>PAR1\n\nPAR2
            //                  ------------
            // Start from an empty result: _splitText() treats an empty
            // result as "already inside a paragraph".
            $token = array();
            $this->_splitText($text, $token);
            // Abort!
        } else {
            // State 4.1: ...<b>PAR1
            //                  ----

            // State 4.2: ...<b>PAR1\n\nPAR2
            //                  ------------
        }
    }

    /**
     * Handles an element token. Depending on the state described in the
     * inline comments, $token is left alone or replaced (by reference)
     * with an array that prepends a <p> start token and/or a "\n\n" text
     * token.
     * @param HTMLPurifier_Token $token
     */
    public function handleElement(&$token)
    {
        // We don't have to check if we're already in a <p> tag for block
        // tokens, because the tag would have been autoclosed by MakeWellFormed.
        if ($this->allowsElement('p')) {
            if (!empty($this->currentNesting)) {
                if ($this->_isInline($token)) {
                    // State 1: <div>...<b>
                    //                  ---
                    // Check if this token is adjacent to the parent token
                    // (seek backwards until token isn't whitespace)
                    $i = null;
                    $this->backward($i, $prev);

                    if (!$prev instanceof HTMLPurifier_Token_Start) {
                        // Token wasn't adjacent
                        if ($prev instanceof HTMLPurifier_Token_Text &&
                            substr($prev->data, -2) === "\n\n"
                        ) {
                            // State 1.1.4: <div><p>PAR1</p>\n\n<b>
                            //                                  ---
                            // Quite frankly, this should be handled by splitText
                            $token = array($this->_pStart(), $token);
                        } else {
                            // State 1.1.1: <div><p>PAR1</p><b>
                            //                              ---
                            // State 1.1.2: <div><br /><b>
                            //                         ---
                            // State 1.1.3: <div>PAR<b>
                            //                      ---
                        }
                    } else {
                        // State 1.2.1: <div><b>
                        //                   ---
                        // Lookahead to see if <p> is needed.
                        if ($this->_pLookAhead()) {
                            // State 1.3.1: <div><b>PAR1\n\nPAR2
                            //                   ---
                            $token = array($this->_pStart(), $token);
                        } else {
                            // State 1.3.2: <div><b>PAR1</b></div>
                            //                   ---

                            // State 1.3.3: <div><b>PAR1</b><div></div>\n\n</div>
                            //                   ---
                        }
                    }
                } else {
                    // State 2.3: ...<div>
                    //               -----
                }
            } else {
                if ($this->_isInline($token)) {
                    // State 3.1: <b>
                    //            ---
                    // This is where the {p} tag is inserted, not reflected in
                    // inputTokens yet, however.
                    $token = array($this->_pStart(), $token);
                } else {
                    // State 3.2: <div>
                    //            -----
                }

                $i = null;
                if ($this->backward($i, $prev)) {
                    if (!$prev instanceof HTMLPurifier_Token_Text) {
                        // State 3.1.1: ...</p>{p}<b>
                        //                        ---
                        // State 3.2.1: ...</p><div>
                        //                     -----
                        // $token may already be an array (State 3.1 above);
                        // normalize so the "\n\n" text can be prepended.
                        if (!is_array($token)) {
                            $token = array($token);
                        }
                        array_unshift($token, new HTMLPurifier_Token_Text("\n\n"));
                    } else {
                        // State 3.1.2: ...</p>\n\n{p}<b>
                        //                            ---
                        // State 3.2.2: ...</p>\n\n<div>
                        //                         -----
                        // Note: PAR<ELEM> cannot occur because PAR would have been
                        // wrapped in <p> tags.
                    }
                }
            }
        } else {
            // State 2.2: <ul><li>
            //                ----
            // State 2.4: <p><b>
            //               ---
        }
    }

    /**
     * Splits up a text in paragraph tokens and appends them
     * to the result stream that will replace the original
     * @param string $data String text data that will be processed
     *        into paragraphs
     * @param HTMLPurifier_Token[] $result Reference to array of tokens that the
     *        tags will be appended onto
     */
    private function _splitText($data, &$result)
    {
        $raw_paragraphs = explode("\n\n", $data);
        $paragraphs = array(); // without empty paragraphs
        $needs_start = false;
        $needs_end = false;

        $c = count($raw_paragraphs);
        if ($c == 1) {
            // There were no double-newlines, abort quickly. In theory this
            // should never happen.
            $result[] = new HTMLPurifier_Token_Text($data);
            return;
        }
        for ($i = 0; $i < $c; $i++) {
            $par = $raw_paragraphs[$i];
            if (trim($par) !== '') {
                $paragraphs[] = $par;
            } else {
                if ($i == 0) {
                    // Double newline at the front
                    if (empty($result)) {
                        // The empty result indicates that the AutoParagraph
                        // injector did not add any start paragraph tokens.
                        // This means that we have been in a paragraph for
                        // a while, and the newline means we should start a new one.
                        $result[] = new HTMLPurifier_Token_End('p');
                        $result[] = new HTMLPurifier_Token_Text("\n\n");
                        // However, the start token should only be added if
                        // there is more processing to be done (i.e. there are
                        // real paragraphs in here). If there are none, the
                        // next start paragraph tag will be handled by the
                        // next call to the injector
                        $needs_start = true;
                    } else {
                        // We just started a new paragraph!
                        // Reinstate a double-newline for presentation's sake, since
                        // it was in the source code.
                        array_unshift($result, new HTMLPurifier_Token_Text("\n\n"));
                    }
                } elseif ($i + 1 == $c) {
                    // Double newline at the end
                    // There should be a trailing </p> when we're finally done.
                    $needs_end = true;
                }
            }
        }

        // Check if this was just a giant blob of whitespace. Move this earlier,
        // perhaps?
        if (empty($paragraphs)) {
            return;
        }

        // Add the start tag indicated by \n\n at the beginning of $data
        if ($needs_start) {
            $result[] = $this->_pStart();
        }

        // Append the paragraphs onto the result
        foreach ($paragraphs as $par) {
            $result[] = new HTMLPurifier_Token_Text($par);
            $result[] = new HTMLPurifier_Token_End('p');
            $result[] = new HTMLPurifier_Token_Text("\n\n");
            $result[] = $this->_pStart();
        }

        // Remove trailing start token; Injector will handle this later if
        // it was indeed needed. This prevents from needing to do a lookahead,
        // at the cost of a lookbehind later.
        array_pop($result);

        // If there is no need for an end tag, remove all of it and let
        // MakeWellFormed close it later.
        if (!$needs_end) {
            array_pop($result); // removes \n\n
            array_pop($result); // removes </p>
        }
    }

    /**
     * Returns true if passed token is inline (and, ergo, allowed in
     * paragraph tags), i.e. its name is among the child elements of the
     * 'p' definition.
     * @param HTMLPurifier_Token $token
     * @return bool
     */
    private function _isInline($token)
    {
        return isset($this->htmlDefinition->info['p']->child->elements[$token->name]);
    }

    /**
     * Looks ahead in the token list and determines whether or not we need
     * to insert a <p> tag. Returns false when no scanned token gives a
     * definite answer.
     * @return bool
     */
    private function _pLookAhead()
    {
        // Initial nesting is 1 when the current token is itself a start tag.
        if ($this->currentToken instanceof HTMLPurifier_Token_Start) {
            $nesting = 1;
        } else {
            $nesting = 0;
        }
        $ok = false;
        $i = null;
        while ($this->forwardUntilEndToken($i, $current, $nesting)) {
            $result = $this->_checkNeedsP($current);
            // null means "undecided, keep scanning".
            if ($result !== null) {
                $ok = $result;
                break;
            }
        }
        return $ok;
    }

    /**
     * Determines if a particular token requires an earlier inline token
     * to get a paragraph. This should be used with _forwardUntilEndToken
     * @param HTMLPurifier_Token $current
     * @return bool|null true if a paragraph is needed (text containing a
     *         double newline), false if not (non-inline start tag), null
     *         if this token does not decide either way
     */
    private function _checkNeedsP($current)
    {
        if ($current instanceof HTMLPurifier_Token_Start) {
            if (!$this->_isInline($current)) {
                // <div>PAR1<div>
                //      ----
                // Terminate early, since we hit a block element
                return false;
            }
        } elseif ($current instanceof HTMLPurifier_Token_Text) {
            if (strpos($current->data, "\n\n") !== false) {
                // <div>PAR1<b>PAR1\n\nPAR2
                //      ----
                return true;
            } else {
                // <div>PAR1<b>PAR1...
                //      ----
            }
        }
        return null;
    }
}





/**
 * Injector that displays the URL of an anchor instead of linking to it, in addition to showing the text of the link.
18371 */ 18372class HTMLPurifier_Injector_DisplayLinkURI extends HTMLPurifier_Injector 18373{ 18374 /** 18375 * @type string 18376 */ 18377 public $name = 'DisplayLinkURI'; 18378 18379 /** 18380 * @type array 18381 */ 18382 public $needed = array('a'); 18383 18384 /** 18385 * @param $token 18386 */ 18387 public function handleElement(&$token) 18388 { 18389 } 18390 18391 /** 18392 * @param HTMLPurifier_Token $token 18393 */ 18394 public function handleEnd(&$token) 18395 { 18396 if (isset($token->start->attr['href'])) { 18397 $url = $token->start->attr['href']; 18398 unset($token->start->attr['href']); 18399 $token = array($token, new HTMLPurifier_Token_Text(" ($url)")); 18400 } else { 18401 // nothing to display 18402 } 18403 } 18404} 18405 18406 18407 18408 18409 18410/** 18411 * Injector that converts http, https and ftp text URLs to actual links. 18412 */ 18413class HTMLPurifier_Injector_Linkify extends HTMLPurifier_Injector 18414{ 18415 /** 18416 * @type string 18417 */ 18418 public $name = 'Linkify'; 18419 18420 /** 18421 * @type array 18422 */ 18423 public $needed = array('a' => array('href')); 18424 18425 /** 18426 * @param HTMLPurifier_Token $token 18427 */ 18428 public function handleText(&$token) 18429 { 18430 if (!$this->allowsElement('a')) { 18431 return; 18432 } 18433 18434 if (strpos($token->data, '://') === false) { 18435 // our really quick heuristic failed, abort 18436 // this may not work so well if we want to match things like 18437 // "google.com", but then again, most people don't 18438 return; 18439 } 18440 18441 // there is/are URL(s). Let's split the string. 18442 // We use this regex: 18443 // https://gist.github.com/gruber/249502 18444 // but with @cscott's backtracking fix and also 18445 // the Unicode characters un-Unicodified. 
18446 $bits = preg_split( 18447 '/\\b((?:[a-z][\\w\\-]+:(?:\\/{1,3}|[a-z0-9%])|www\\d{0,3}[.]|[a-z0-9.\\-]+[.][a-z]{2,4}\\/)(?:[^\\s()<>]|\\((?:[^\\s()<>]|(?:\\([^\\s()<>]+\\)))*\\))+(?:\\((?:[^\\s()<>]|(?:\\([^\\s()<>]+\\)))*\\)|[^\\s`!()\\[\\]{};:\'".,<>?\x{00ab}\x{00bb}\x{201c}\x{201d}\x{2018}\x{2019}]))/iu', 18448 $token->data, -1, PREG_SPLIT_DELIM_CAPTURE); 18449 18450 18451 $token = array(); 18452 18453 // $i = index 18454 // $c = count 18455 // $l = is link 18456 for ($i = 0, $c = count($bits), $l = false; $i < $c; $i++, $l = !$l) { 18457 if (!$l) { 18458 if ($bits[$i] === '') { 18459 continue; 18460 } 18461 $token[] = new HTMLPurifier_Token_Text($bits[$i]); 18462 } else { 18463 $token[] = new HTMLPurifier_Token_Start('a', array('href' => $bits[$i])); 18464 $token[] = new HTMLPurifier_Token_Text($bits[$i]); 18465 $token[] = new HTMLPurifier_Token_End('a'); 18466 } 18467 } 18468 } 18469} 18470 18471 18472 18473 18474 18475/** 18476 * Injector that converts configuration directive syntax %Namespace.Directive 18477 * to links 18478 */ 18479class HTMLPurifier_Injector_PurifierLinkify extends HTMLPurifier_Injector 18480{ 18481 /** 18482 * @type string 18483 */ 18484 public $name = 'PurifierLinkify'; 18485 18486 /** 18487 * @type string 18488 */ 18489 public $docURL; 18490 18491 /** 18492 * @type array 18493 */ 18494 public $needed = array('a' => array('href')); 18495 18496 /** 18497 * @param HTMLPurifier_Config $config 18498 * @param HTMLPurifier_Context $context 18499 * @return string 18500 */ 18501 public function prepare($config, $context) 18502 { 18503 $this->docURL = $config->get('AutoFormat.PurifierLinkify.DocURL'); 18504 return parent::prepare($config, $context); 18505 } 18506 18507 /** 18508 * @param HTMLPurifier_Token $token 18509 */ 18510 public function handleText(&$token) 18511 { 18512 if (!$this->allowsElement('a')) { 18513 return; 18514 } 18515 if (strpos($token->data, '%') === false) { 18516 return; 18517 } 18518 18519 $bits = 
/**
 * Injector that looks at each start tag and, when nothing but ignorable
 * text separates it from its own end tag (or from the end of the input),
 * replaces the token with an integer count.
 * NOTE(review): the integer is presumably interpreted by the caller as
 * "delete this many tokens" - confirm against HTMLPurifier_Injector.
 */
class HTMLPurifier_Injector_RemoveEmpty extends HTMLPurifier_Injector
{
    /**
     * @type HTMLPurifier_Context
     */
    private $context;

    /**
     * @type HTMLPurifier_Config
     */
    private $config;

    /**
     * @type HTMLPurifier_AttrValidator
     */
    private $attrValidator;

    /**
     * Cached value of %AutoFormat.RemoveEmpty.RemoveNbsp
     * @type bool
     */
    private $removeNbsp;

    /**
     * Cached value of %AutoFormat.RemoveEmpty.RemoveNbsp.Exceptions;
     * looked up by element name.
     * @type bool
     */
    private $removeNbspExceptions;

    /**
     * Cached contents of %AutoFormat.RemoveEmpty.Predicate, with every
     * entry normalized to an array of attribute names.
     * @type array
     */
    private $exclude;

    /**
     * Caches the configuration values this injector needs and creates the
     * attribute validator.
     *
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return void
     */
    public function prepare($config, $context)
    {
        parent::prepare($config, $context);
        $this->config = $config;
        $this->context = $context;
        $this->removeNbsp = $config->get('AutoFormat.RemoveEmpty.RemoveNbsp');
        $this->removeNbspExceptions = $config->get('AutoFormat.RemoveEmpty.RemoveNbsp.Exceptions');

        $predicate = $config->get('AutoFormat.RemoveEmpty.Predicate');
        foreach ($predicate as $element => $required) {
            if (is_array($required)) {
                continue;
            }
            // HACK, see HTMLPurifier/Printer/ConfigForm.php
            $predicate[$element] = explode(';', $required);
        }
        $this->exclude = $predicate;

        $this->attrValidator = new HTMLPurifier_AttrValidator();
    }

    /**
     * Inspects a start tag and, if it turns out to be empty, swaps the
     * token for an integer count and rewinds the stream.
     *
     * @param HTMLPurifier_Token $token
     */
    public function handleElement(&$token)
    {
        if (!$token instanceof HTMLPurifier_Token_Start) {
            return;
        }

        // Walk the upcoming tokens (stored back-to-front), stepping over
        // ignorable text. $deleted starts at 1 for the current tag and
        // grows by one for each token stepped over.
        $next = false;
        $deleted = 1;
        $position = count($this->inputZipper->back) - 1;
        while ($position >= 0) {
            $next = $this->inputZipper->back[$position];
            if (!$this->isIgnorableText($next, $token->name)) {
                break;
            }
            $position--;
            $deleted++;
        }

        $closesImmediately = $next instanceof HTMLPurifier_Token_End && $next->name == $token->name;
        if ($next && !$closesImmediately) {
            return;
        }

        // Attributes decide whether the element may stay, so validate
        // them now and mark the token so it is not validated again.
        $this->attrValidator->validateToken($token, $this->config, $this->context);
        $token->armor['ValidateAttributes'] = true;

        if ($this->satisfiesPredicate($token)) {
            return;
        }
        if (isset($token->attr['id']) || isset($token->attr['name'])) {
            return;
        }

        $token = $deleted + 1;

        // Count the whitespace-only text tokens at the start of the
        // already-processed list.
        $total = count($this->inputZipper->front);
        $leading = 0;
        while ($leading < $total) {
            $prev = $this->inputZipper->front[$leading];
            if (!($prev instanceof HTMLPurifier_Token_Text && $prev->is_whitespace)) {
                break;
            }
            $leading++;
        }
        // This is safe because we removed the token that triggered this.
        $this->rewindOffset($leading + $deleted);
        return;
    }

    /**
     * Tells whether a lookahead token is text that does not count as
     * content: pure whitespace, or - when nbsp removal is enabled and the
     * element is not listed as an exception - text made only of
     * whitespace and non-breaking spaces.
     *
     * @param mixed $candidate Lookahead token
     * @param string $elementName Name of the start tag being examined
     * @return bool
     */
    private function isIgnorableText($candidate, $elementName)
    {
        if (!$candidate instanceof HTMLPurifier_Token_Text) {
            return false;
        }
        if ($candidate->is_whitespace) {
            return true;
        }
        if (!$this->removeNbsp || isset($this->removeNbspExceptions[$elementName])) {
            return false;
        }
        $plain = str_replace("\xC2\xA0", "", $candidate->data);
        return $plain === '' || ctype_space($plain);
    }

    /**
     * Tells whether the predicate configured for this element holds, i.e.
     * the element has an entry and every attribute listed in it is set.
     *
     * @param HTMLPurifier_Token $token
     * @return bool
     */
    private function satisfiesPredicate($token)
    {
        if (!isset($this->exclude[$token->name])) {
            return false;
        }
        foreach ($this->exclude[$token->name] as $attrName) {
            if (!isset($token->attr[$attrName])) {
                return false;
            }
        }
        return true;
    }
}
/**
 * Adds important param elements to inside of object in order to make
 * things safe.
 */
class HTMLPurifier_Injector_SafeObject extends HTMLPurifier_Injector
{
    /**
     * @type string
     */
    public $name = 'SafeObject';

    /**
     * @type array
     */
    public $needed = array('object', 'param');

    /**
     * Stack of the <object> start tokens currently open.
     * @type array
     */
    protected $objectStack = array();

    /**
     * Parallel to $objectStack: per object, the names of enforced params
     * that have already been recorded.
     * @type array
     */
    protected $paramStack = array();

    /**
     * Params injected right after every <object> start tag.
     * Keep this synchronized with AttrTransform/SafeParam.php.
     * @type array
     */
    protected $addParam = array(
        'allowScriptAccess' => 'never',
        'allowNetworking' => 'internal',
    );

    /**
     * Param names that are passed through untouched.
     * These are all lower-case keys.
     * @type array
     */
    protected $allowedParam = array(
        'wmode' => true,
        'movie' => true,
        'flashvars' => true,
        'src' => true,
        'allowfullscreen' => true, // if omitted, assume to be 'false'
    );

    /**
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return void
     */
    public function prepare($config, $context)
    {
        // NOTE(review): the parent's return value is discarded here -
        // confirm that is intended.
        parent::prepare($config, $context);
    }

    /**
     * For <object>: pushes the token on the stacks and appends one
     * <param> per $addParam entry directly after it.
     * For <param>: keeps it only when it sits directly inside an <object>,
     * has a name, and that name is allowed; otherwise the token is
     * replaced with false.
     *
     * @param HTMLPurifier_Token $token
     */
    public function handleElement(&$token)
    {
        if ($token->name == 'object') {
            $this->objectStack[] = $token;
            $this->paramStack[] = array();
            $new = array($token);
            foreach ($this->addParam as $name => $value) {
                $new[] = new HTMLPurifier_Token_Empty('param', array('name' => $name, 'value' => $value));
            }
            $token = $new;
        } elseif ($token->name == 'param') {
            $nest = count($this->currentNesting) - 1;
            if ($nest >= 0 && $this->currentNesting[$nest]->name === 'object') {
                $i = count($this->objectStack) - 1;
                if (!isset($token->attr['name'])) {
                    $token = false;
                    return;
                }
                $n = $token->attr['name'];
                // We need this fix because YouTube doesn't supply a data
                // attribute, which we need if a type is specified. This is
                // *very* Flash specific.
                // Only copy when the param actually carries a value: a
                // valueless <param name="movie"> used to raise an
                // undefined-index notice and store null as the data
                // attribute.
                if (!isset($this->objectStack[$i]->attr['data']) &&
                    ($n == 'movie' || $n == 'src') &&
                    isset($token->attr['value'])
                ) {
                    $this->objectStack[$i]->attr['data'] = $token->attr['value'];
                }
                // Check if the parameter is the correct value but has not
                // already been added
                // NOTE(review): this compares the param *name* with the
                // enforced *value*; with the $addParam table above no name
                // equals its value, so this branch never fires and
                // user-supplied copies of the enforced params are dropped
                // by the final else. Comparing attr['value'] instead would
                // let duplicates of the injected params through - confirm
                // the intent before changing it.
                if (!isset($this->paramStack[$i][$n]) &&
                    isset($this->addParam[$n]) &&
                    $token->attr['name'] === $this->addParam[$n]) {
                    // keep token, and add to param stack
                    $this->paramStack[$i][$n] = true;
                } elseif (isset($this->allowedParam[strtolower($n)])) {
                    // keep token, don't do anything to it
                    // (could possibly check for duplicates here)
                    // Note: In principle, parameters should be case sensitive.
                    // But it seems they are not really; so accept any case.
                } else {
                    $token = false;
                }
            } else {
                // not directly inside an object, DENY!
                $token = false;
            }
        }
    }

    /**
     * Pops the object/param stacks when an <object> is closed.
     *
     * @param HTMLPurifier_Token $token
     */
    public function handleEnd(&$token)
    {
        // This is the WRONG way of handling the object and param stacks;
        // we should be inserting them directly on the relevant object tokens
        // so that the global stack handling handles it.
        if ($token->name == 'object') {
            array_pop($this->objectStack);
            array_pop($this->paramStack);
        }
    }
}
/**
 * Parser that uses PHP 5's DOM extension (part of the core).
 *
 * In PHP 5, the DOM XML extension was revamped into DOM and added to the core.
 * It gives us a forgiving HTML parser, which we use to transform the HTML
 * into a DOM, and then into the tokens. It is blazingly fast (for large
 * documents, it performs twenty times faster than
 * HTMLPurifier_Lexer_DirectLex), and is the default choice for PHP 5.
 *
 * @note Any empty elements will have empty tokens associated with them, even if
 * this is prohibited by the spec. This cannot be fixed until the spec
 * comes into play.
 *
 * @note PHP's DOM extension does not actually parse any entities, we use
 * our own function to do that.
 *
 * @warning DOM tends to drop whitespace, which may wreak havoc on indenting.
 * If this is a huge problem, due to the fact that HTML is hand
 * edited and you are unable to get a parser cache that caches
 * the output of HTML Purifier while keeping the original HTML lying
 * around, you may want to run Tidy on the resulting output or use
 * HTMLPurifier_DirectLex
 */

class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
{

    /**
     * @type HTMLPurifier_TokenFactory
     */
    private $factory;

    public function __construct()
    {
        // setup the factory
        parent::__construct();
        $this->factory = new HTMLPurifier_TokenFactory();
    }

    /**
     * Tokenizes an HTML fragment by wrapping it in a full document,
     * loading it with DOMDocument and walking the resulting tree.
     *
     * @param string $html
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return HTMLPurifier_Token[]
     */
    public function tokenizeHTML($html, $config, $context)
    {
        $html = $this->normalize($html, $config, $context);

        // attempt to armor stray angled brackets that cannot possibly
        // form tags and thus are probably being used as emoticons
        if ($config->get('Core.AggressivelyFixLt')) {
            $char = '[^a-z!\/]';
            $comment = "/<!--(.*?)(-->|\z)/is";
            // 1. protect ampersands inside comments so step 3 can tell
            //    our own escapes from pre-existing ones
            $html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html);
            // 2. escape every '<' that is not followed by a letter, '!' or
            //    '/'; repeated until stable because matches can overlap
            //    (e.g. "<<<")
            do {
                $old = $html;
                $html = preg_replace("/<($char)/i", '&lt;\\1', $html);
            } while ($html !== $old);
            // 3. undo both escapes inside comments
            $html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html); // fix comments
        }

        // preprocess html, essential for UTF-8
        $html = $this->wrapHTML($html, $config, $context);

        $doc = new DOMDocument();
        $doc->encoding = 'UTF-8'; // theoretically, the above has this covered

        // libxml reports malformed markup through PHP errors; silence
        // them for the duration of the load only
        set_error_handler(array($this, 'muteErrorHandler'));
        $doc->loadHTML($html);
        restore_error_handler();

        $body = $doc->getElementsByTagName('html')->item(0)-> // <html>
                      getElementsByTagName('body')->item(0);  // <body>

        $div = $body->getElementsByTagName('div')->item(0); // <div>
        $tokens = array();
        $this->tokenizeDOM($div, $tokens, $config);
        // If the div has a sibling, that means we tripped across
        // a premature </div> tag.  So remove the div we parsed,
        // and then tokenize the rest of body.  We can't tokenize
        // the sibling directly as we'll lose the tags in that case.
        if ($div->nextSibling) {
            $body->removeChild($div);
            $this->tokenizeDOM($body, $tokens, $config);
        }
        return $tokens;
    }

    /**
     * Iterative function that tokenizes a node, putting it into an accumulator.
     * To iterate is human, to recurse divine - L. Peter Deutsch
     * @param DOMNode $node DOMNode to be tokenized.
     * @param HTMLPurifier_Token[] $tokens Array-list of already tokenized tokens;
     *        new tokens are appended to it by reference.
     * @param HTMLPurifier_Config $config
     * @return void
     */
    protected function tokenizeDOM($node, &$tokens, $config)
    {
        $level = 0;
        // one FIFO queue of pending nodes per nesting level
        $nodes = array($level => new HTMLPurifier_Queue(array($node)));
        // per level, the nodes that still need an end token
        $closingNodes = array();
        do {
            while (!$nodes[$level]->isEmpty()) {
                $node = $nodes[$level]->shift(); // FIFO
                // level 0 is the wrapper node itself, which gets no tokens
                $collect = $level > 0;
                $needEndingTag = $this->createStartNode($node, $tokens, $collect, $config);
                if ($needEndingTag) {
                    $closingNodes[$level][] = $node;
                }
                if ($node->childNodes && $node->childNodes->length) {
                    $level++;
                    $nodes[$level] = new HTMLPurifier_Queue();
                    foreach ($node->childNodes as $childNode) {
                        $nodes[$level]->push($childNode);
                    }
                }
            }
            $level--;
            if ($level && isset($closingNodes[$level])) {
                while ($node = array_pop($closingNodes[$level])) {
                    $this->createEndNode($node, $tokens);
                }
            }
        } while ($level > 0);
    }

    /**
     * Portably retrieve the tag name of a node; deals with older versions
     * of libxml like 2.7.6
     * @param DOMNode $node
     * @return string|null Null when none of the candidate properties exists.
     */
    protected function getTagName($node)
    {
        if (property_exists($node, 'tagName')) {
            return $node->tagName;
        } else if (property_exists($node, 'nodeName')) {
            return $node->nodeName;
        } else if (property_exists($node, 'localName')) {
            return $node->localName;
        }
        return null;
    }

    /**
     * Portably retrieve the data of a node; deals with older versions
     * of libxml like 2.7.6
     * @param DOMNode $node
     * @return string|null Null when none of the candidate properties exists.
     */
    protected function getData($node)
    {
        if (property_exists($node, 'data')) {
            return $node->data;
        } else if (property_exists($node, 'nodeValue')) {
            return $node->nodeValue;
        } else if (property_exists($node, 'textContent')) {
            return $node->textContent;
        }
        return null;
    }


    /**
     * Appends the token(s) that open a node: text, comment, empty or
     * start token depending on the node type.
     *
     * @param DOMNode $node DOMNode to be tokenized.
     * @param HTMLPurifier_Token[] $tokens Array-list of already tokenized tokens.
     * @param bool $collect  Says whether or start and close are collected, set to
     *                    false at first recursion because it's the implicit DIV
     *                    tag you're dealing with.
     * @param HTMLPurifier_Config $config
     * @return bool if the token needs an endtoken
     * @todo data and tagName properties don't seem to exist in DOMNode?
     */
    protected function createStartNode($node, &$tokens, $collect, $config)
    {
        // intercept non element nodes. WE MUST catch all of them,
        // but we're not getting the character reference nodes because
        // those should have been preprocessed
        if ($node->nodeType === XML_TEXT_NODE) {
            $data = $this->getData($node); // Handle variable data property
            if ($data !== null) {
                $tokens[] = $this->factory->createText($data);
            }
            return false;
        } elseif ($node->nodeType === XML_CDATA_SECTION_NODE) {
            // undo libxml's special treatment of <script> and <style> tags
            $last = end($tokens);
            $data = $node->data;
            // (note $node->tagname is already normalized)
            if ($last instanceof HTMLPurifier_Token_Start && ($last->name == 'script' || $last->name == 'style')) {
                // strip a surrounding <!-- ... --> wrapper
                $new_data = trim($data);
                if (substr($new_data, 0, 4) === '<!--') {
                    $data = substr($new_data, 4);
                    if (substr($data, -3) === '-->') {
                        $data = substr($data, 0, -3);
                    } else {
                        // Highly suspicious! Not sure what to do...
                    }
                }
            }
            $tokens[] = $this->factory->createText($this->parseText($data, $config));
            return false;
        } elseif ($node->nodeType === XML_COMMENT_NODE) {
            // this is code is only invoked for comments in script/style in versions
            // of libxml pre-2.6.28 (regular comments, of course, are still
            // handled regularly)
            $tokens[] = $this->factory->createComment($node->data);
            return false;
        } elseif ($node->nodeType !== XML_ELEMENT_NODE) {
            // not-well tested: there may be other nodes we have to grab
            return false;
        }
        $attr = $node->hasAttributes() ? $this->transformAttrToAssoc($node->attributes) : array();
        $tag_name = $this->getTagName($node); // Handle variable tagName property
        if (empty($tag_name)) {
            return (bool) $node->childNodes->length;
        }
        // We still have to make sure that the element actually IS empty
        if (!$node->childNodes->length) {
            if ($collect) {
                $tokens[] = $this->factory->createEmpty($tag_name, $attr);
            }
            return false;
        } else {
            if ($collect) {
                $tokens[] = $this->factory->createStart($tag_name, $attr);
            }
            return true;
        }
    }

    /**
     * Appends the end token for an element node.
     *
     * @param DOMNode $node
     * @param HTMLPurifier_Token[] $tokens
     */
    protected function createEndNode($node, &$tokens)
    {
        $tag_name = $this->getTagName($node); // Handle variable tagName property
        $tokens[] = $this->factory->createEnd($tag_name);
    }

    /**
     * Converts a DOMNamedNodeMap of DOMAttr objects into an assoc array.
     *
     * @param DOMNamedNodeMap $node_map DOMNamedNodeMap of DOMAttr objects.
     * @return array Associative array of attributes.
     */
    protected function transformAttrToAssoc($node_map)
    {
        // NamedNodeMap is documented very well, so we're using undocumented
        // features, namely, the fact that it implements Iterator and
        // has a ->length attribute
        if ($node_map->length === 0) {
            return array();
        }
        $array = array();
        foreach ($node_map as $attr) {
            $array[$attr->name] = $attr->value;
        }
        return $array;
    }

    /**
     * An error handler that mutes all errors
     * @param int $errno
     * @param string $errstr
     */
    public function muteErrorHandler($errno, $errstr)
    {
    }

    /**
     * Callback function for undoing escaping of stray angled brackets
     * in comments
     * @param array $matches Regex match: [1] comment body, [2] terminator
     * @return string
     */
    public function callbackUndoCommentSubst($matches)
    {
        // strtr() with an array never re-scans replaced text, so an
        // armored "&amp;lt;" comes back as the literal "&lt;" it was
        return '<!--' . strtr($matches[1], array('&amp;' => '&', '&lt;' => '<')) . $matches[2];
    }

    /**
     * Callback function that entity-izes ampersands in comments so that
     * callbackUndoCommentSubst doesn't clobber them
     * @param array $matches Regex match: [1] comment body, [2] terminator
     * @return string
     */
    public function callbackArmorCommentEntities($matches)
    {
        return '<!--' . str_replace('&', '&amp;', $matches[1]) . $matches[2];
    }

    /**
     * Wraps an HTML fragment in the necessary HTML
     * @param string $html
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @param bool $use_div Whether to enclose the fragment in a <div>
     * @return string
     */
    protected function wrapHTML($html, $config, $context, $use_div = true)
    {
        $def = $config->getDefinition('HTML');
        $ret = '';

        if (!empty($def->doctype->dtdPublic) || !empty($def->doctype->dtdSystem)) {
            $ret .= '<!DOCTYPE html ';
            if (!empty($def->doctype->dtdPublic)) {
                $ret .= 'PUBLIC "' . $def->doctype->dtdPublic . '" ';
            }
            if (!empty($def->doctype->dtdSystem)) {
                $ret .= '"' . $def->doctype->dtdSystem . '" ';
            }
            $ret .= '>';
        }

        $ret .= '<html><head>';
        $ret .= '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';
        // No protection if $html contains a stray </div>!
        $ret .= '</head><body>';
        if ($use_div) $ret .= '<div>';
        $ret .= $html;
        if ($use_div) $ret .= '</div>';
        $ret .= '</body></html>';
        return $ret;
    }
}
/**
 * Tokenizes an HTML string with a hand-written scanner: the input is
 * walked from left to right, alternating between "text" mode and
 * "inside a tag" mode, and a token is appended for every text run,
 * comment, start/empty tag and end tag found.
 *
 * While running, 'CurrentLine' and 'CurrentCol' are registered in the
 * context (by reference) and destroyed again before returning.
 *
 * @param String $html
 * @param HTMLPurifier_Config $config
 * @param HTMLPurifier_Context $context
 * @return array|HTMLPurifier_Token[]
 */
public function tokenizeHTML($html, $config, $context)
{
    // special normalization for script tags without any armor
    // our "armor" heurstic is a < sign any number of whitespaces after
    // the first script tag
    if ($config->get('HTML.Trusted')) {
        $html = preg_replace_callback(
            '#(<script[^>]*>)(\s*[^<].+?)(</script>)#si',
            array($this, 'scriptCallback'),
            $html
        );
    }

    $html = $this->normalize($html, $config, $context);

    $cursor = 0; // our location in the text
    $inside_tag = false; // whether or not we're parsing the inside of a tag
    $array = array(); // result array

    // This is also treated to mean maintain *column* numbers too
    $maintain_line_numbers = $config->get('Core.MaintainLineNumbers');

    if ($maintain_line_numbers === null) {
        // automatically determine line numbering by checking
        // if error collection is on
        $maintain_line_numbers = $config->get('Core.CollectErrors');
    }

    if ($maintain_line_numbers) {
        $current_line = 1;
        $current_col = 0;
        $length = strlen($html);
    } else {
        $current_line = false;
        $current_col = false;
        $length = false;
    }
    $context->register('CurrentLine', $current_line);
    $context->register('CurrentCol', $current_col);
    $nl = "\n";
    // how often to manually recalculate. This will ALWAYS be right,
    // but it's pretty wasteful. Set to 0 to turn off
    $synchronize_interval = $config->get('Core.DirectLexLineNumberSyncInterval');

    $e = false;
    if ($config->get('Core.CollectErrors')) {
        $e =& $context->get('ErrorCollector');
    }

    // for testing synchronization
    $loops = 0;

    while (++$loops) {
        // $cursor is either at the start of a token, or inside of
        // a tag (i.e. there was a < immediately before it), as indicated
        // by $inside_tag

        if ($maintain_line_numbers) {
            // $rcursor, however, is always at the start of a token.
            $rcursor = $cursor - (int)$inside_tag;

            // Column number is cheap, so we calculate it every round.
            // We're interested at the *end* of the newline string, so
            // we need to add strlen($nl) == 1 to $nl_pos before subtracting it
            // from our "rcursor" position.
            $nl_pos = strrpos($html, $nl, $rcursor - $length);
            $current_col = $rcursor - (is_bool($nl_pos) ? 0 : $nl_pos + 1);

            // recalculate lines
            if ($synchronize_interval && // synchronization is on
                $cursor > 0 && // cursor is further than zero
                $loops % $synchronize_interval === 0) { // time to synchronize!
                $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);
            }
        }

        $position_next_lt = strpos($html, '<', $cursor);
        $position_next_gt = strpos($html, '>', $cursor);

        // triggers on "<b>asdf</b>" but not "asdf <b></b>"
        // special case to set up context
        if ($position_next_lt === $cursor) {
            $inside_tag = true;
            $cursor++;
        }

        if (!$inside_tag && $position_next_lt !== false) {
            // We are not inside tag and there still is another tag to parse
            $token = new
            HTMLPurifier_Token_Text(
                $this->parseText(
                    substr(
                        $html,
                        $cursor,
                        $position_next_lt - $cursor
                    ), $config
                )
            );
            if ($maintain_line_numbers) {
                $token->rawPosition($current_line, $current_col);
                $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);
            }
            $array[] = $token;
            $cursor = $position_next_lt + 1;
            $inside_tag = true;
            continue;
        } elseif (!$inside_tag) {
            // We are not inside tag but there are no more tags
            // If we're already at the end, break
            if ($cursor === strlen($html)) {
                break;
            }
            // Create Text of rest of string
            $token = new
            HTMLPurifier_Token_Text(
                $this->parseText(
                    substr(
                        $html,
                        $cursor
                    ), $config
                )
            );
            if ($maintain_line_numbers) {
                $token->rawPosition($current_line, $current_col);
            }
            $array[] = $token;
            break;
        } elseif ($inside_tag && $position_next_gt !== false) {
            // We are in tag and it is well formed
            // Grab the internals of the tag
            $strlen_segment = $position_next_gt - $cursor;

            if ($strlen_segment < 1) {
                // "<>": there's nothing to process! Emit the '<' as
                // literal text and drop out of tag mode WITHOUT moving
                // the cursor, so the '>' it points at is picked up as
                // ordinary text on the next round. (Previously the token
                // was built but never stored and the lexer stayed in tag
                // mode, so whatever followed "<>" was read as a tag name.)
                $token = new HTMLPurifier_Token_Text('<');
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                }
                $array[] = $token;
                $inside_tag = false;
                continue;
            }

            $segment = substr($html, $cursor, $strlen_segment);

            if ($segment === false) {
                // somehow, we attempted to access beyond the end of
                // the string, defense-in-depth, reported by Nate Abele
                break;
            }

            // Check if it's a comment
            if (substr($segment, 0, 3) === '!--') {
                // re-determine segment length, looking for -->
                $position_comment_end = strpos($html, '-->', $cursor);
                if ($position_comment_end === false) {
                    // uh oh, we have a comment that extends to
                    // infinity. Can't be helped: set comment
                    // end position to end of string
                    if ($e) {
                        $e->send(E_WARNING, 'Lexer: Unclosed comment');
                    }
                    $position_comment_end = strlen($html);
                    $end = true;
                } else {
                    $end = false;
                }
                $strlen_segment = $position_comment_end - $cursor;
                $segment = substr($html, $cursor, $strlen_segment);
                $token = new
                HTMLPurifier_Token_Comment(
                    substr(
                        $segment,
                        3,
                        $strlen_segment - 3
                    )
                );
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                    $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
                }
                $array[] = $token;
                // skip the closing "-->" unless the comment ran to EOF
                $cursor = $end ? $position_comment_end : $position_comment_end + 3;
                $inside_tag = false;
                continue;
            }

            // Check if it's an end tag
            $is_end_tag = (strpos($segment, '/') === 0);
            if ($is_end_tag) {
                $type = substr($segment, 1);
                $token = new HTMLPurifier_Token_End($type);
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                    $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                }
                $array[] = $token;
                $inside_tag = false;
                $cursor = $position_next_gt + 1;
                continue;
            }

            // Check leading character is alnum, if not, we may
            // have accidently grabbed an emoticon. Translate into
            // text and go our merry way
            if (!ctype_alpha($segment[0])) {
                // XML:  $segment[0] !== '_' && $segment[0] !== ':'
                if ($e) {
                    $e->send(E_NOTICE, 'Lexer: Unescaped lt');
                }
                $token = new HTMLPurifier_Token_Text('<');
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                    $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                }
                $array[] = $token;
                // cursor deliberately not advanced: it already sits just
                // past the '<', so the rest is re-read as text
                $inside_tag = false;
                continue;
            }

            // Check if it is explicitly self closing, if so, remove
            // trailing slash. Remember, we could have a tag like <br>, so
            // any later token processing scripts must convert improperly
            // classified EmptyTags from StartTags.
            $is_self_closing = (strrpos($segment, '/') === $strlen_segment - 1);
            if ($is_self_closing) {
                $strlen_segment--;
                $segment = substr($segment, 0, $strlen_segment);
            }

            // Check if there are any attributes
            $position_first_space = strcspn($segment, $this->_whitespace);

            if ($position_first_space >= $strlen_segment) {
                // no whitespace in the segment: it is just a tag name
                if ($is_self_closing) {
                    $token = new HTMLPurifier_Token_Empty($segment);
                } else {
                    $token = new HTMLPurifier_Token_Start($segment);
                }
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                    $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                }
                $array[] = $token;
                $inside_tag = false;
                $cursor = $position_next_gt + 1;
                continue;
            }

            // Grab out all the data
            $type = substr($segment, 0, $position_first_space);
            $attribute_string =
                trim(
                    substr(
                        $segment,
                        $position_first_space
                    )
                );
            if ($attribute_string) {
                $attr = $this->parseAttributeString(
                    $attribute_string,
                    $config,
                    $context
                );
            } else {
                $attr = array();
            }

            if ($is_self_closing) {
                $token = new HTMLPurifier_Token_Empty($type, $attr);
            } else {
                $token = new HTMLPurifier_Token_Start($type, $attr);
            }
            if ($maintain_line_numbers) {
                $token->rawPosition($current_line, $current_col);
                $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
            }
            $array[] = $token;
            $cursor = $position_next_gt + 1;
            $inside_tag = false;
            continue;
        } else {
            // inside tag, but there's no ending > sign
            if ($e) {
                $e->send(E_WARNING, 'Lexer: Missing gt');
            }
            $token = new
            HTMLPurifier_Token_Text(
                '<' .
                $this->parseText(
                    substr($html, $cursor), $config
                )
            );
            if ($maintain_line_numbers) {
                $token->rawPosition($current_line, $current_col);
            }
            // no cursor scroll? Hmm...
            $array[] = $token;
            break;
        }
        break;
    }

    $context->destroy('CurrentLine');
    $context->destroy('CurrentCol');
    return $array;
}
/**
 * Takes the inside of an HTML tag and makes an assoc array of attributes.
 *
 * Two strategies are used:
 *  - a fast path for strings that contain no space character and at most
 *    one '=' (a single boolean attribute or a single key=value pair);
 *  - a cursor-driven scan for everything else, which understands
 *    single-quoted, double-quoted and unquoted values as well as
 *    boolean attributes.
 *
 * Values are passed through $this->parseAttr() before being stored.
 * Malformed input is reported to the error collector (when
 * Core.CollectErrors is enabled) and parsing carries on.
 *
 * @param string $string Inside of tag excluding name.
 * @param HTMLPurifier_Config $config
 * @param HTMLPurifier_Context $context
 * @return array Assoc array of attributes.
 * @throws Exception If the scan cursor fails to advance (safety net
 *                   against an infinite loop).
 */
public function parseAttributeString($string, $config, $context)
{
    $string = (string)$string; // quick typecast

    if ($string == '') {
        return array();
    } // no attributes

    $e = false;
    if ($config->get('Core.CollectErrors')) {
        $e =& $context->get('ErrorCollector');
    }

    // let's see if we can abort as quickly as possible
    // one equal sign, no spaces => one attribute
    $num_equal = substr_count($string, '=');
    // strpos() returns 0 for a leading space, so compare against false
    // explicitly instead of relying on truthiness.
    $has_space = (strpos($string, ' ') !== false);
    if ($num_equal === 0 && !$has_space) {
        // bool attribute
        return array($string => $string);
    } elseif ($num_equal === 1 && !$has_space) {
        // only one attribute
        list($key, $quoted_value) = explode('=', $string);
        $quoted_value = trim($quoted_value);
        // NB: a key of "0" is also treated as missing here; that
        // long-standing behaviour is intentionally left alone.
        if (!$key) {
            if ($e) {
                $e->send(E_ERROR, 'Lexer: Missing attribute key');
            }
            return array();
        }
        // Compare against the empty string: a truthiness test would
        // turn a legitimate value of "0" (e.g. border=0) into ''.
        if ($quoted_value === '') {
            return array($key => '');
        }
        // $quoted_value is non-empty here, so both offsets exist.
        $first_char = $quoted_value[0];
        $last_char = $quoted_value[strlen($quoted_value) - 1];

        $same_quote = ($first_char == $last_char);
        $open_quote = ($first_char == '"' || $first_char == "'");

        if ($same_quote && $open_quote) {
            // well behaved
            $value = substr($quoted_value, 1, strlen($quoted_value) - 2);
        } elseif ($open_quote) {
            // opening quote without a matching closing quote
            if ($e) {
                $e->send(E_ERROR, 'Lexer: Missing end quote');
            }
            $value = substr($quoted_value, 1);
        } else {
            // unquoted value
            $value = $quoted_value;
        }
        // substr() may return false instead of '' on older PHP versions
        if ($value === false) {
            $value = '';
        }
        return array($key => $this->parseAttr($value, $config));
    }

    // setup loop environment
    $array = array(); // return assoc array of attributes
    $cursor = 0; // current position in string (moves forward)
    $size = strlen($string); // size of the string (stays the same)

    // if we have unquoted attributes, the parser expects a terminating
    // space, so let's guarantee that there's always a terminating space.
    $string .= ' ';

    $old_cursor = -1;
    while ($cursor < $size) {
        if ($old_cursor >= $cursor) {
            throw new Exception("Infinite loop detected");
        }
        $old_cursor = $cursor;

        // skip leading whitespace
        $cursor += strspn($string, $this->_whitespace, $cursor);

        // Only whitespace was left (input with trailing whitespace):
        // nothing more to parse. Without this guard an empty key would
        // be reported and the recovery code below would seek past the
        // end of the string.
        if ($cursor >= $size) {
            break;
        }

        // grab the key
        $key_begin = $cursor; // we're currently at the start of the key

        // scroll past all characters that are the key (not whitespace or =)
        $cursor += strcspn($string, $this->_whitespace . '=', $cursor);

        $key_end = $cursor; // now at the end of the key

        $key = substr($string, $key_begin, $key_end - $key_begin);

        if (!$key) {
            if ($e) {
                $e->send(E_ERROR, 'Lexer: Missing attribute key');
            }
            // skip to the next whitespace; prevents an infinite loop
            $cursor += 1 + strcspn($string, $this->_whitespace, $cursor + 1);
            continue; // empty key
        }

        // scroll past all whitespace
        $cursor += strspn($string, $this->_whitespace, $cursor);

        if ($cursor >= $size) {
            // key was the last thing in the string: boolean attribute
            $array[$key] = $key;
            break;
        }

        // if the next character is an equal sign, we've got a regular
        // pair, otherwise, it's a bool attribute
        $first_char = $string[$cursor]; // in range: $cursor < $size

        if ($first_char == '=') {
            // key="value"

            $cursor++;
            $cursor += strspn($string, $this->_whitespace, $cursor);

            // we might be in front of a quote right now; the cursor may
            // also have run off the end, in which case there is no char
            $char = isset($string[$cursor]) ? $string[$cursor] : '';

            if ($char == '"' || $char == "'") {
                // it's quoted, end bound is $char
                $cursor++;
                $value_begin = $cursor;
                $cursor = strpos($string, $char, $cursor);
                $value_end = $cursor;
            } else {
                // it's not quoted, end bound is whitespace
                $value_begin = $cursor;
                $cursor += strcspn($string, $this->_whitespace, $cursor);
                $value_end = $cursor;
            }

            // we reached a premature end (closing quote never found)
            if ($cursor === false) {
                $cursor = $size;
                $value_end = $cursor;
            }

            $value = substr($string, $value_begin, $value_end - $value_begin);
            if ($value === false) {
                $value = '';
            }
            $array[$key] = $this->parseAttr($value, $config);
            $cursor++; // step over the closing quote / delimiter
        } else {
            // boolattr; $key is known to be non-empty at this point
            $array[$key] = $key;
        }
    }
    return $array;
}
/**
 * Tree node holding an HTML comment.
 */
class HTMLPurifier_Node_Comment extends HTMLPurifier_Node
{
    /**
     * Text of the comment.
     * @type string
     */
    public $data;

    /**
     * Whitespace flag; initialised to true for comment nodes.
     * @type bool
     */
    public $is_whitespace = true;

    /**
     * Stores the comment text and its source position verbatim.
     *
     * @param string $data String comment data.
     * @param int $line
     * @param int $col
     */
    public function __construct($data, $line = null, $col = null)
    {
        $this->line = $line;
        $this->col = $col;
        $this->data = $data;
    }

    /**
     * Converts the node to a (token, null) pair; comments have no
     * closing token.
     *
     * @return array
     */
    public function toTokenPair()
    {
        $comment = new HTMLPurifier_Token_Comment($this->data, $this->line, $this->col);
        return array($comment, null);
    }
}

/**
 * Tree node holding an element together with its attributes and children.
 */
class HTMLPurifier_Node_Element extends HTMLPurifier_Node
{
    /**
     * The lower-case name of the tag, like 'a', 'b' or 'blockquote'.
     *
     * @note Strictly speaking, XML tags are case sensitive, so we shouldn't
     * be lower-casing them, but these tokens cater to HTML tags, which are
     * insensitive.
     * @type string
     */
    public $name;

    /**
     * Attributes of the node, keyed by attribute name.
     * @type array
     */
    public $attr = array();

    /**
     * Child nodes, in document order.
     * @type array
     */
    public $children = array();

    /**
     * False when the element maps to a start/end token pair, true when
     * it maps to a single empty token.
     * @type bool
     */
    public $empty = false;

    /**
     * Column passed to the end token built by toTokenPair().
     */
    public $endCol = null;

    /**
     * Line passed to the end token built by toTokenPair().
     */
    public $endLine = null;

    /**
     * Armor passed to the end token built by toTokenPair().
     */
    public $endArmor = array();

    /**
     * Stores name, attributes, source position and armor verbatim.
     *
     * @param string $name
     * @param array $attr
     * @param int $line
     * @param int $col
     * @param array $armor
     */
    public function __construct($name, $attr = array(), $line = null, $col = null, $armor = array())
    {
        $this->name = $name;
        $this->attr = $attr;
        $this->line = $line;
        $this->col = $col;
        $this->armor = $armor;
    }

    /**
     * Converts the node to a token pair: (empty token, null) for empty
     * elements, (start token, end token) otherwise.
     *
     * @return array
     */
    public function toTokenPair()
    {
        // XXX inefficiency here, normalization is not necessary
        if (!$this->empty) {
            $opening = new HTMLPurifier_Token_Start($this->name, $this->attr, $this->line, $this->col, $this->armor);
            $closing = new HTMLPurifier_Token_End($this->name, array(), $this->endLine, $this->endCol, $this->endArmor);
            return array($opening, $closing);
        }
        $single = new HTMLPurifier_Token_Empty($this->name, $this->attr, $this->line, $this->col, $this->armor);
        return array($single, null);
    }
}

/**
 * Tree node holding text.
 *
 * Text nodes comprise regular parsed character data (PCDATA) and raw
 * character data (from CDATA sections). Internally, their data is parsed
 * with all entities expanded. The node carries the pseudo tag name
 * #PCDATA, which is how the DTD represents it in permissible child nodes.
 */
class HTMLPurifier_Node_Text extends HTMLPurifier_Node
{

    /**
     * PCDATA tag name compatible with DTD, see
     * HTMLPurifier_ChildDef_Custom for details.
     * @type string
     */
    public $name = '#PCDATA';

    /**
     * Parsed character data of text.
     * @type string
     */
    public $data;

    /**
     * Bool indicating if node is whitespace.
     * @type bool
     */
    public $is_whitespace;

    /**
     * Stores the text, the caller-supplied whitespace flag and the
     * source position verbatim.
     *
     * @param string $data String parsed character data.
     * @param bool $is_whitespace Whether the text is whitespace.
     * @param int $line
     * @param int $col
     */
    public function __construct($data, $is_whitespace, $line = null, $col = null)
    {
        $this->line = $line;
        $this->col = $col;
        $this->data = $data;
        $this->is_whitespace = $is_whitespace;
    }

    /**
     * Converts the node to a (token, null) pair; text has no closing
     * token.
     *
     * @return array
     */
    public function toTokenPair()
    {
        $text = new HTMLPurifier_Token_Text($this->data, $this->line, $this->col);
        return array($text, null);
    }
}

/**
 * Strategy that pipes tokens through a list of other strategies.
 */
abstract class HTMLPurifier_Strategy_Composite extends HTMLPurifier_Strategy
{

    /**
     * Strategies to apply, in execution order.
     * @type HTMLPurifier_Strategy[]
     */
    protected $strategies = array();

    /**
     * Runs every registered strategy in turn, feeding each one the
     * output of the previous one.
     *
     * @param HTMLPurifier_Token[] $tokens
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return HTMLPurifier_Token[]
     */
    public function execute($tokens, $config, $context)
    {
        $stream = $tokens;
        foreach ($this->strategies as $step) {
            $stream = $step->execute($stream, $config, $context);
        }
        return $stream;
    }
}
/**
 * Core strategy composed of the big four strategies.
 */
class HTMLPurifier_Strategy_Core extends HTMLPurifier_Strategy_Composite
{
    /**
     * Appends the four core strategies, in the order they must run.
     */
    public function __construct()
    {
        $pipeline = array(
            'HTMLPurifier_Strategy_RemoveForeignElements',
            'HTMLPurifier_Strategy_MakeWellFormed',
            'HTMLPurifier_Strategy_FixNesting',
            'HTMLPurifier_Strategy_ValidateAttributes',
        );
        foreach ($pipeline as $strategyClass) {
            $this->strategies[] = new $strategyClass();
        }
    }
}
/**
 * Takes a well formed list of tokens and fixes their nesting.
 *
 * The token list is converted to a node tree, every node is validated
 * bottom-up against its element definition, and the tree is flattened
 * back into tokens:
 *
 *  - a node whose name is in the accumulated exclusion set is marked dead
 *    (unless Core.DisableExcludes is set);
 *  - otherwise its live children are handed to the child definition's
 *    validateChildren(), which may accept them (true), reject the whole
 *    node (false), or return a replacement child list.
 *
 * @note Whether or not unrecognized children are silently dropped or
 *       translated into text depends on the child definitions.
 */
class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
{

    /**
     * Validates and repairs element nesting.
     *
     * @param HTMLPurifier_Token[] $tokens
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return array|HTMLPurifier_Token[] Result of flattening the
     *         processed tree.
     */
    public function execute($tokens, $config, $context)
    {

        //####################################################################//
        // Pre-processing

        // O(n) pass to convert to a tree, so that we can efficiently
        // refer to substrings
        $top_node = HTMLPurifier_Arborize::arborize($tokens, $config, $context);

        // get a copy of the HTML definition
        $definition = $config->getHTMLDefinition();

        $excludes_enabled = !$config->get('Core.DisableExcludes');

        // setup the context variable 'IsInline', for chameleon processing
        // is 'false' when we are not inline, 'true' when it must always
        // be inline, and an integer when it is inline for a certain
        // branch of the document tree
        // NOTE(review): presumably register() binds the variable by
        // reference, so later assignments to $is_inline in the loop are
        // visible through the context -- confirm in HTMLPurifier_Context.
        $is_inline = $definition->info_parent_def->descendants_are_inline;
        $context->register('IsInline', $is_inline);

        // setup error collector
        $e =& $context->get('ErrorCollector', true);

        //####################################################################//
        // Loop initialization

        // stack that contains all elements that are excluded
        // it is organized by parent elements, similar to $stack,
        // but it is only populated when an element with exclusions is
        // processed, i.e. there won't be empty exclusions.
        // (not read again in this method; exclusions actually travel in
        // the stack frames below)
        $exclude_stack = array($definition->info_parent_def->excludes);

        // variable that contains the start token while we are processing
        // nodes. This enables error reporting to do its job
        $node = $top_node;
        // dummy token
        list($token, $d) = $node->toTokenPair();
        $context->register('CurrentNode', $node);
        $context->register('CurrentToken', $token);

        //####################################################################//
        // Loop

        // We need to implement a post-order traversal iteratively, to
        // avoid running into stack space limits.  This is pretty tricky
        // to reason about, so we just manually stack-ify the recursive
        // variant:
        //
        //  function f($node) {
        //      foreach ($node->children as $child) {
        //          f($child);
        //      }
        //      validate($node);
        //  }
        //
        // Each stack frame is array($node, $is_inline, $excludes, $ix),
        // where $ix is the index of the next child of $node to visit.

        $parent_def = $definition->info_parent_def;
        $stack = array(
            array($top_node,
                  $parent_def->descendants_are_inline,
                  $parent_def->excludes, // exclusions
                  0)
            );

        while (!empty($stack)) {
            list($node, $is_inline, $excludes, $ix) = array_pop($stack);
            // recursive call
            $go = false;
            // an empty stack after the pop means $node is the root, which
            // is validated against the parent definition
            $def = empty($stack) ? $definition->info_parent_def : $definition->info[$node->name];
            // find the next element child; non-element children are
            // stepped over without descending
            while (isset($node->children[$ix])) {
                $child = $node->children[$ix++];
                if ($child instanceof HTMLPurifier_Node_Element) {
                    $go = true;
                    // re-push the current frame (resuming at $ix), then
                    // the child's frame so the child is handled first
                    $stack[] = array($node, $is_inline, $excludes, $ix);
                    $stack[] = array($child,
                        // ToDo: I don't think it matters if it's def or
                        // child_def, but double check this...
                        $is_inline || $def->descendants_are_inline,
                        empty($def->excludes) ? $excludes
                                              : array_merge($excludes, $def->excludes),
                        0);
                    break;
                }
            };
            if ($go) continue;
            // all children have been processed: refresh the token used
            // for error reporting and validate this node
            list($token, $d) = $node->toTokenPair();
            // base case
            if ($excludes_enabled && isset($excludes[$node->name])) {
                $node->dead = true;
                if ($e) $e->send(E_ERROR, 'Strategy_FixNesting: Node excluded');
            } else {
                // XXX I suppose it would be slightly more efficient to
                // avoid the allocation here and have children
                // strategies handle it
                // collect only the children that were not marked dead
                $children = array();
                foreach ($node->children as $child) {
                    if (!$child->dead) $children[] = $child;
                }
                $result = $def->child->validateChildren($children, $config, $context);
                if ($result === true) {
                    // children are valid as-is; keep the live ones
                    $node->children = $children;
                } elseif ($result === false) {
                    // children cannot be fixed; drop the node itself
                    $node->dead = true;
                    if ($e) $e->send(E_ERROR, 'Strategy_FixNesting: Node removed');
                } else {
                    // validator supplied a replacement child list
                    $node->children = $result;
                    if ($e) {
                        // XXX This will miss mutations of internal nodes. Perhaps defer to the child validators
                        if (empty($result) && !empty($children)) {
                            $e->send(E_ERROR, 'Strategy_FixNesting: Node contents removed');
                        } else if ($result != $children) {
                            $e->send(E_WARNING, 'Strategy_FixNesting: Node reorganized');
                        }
                    }
                }
            }
        }

        //####################################################################//
        // Post-processing

        // remove context variables
        $context->destroy('IsInline');
        $context->destroy('CurrentNode');
        $context->destroy('CurrentToken');

        //####################################################################//
        // Return

        // the last frame popped is the root frame, so $node is the root
        return HTMLPurifier_Arborize::flatten($node, $config, $context);
    }
}
20124 * 20125 * Specification of the armor attributes this strategy uses: 20126 * 20127 * - MakeWellFormed_TagClosedError: This armor field is used to 20128 * suppress tag closed errors for certain tokens [TagClosedSuppress], 20129 * in particular, if a tag was generated automatically by HTML 20130 * Purifier, we may rely on our infrastructure to close it for us 20131 * and shouldn't report an error to the user [TagClosedAuto]. 20132 */ 20133class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy 20134{ 20135 20136 /** 20137 * Array stream of tokens being processed. 20138 * @type HTMLPurifier_Token[] 20139 */ 20140 protected $tokens; 20141 20142 /** 20143 * Current token. 20144 * @type HTMLPurifier_Token 20145 */ 20146 protected $token; 20147 20148 /** 20149 * Zipper managing the true state. 20150 * @type HTMLPurifier_Zipper 20151 */ 20152 protected $zipper; 20153 20154 /** 20155 * Current nesting of elements. 20156 * @type array 20157 */ 20158 protected $stack; 20159 20160 /** 20161 * Injectors active in this stream processing. 20162 * @type HTMLPurifier_Injector[] 20163 */ 20164 protected $injectors; 20165 20166 /** 20167 * Current instance of HTMLPurifier_Config. 20168 * @type HTMLPurifier_Config 20169 */ 20170 protected $config; 20171 20172 /** 20173 * Current instance of HTMLPurifier_Context. 
/**
 * Makes a token stream well-formed: balances start/end tags, converts
 * between start and empty tokens according to the element definitions,
 * auto-closes or wraps elements that are not allowed in their parent,
 * and runs the configured injectors over every token.
 *
 * Whenever the stream has to be modified, the method edits it through
 * the zipper, sets $reprocess and restarts the loop on the (new) current
 * token instead of updating its bookkeeping by hand.
 *
 * @param HTMLPurifier_Token[] $tokens
 * @param HTMLPurifier_Config $config
 * @param HTMLPurifier_Context $context
 * @return HTMLPurifier_Token[]
 * @throws HTMLPurifier_Exception
 */
public function execute($tokens, $config, $context)
{
    $definition = $config->getHTMLDefinition();

    // local variables
    $generator = new HTMLPurifier_Generator($config, $context);
    $escape_invalid_tags = $config->get('Core.EscapeInvalidTags');
    // used for autoclose early abortion
    $global_parent_allowed_elements = $definition->info_parent_def->child->getAllowedElements($config);
    $e = $context->get('ErrorCollector', true);
    $i = false; // injector index
    list($zipper, $token) = HTMLPurifier_Zipper::fromArray($tokens);
    if ($token === NULL) {
        // empty input, nothing to do
        return array();
    }
    $reprocess = false; // whether or not to reprocess the same token
    $stack = array();

    // member variables (references, so helper methods see live state)
    $this->stack =& $stack;
    $this->tokens =& $tokens;
    $this->token =& $token;
    $this->zipper =& $zipper;
    $this->config = $config;
    $this->context = $context;

    // context variables
    $context->register('CurrentNesting', $stack);
    $context->register('InputZipper', $zipper);
    $context->register('CurrentToken', $token);

    // -- begin INJECTOR --

    $this->injectors = array();

    $injectors = $config->getBatch('AutoFormat');
    $def_injectors = $definition->info_injector;
    $custom_injectors = $injectors['Custom'];
    unset($injectors['Custom']); // special case
    foreach ($injectors as $injector => $b) {
        // XXX: Fix with a legitimate lookup table of enabled filters
        if (strpos($injector, '.') !== false) {
            // namespaced sub-directive, not an injector switch
            continue;
        }
        $injector = "HTMLPurifier_Injector_$injector";
        if (!$b) {
            continue;
        }
        $this->injectors[] = new $injector;
    }
    foreach ($def_injectors as $injector) {
        // assumed to be objects
        $this->injectors[] = $injector;
    }
    foreach ($custom_injectors as $injector) {
        if (!$injector) {
            continue;
        }
        if (is_string($injector)) {
            $injector = "HTMLPurifier_Injector_$injector";
            $injector = new $injector;
        }
        $this->injectors[] = $injector;
    }

    // give the injectors references to the definition and context
    // variables for performance reasons; injectors whose prepare()
    // reports a problem are dropped.
    //
    // The survivors are collected into a fresh list rather than spliced
    // out of $this->injectors in place: splicing re-indexes the array,
    // so after the first removal the loop's remaining indexes would
    // point one element too far and the wrong injector would be removed.
    $enabled_injectors = array();
    foreach ($this->injectors as $injector) {
        $error = $injector->prepare($config, $context);
        if (!$error) {
            $enabled_injectors[] = $injector;
            continue;
        }
        trigger_error("Cannot enable {$injector->name} injector because $error is not allowed", E_USER_WARNING);
    }
    $this->injectors = $enabled_injectors;

    // -- end INJECTOR --

    // a note on reprocessing:
    //      In order to reduce code duplication, whenever some code needs
    //      to make HTML changes in order to make things "correct", the
    //      new HTML gets sent through the purifier, regardless of its
    //      status. This means that if we add a start token, because it
    //      was totally necessary, we don't have to update nesting; we just
    //      punt ($reprocess = true; continue;) and it does that for us.

    // the loop has no condition; it ends via the break taken at
    // document end once the nesting stack is empty
    for (;;
         // only increment if we don't need to reprocess
         $reprocess ? $reprocess = false : $token = $zipper->next($token)) {

        // check for a rewind
        if (is_int($i)) {
            // possibility: disable rewinding if the current token has a
            // rewind set on it already. This would offer protection from
            // infinite loop, but might hinder some advanced rewinding.
            $rewind_offset = $this->injectors[$i]->getRewindOffset();
            if (is_int($rewind_offset)) {
                for ($j = 0; $j < $rewind_offset; $j++) {
                    if (empty($zipper->front)) break;
                    $token = $zipper->prev($token);
                    // indicate that other injectors should not process this token,
                    // but we need to reprocess it.  See Note [Injector skips]
                    unset($token->skip[$i]);
                    $token->rewind = $i;
                    // undo the effect the token had on the nesting stack
                    if ($token instanceof HTMLPurifier_Token_Start) {
                        array_pop($this->stack);
                    } elseif ($token instanceof HTMLPurifier_Token_End) {
                        $this->stack[] = $token->start;
                    }
                }
            }
            $i = false;
        }

        // handle case of document end
        if ($token === NULL) {
            // kill processing if stack is empty
            if (empty($this->stack)) {
                break;
            }

            // peek
            $top_nesting = array_pop($this->stack);
            $this->stack[] = $top_nesting;

            // send error [TagClosedSuppress]
            if ($e && !isset($top_nesting->armor['MakeWellFormed_TagClosedError'])) {
                $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by document end', $top_nesting);
            }

            // append, don't splice, since this is the end
            $token = new HTMLPurifier_Token_End($top_nesting->name);

            // punt!
            $reprocess = true;
            continue;
        }

        // quick-check: if it's not a tag, no need to process
        if (empty($token->is_tag)) {
            if ($token instanceof HTMLPurifier_Token_Text) {
                foreach ($this->injectors as $i => $injector) {
                    if (isset($token->skip[$i])) {
                        // See Note [Injector skips]
                        continue;
                    }
                    if ($token->rewind !== null && $token->rewind !== $i) {
                        continue;
                    }
                    // hand the injector a separate variable: it may
                    // replace the value, and processToken() needs the
                    // result while $token still names the current token
                    $r = $token;
                    $injector->handleText($r);
                    $token = $this->processToken($r, $i);
                    $reprocess = true;
                    break;
                }
            }
            // another possibility is a comment
            continue;
        }

        if (isset($definition->info[$token->name])) {
            $type = $definition->info[$token->name]->child->type;
        } else {
            $type = false; // Type is unknown, treat accordingly
        }

        // quick tag checks: anything that's *not* an end tag
        $ok = false;
        if ($type === 'empty' && $token instanceof HTMLPurifier_Token_Start) {
            // claims to be a start tag but is empty
            $token = new HTMLPurifier_Token_Empty(
                $token->name,
                $token->attr,
                $token->line,
                $token->col,
                $token->armor
            );
            $ok = true;
        } elseif ($type && $type !== 'empty' && $token instanceof HTMLPurifier_Token_Empty) {
            // claims to be empty but really is a start tag
            // NB: this assignment is required
            $old_token = $token;
            $token = new HTMLPurifier_Token_End($token->name);
            $token = $this->insertBefore(
                new HTMLPurifier_Token_Start($old_token->name, $old_token->attr, $old_token->line, $old_token->col, $old_token->armor)
            );
            // punt (since we had to modify the input stream in a non-trivial way)
            $reprocess = true;
            continue;
        } elseif ($token instanceof HTMLPurifier_Token_Empty) {
            // real empty token
            $ok = true;
        } elseif ($token instanceof HTMLPurifier_Token_Start) {
            // start tag

            // ...unless they also have to close their parent
            if (!empty($this->stack)) {

                // Performance note: you might think that it's rather
                // inefficient, recalculating the autoclose information
                // for every tag that a token closes (since when we
                // do an autoclose, we push a new token into the
                // stream and then /process/ that, before
                // re-processing this token.)  But this is
                // necessary, because an injector can make an
                // arbitrary transformations to the autoclosing
                // tokens we introduce, so things may have changed
                // in the meantime.  Also, doing the inefficient thing is
                // "easy" to reason about (for certain perverse definitions
                // of "easy")

                // peek at the innermost open element
                $parent = array_pop($this->stack);
                $this->stack[] = $parent;

                $parent_def = null;
                $parent_elements = null;
                $autoclose = false;
                if (isset($definition->info[$parent->name])) {
                    $parent_def = $definition->info[$parent->name];
                    $parent_elements = $parent_def->child->getAllowedElements($config);
                    $autoclose = !isset($parent_elements[$token->name]);
                }

                if ($autoclose && $definition->info[$token->name]->wrap) {
                    // Check if an element can be wrapped by another
                    // element to make it valid in a context (for
                    // example, <ul><ul> needs a <li> in between)
                    $wrapname = $definition->info[$token->name]->wrap;
                    $wrapdef = $definition->info[$wrapname];
                    $elements = $wrapdef->child->getAllowedElements($config);
                    if (isset($elements[$token->name]) && isset($parent_elements[$wrapname])) {
                        $newtoken = new HTMLPurifier_Token_Start($wrapname);
                        $token = $this->insertBefore($newtoken);
                        $reprocess = true;
                        continue;
                    }
                }

                // formatting parents are re-opened after the new element
                $carryover = false;
                if ($autoclose && $parent_def->formatting) {
                    $carryover = true;
                }

                if ($autoclose) {
                    // check if this autoclose is doomed to fail
                    // (this rechecks $parent, which is harmless)
                    $autoclose_ok = isset($global_parent_allowed_elements[$token->name]);
                    if (!$autoclose_ok) {
                        foreach ($this->stack as $ancestor) {
                            $elements = $definition->info[$ancestor->name]->child->getAllowedElements($config);
                            if (isset($elements[$token->name])) {
                                $autoclose_ok = true;
                                break;
                            }
                            if ($definition->info[$token->name]->wrap) {
                                $wrapname = $definition->info[$token->name]->wrap;
                                $wrapdef = $definition->info[$wrapname];
                                $wrap_elements = $wrapdef->child->getAllowedElements($config);
                                if (isset($wrap_elements[$token->name]) && isset($elements[$wrapname])) {
                                    $autoclose_ok = true;
                                    break;
                                }
                            }
                        }
                    }
                    if ($autoclose_ok) {
                        // errors need to be updated
                        $new_token = new HTMLPurifier_Token_End($parent->name);
                        $new_token->start = $parent;
                        // [TagClosedSuppress]
                        if ($e && !isset($parent->armor['MakeWellFormed_TagClosedError'])) {
                            if (!$carryover) {
                                $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag auto closed', $parent);
                            } else {
                                $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag carryover', $parent);
                            }
                        }
                        if ($carryover) {
                            $element = clone $parent;
                            // [TagClosedAuto]
                            $element->armor['MakeWellFormed_TagClosedError'] = true;
                            $element->carryover = true;
                            $token = $this->processToken(array($new_token, $token, $element));
                        } else {
                            $token = $this->insertBefore($new_token);
                        }
                    } else {
                        // no ancestor can hold this element: drop it
                        $token = $this->remove();
                    }
                    $reprocess = true;
                    continue;
                }

            }
            $ok = true;
        }

        if ($ok) {
            foreach ($this->injectors as $i => $injector) {
                if (isset($token->skip[$i])) {
                    // See Note [Injector skips]
                    continue;
                }
                if ($token->rewind !== null && $token->rewind !== $i) {
                    continue;
                }
                $r = $token;
                $injector->handleElement($r);
                $token = $this->processToken($r, $i);
                $reprocess = true;
                break;
            }
            if (!$reprocess) {
                // ah, nothing interesting happened; do normal processing
                if ($token instanceof HTMLPurifier_Token_Start) {
                    $this->stack[] = $token;
                } elseif ($token instanceof HTMLPurifier_Token_End) {
                    throw new HTMLPurifier_Exception(
                        'Improper handling of end tag in start code; possible error in MakeWellFormed'
                    );
                }
            }
            continue;
        }

        // sanity check: we should be dealing with a closing tag
        if (!$token instanceof HTMLPurifier_Token_End) {
            throw new HTMLPurifier_Exception('Unaccounted for tag token in input stream, bug in HTML Purifier');
        }

        // make sure that we have something open
        if (empty($this->stack)) {
            if ($escape_invalid_tags) {
                if ($e) {
                    $e->send(E_WARNING, 'Strategy_MakeWellFormed: Unnecessary end tag to text');
                }
                $token = new HTMLPurifier_Token_Text($generator->generateFromToken($token));
            } else {
                if ($e) {
                    $e->send(E_WARNING, 'Strategy_MakeWellFormed: Unnecessary end tag removed');
                }
                $token = $this->remove();
            }
            $reprocess = true;
            continue;
        }

        // first, check for the simplest case: everything closes neatly.
        // Eventually, everything passes through here; if there are problems
        // we modify the input stream accordingly and then punt, so that
        // the tokens get processed again.
        $current_parent = array_pop($this->stack);
        if ($current_parent->name == $token->name) {
            $token->start = $current_parent;
            foreach ($this->injectors as $i => $injector) {
                if (isset($token->skip[$i])) {
                    // See Note [Injector skips]
                    continue;
                }
                if ($token->rewind !== null && $token->rewind !== $i) {
                    continue;
                }
                $r = $token;
                $injector->handleEnd($r);
                $token = $this->processToken($r, $i);
                // the token will be reprocessed, so restore the parent
                $this->stack[] = $current_parent;
                $reprocess = true;
                break;
            }
            continue;
        }

        // okay, so we're trying to close the wrong tag

        // undo the previous pop
        $this->stack[] = $current_parent;

        // scroll back the entire nest, trying to find our tag.
        // (feature could be to specify how far you'd like to go)
        $size = count($this->stack);
        // -2 because -1 is the last element, but we already checked that
        $skipped_tags = false;
        for ($j = $size - 2; $j >= 0; $j--) {
            if ($this->stack[$j]->name == $token->name) {
                $skipped_tags = array_slice($this->stack, $j);
                break;
            }
        }

        // we didn't find the tag, so remove
        if ($skipped_tags === false) {
            if ($escape_invalid_tags) {
                if ($e) {
                    $e->send(E_WARNING, 'Strategy_MakeWellFormed: Stray end tag to text');
                }
                $token = new HTMLPurifier_Token_Text($generator->generateFromToken($token));
            } else {
                if ($e) {
                    $e->send(E_WARNING, 'Strategy_MakeWellFormed: Stray end tag removed');
                }
                $token = $this->remove();
            }
            $reprocess = true;
            continue;
        }

        // do errors, in REVERSE $j order: a,b,c with </a></b></c>
        $c = count($skipped_tags);
        if ($e) {
            for ($j = $c - 1; $j > 0; $j--) {
                // notice we exclude $j == 0, i.e. the current ending tag, from
                // the errors... [TagClosedSuppress]
                if (!isset($skipped_tags[$j]->armor['MakeWellFormed_TagClosedError'])) {
                    $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by element end', $skipped_tags[$j]);
                }
            }
        }

        // insert tags, in FORWARD $j order: c,b,a with </a></b></c>
        $replace = array($token);
        for ($j = 1; $j < $c; $j++) {
            // ...as well as from the insertions
            $new_token = new HTMLPurifier_Token_End($skipped_tags[$j]->name);
            $new_token->start = $skipped_tags[$j];
            array_unshift($replace, $new_token);
            if (isset($definition->info[$new_token->name]) && $definition->info[$new_token->name]->formatting) {
                // [TagClosedAuto]
                // formatting elements are re-opened after the end tag
                $element = clone $skipped_tags[$j];
                $element->carryover = true;
                $element->armor['MakeWellFormed_TagClosedError'] = true;
                $replace[] = $element;
            }
        }
        $token = $this->processToken($replace);
        $reprocess = true;
        continue;
    }

    $context->destroy('CurrentToken');
    $context->destroy('CurrentNesting');
    $context->destroy('InputZipper');

    unset($this->injectors, $this->stack, $this->tokens);
    return $zipper->toArray($token);
}
    /**
     * Processes arbitrary token values for complicated substitution patterns.
     * In general:
     *
     * If $token is an array, it is a list of tokens to substitute for the
     * current token. These tokens then get individually processed. If there
     * is a leading integer in the list, that integer determines how many
     * tokens from the stream should be removed.
     *
     * If $token is a regular token, it is swapped with the current token.
     *
     * If $token is false, the current token is deleted.
     *
     * If $token is an integer, that number of tokens (with the first token
     * being the current one) will be deleted.
     *
     * @param HTMLPurifier_Token|array|int|bool $token Token substitution value
     * @param HTMLPurifier_Injector|int $injector Injector that performed the substitution; the
     *        default of -1 means this is not an injector related operation.
     * @return mixed Second element of the pair returned by the zipper's splice();
     *         callers in this class assign it to the current token.
     * @throws HTMLPurifier_Exception If $token is of an unsupported type, or asks
     *         for zero tokens to be deleted.
     */
    protected function processToken($token, $injector = -1)
    {
        // Zend OpCache miscompiles $token = array($token), so
        // avoid this pattern.  See: https://github.com/ezyang/htmlpurifier/issues/108

        // normalize forms of token
        if (is_object($token)) {
            // single token object: delete one token and insert this one
            $tmp = $token;
            $token = array(1, $tmp);
        }
        if (is_int($token)) {
            // bare integer: delete that many tokens, insert nothing
            $tmp = $token;
            $token = array($tmp);
        }
        if ($token === false) {
            // false: delete exactly one token, insert nothing
            $token = array(1);
        }
        if (!is_array($token)) {
            throw new HTMLPurifier_Exception('Invalid token type from injector');
        }
        if (!is_int($token[0])) {
            // list without a leading delete count: default the count to 1
            array_unshift($token, 1);
        }
        if ($token[0] === 0) {
            throw new HTMLPurifier_Exception('Deleting zero tokens is not valid');
        }

        // $token is now an array with the following form:
        //      array(number nodes to delete, new node 1, new node 2, ...)

        // split the delete count off; what remains in $token is the insert list
        $delete = array_shift($token);
        list($old, $r) = $this->zipper->splice($this->token, $delete, $token);

        if ($injector > -1) {
            // See Note [Injector skips]
            // Determine appropriate skips.  Here's what the code does:
            //  *If* we deleted one or more tokens, copy the skips
            //  of those tokens into the skips of the new tokens (in $token).
            //  Also, mark the newly inserted tokens as having come from
            //  $injector.
            // Only the first deleted token's skip list is inherited.
            $oldskip = isset($old[0]) ? $old[0]->skip : array();
            foreach ($token as $object) {
                $object->skip = $oldskip;
                $object->skip[$injector] = true;
            }
        }

        return $r;

    }
Cursor now points to 20710 * this token. You must reprocess after this. 20711 * @param HTMLPurifier_Token $token 20712 */ 20713 private function insertBefore($token) 20714 { 20715 // NB not $this->zipper->insertBefore(), due to positioning 20716 // differences 20717 $splice = $this->zipper->splice($this->token, 0, array($token)); 20718 20719 return $splice[1]; 20720 } 20721 20722 /** 20723 * Removes current token. Cursor now points to new token occupying previously 20724 * occupied space. You must reprocess after this. 20725 */ 20726 private function remove() 20727 { 20728 return $this->zipper->delete(); 20729 } 20730} 20731 20732// Note [Injector skips] 20733// ~~~~~~~~~~~~~~~~~~~~~ 20734// When I originally designed this class, the idea behind the 'skip' 20735// property of HTMLPurifier_Token was to help avoid infinite loops 20736// in injector processing. For example, suppose you wrote an injector 20737// that bolded swear words. Naively, you might write it so that 20738// whenever you saw ****, you replaced it with <strong>****</strong>. 20739// 20740// When this happens, we will reprocess all of the tokens with the 20741// other injectors. Now there is an opportunity for infinite loop: 20742// if we rerun the swear-word injector on these tokens, we might 20743// see **** and then reprocess again to get 20744// <strong><strong>****</strong></strong> ad infinitum. 20745// 20746// Thus, the idea of a skip is that once we process a token with 20747// an injector, we mark all of those tokens as having "come from" 20748// the injector, and we never run the injector again on these 20749// tokens. 20750// 20751// There were two more complications, however: 20752// 20753// - With HTMLPurifier_Injector_RemoveEmpty, we noticed that if 20754// you had <b><i></i></b>, after you removed the <i></i>, you 20755// really would like this injector to go back and reprocess 20756// the <b> tag, discovering that it is now empty and can be 20757// removed. 
//   So we reintroduced the possibility of infinite looping
//   by adding a "rewind" function, which let you go back to an
//   earlier point in the token stream and reprocess it with injectors.
//   Needless to say, we need to UN-skip the token so it gets
//   reprocessed.
//
// - Suppose that you successfully process a token, replace it with
//   one with your skip mark, but now another injector wants to
//   process the skipped token with another token.  Should you continue
//   to skip that new token, or reprocess it?  If you reprocess,
//   you can end up with an infinite loop where one injector converts
//   <a> to <b>, and then another injector converts it back.  So
//   we inherit the skips, but for some reason, I thought that we
//   should inherit the skip from the first token of the token
//   that we deleted.  Why?  Well, it seems to work OK.
//
// If I were to redesign this functionality, I would absolutely not
// go about doing it this way: the semantics are just not very well
// defined, and in any case you probably wanted to operate on trees,
// not token streams.





/**
 * Removes all unrecognized tags from the list of tokens.
 *
 * This strategy iterates through all the tokens and removes unrecognized
 * tokens. If a token is not recognized but a TagTransform is defined for
 * that element, the element will be transformed accordingly.
/**
 * Removes all unrecognized tags from the list of tokens.
 *
 * This strategy iterates through all the tokens and removes unrecognized
 * tokens. If a token is not recognized but a TagTransform is defined for
 * that element, the element will be transformed accordingly.
 */

class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy
{

    /**
     * Walks the token list once and builds a new list containing only the
     * tokens that survive filtering:
     *
     *  - Tag tokens are first passed through a matching tag transform, if
     *    one is defined. Tags known to the HTML definition are kept, unless
     *    a required attribute is missing after validation. Unknown tags are
     *    either turned into text (Core.EscapeInvalidTags) or dropped; when a
     *    dropped start tag is listed in Core.HiddenElements, everything up
     *    to its matching end tag is dropped as well.
     *  - Comment tokens are turned into text while inside a kept hidden
     *    element, kept (after hyphen cleanup) when HTML.Trusted is set or
     *    the comment matches HTML.AllowedComments / HTML.AllowedCommentsRegexp,
     *    and dropped otherwise.
     *  - Text tokens are kept unchanged; any other token type is dropped.
     *
     * While running, the token being examined is registered in the context
     * as 'CurrentToken'; the registration is destroyed before returning.
     *
     * @param HTMLPurifier_Token[] $tokens
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return array|HTMLPurifier_Token[]
     */
    public function execute($tokens, $config, $context)
    {
        $definition = $config->getHTMLDefinition();
        $generator = new HTMLPurifier_Generator($config, $context);
        $result = array();

        $escape_invalid_tags = $config->get('Core.EscapeInvalidTags');
        $remove_invalid_img = $config->get('Core.RemoveInvalidImg');

        // currently only used to determine if comments should be kept
        $trusted = $config->get('HTML.Trusted');
        $comment_lookup = $config->get('HTML.AllowedComments');
        $comment_regexp = $config->get('HTML.AllowedCommentsRegexp');
        // comments only need inspecting when some allow-rule is configured
        $check_comments = $comment_lookup !== array() || $comment_regexp !== null;

        $remove_script_contents = $config->get('Core.RemoveScriptContents');
        $hidden_elements = $config->get('Core.HiddenElements');

        // remove script contents compatibility
        // (an explicit true/false overrides the 'script' entry of
        // Core.HiddenElements; any other value leaves it alone)
        if ($remove_script_contents === true) {
            $hidden_elements['script'] = true;
        } elseif ($remove_script_contents === false && isset($hidden_elements['script'])) {
            unset($hidden_elements['script']);
        }

        $attr_validator = new HTMLPurifier_AttrValidator();

        // removes tokens until it reaches a closing tag with its value
        $remove_until = false;

        // converts comments into text tokens when this is equal to a tag name
        $textify_comments = false;

        $token = false;
        $context->register('CurrentToken', $token);

        // $e stays false unless error collection is enabled; every
        // $e->send() below is guarded by it
        $e = false;
        if ($config->get('Core.CollectErrors')) {
            $e =& $context->get('ErrorCollector');
        }

        foreach ($tokens as $token) {
            if ($remove_until) {
                // inside a removed hidden element: skip everything except
                // tags carrying the name we are waiting for
                if (empty($token->is_tag) || $token->name !== $remove_until) {
                    continue;
                }
            }
            if (!empty($token->is_tag)) {
                // DEFINITION CALL

                // before any processing, try to transform the element
                if (isset($definition->info_tag_transform[$token->name])) {
                    $original_name = $token->name;
                    // there is a transformation for this tag
                    // DEFINITION CALL
                    $token = $definition->
                        info_tag_transform[$token->name]->transform($token, $config, $context);
                    if ($e) {
                        $e->send(E_NOTICE, 'Strategy_RemoveForeignElements: Tag transform', $original_name);
                    }
                }

                if (isset($definition->info[$token->name])) {
                    // mostly everything's good, but
                    // we need to make sure required attributes are in order
                    if (($token instanceof HTMLPurifier_Token_Start || $token instanceof HTMLPurifier_Token_Empty) &&
                        $definition->info[$token->name]->required_attr &&
                        ($token->name != 'img' || $remove_invalid_img) // ensure config option still works
                    ) {
                        $attr_validator->validateToken($token, $config, $context);
                        $ok = true;
                        foreach ($definition->info[$token->name]->required_attr as $name) {
                            if (!isset($token->attr[$name])) {
                                $ok = false;
                                break;
                            }
                        }
                        if (!$ok) {
                            if ($e) {
                                // $name is the first required attribute found missing
                                $e->send(
                                    E_ERROR,
                                    'Strategy_RemoveForeignElements: Missing required attribute',
                                    $name
                                );
                            }
                            continue;
                        }
                        // attributes were validated just above; the armor flag
                        // is what ValidateAttributes checks to skip a token
                        $token->armor['ValidateAttributes'] = true;
                    }

                    // track whether we are inside a kept hidden element so that
                    // comments found there are converted to text below
                    if (isset($hidden_elements[$token->name]) && $token instanceof HTMLPurifier_Token_Start) {
                        $textify_comments = $token->name;
                    } elseif ($token->name === $textify_comments && $token instanceof HTMLPurifier_Token_End) {
                        $textify_comments = false;
                    }

                } elseif ($escape_invalid_tags) {
                    // invalid tag, generate HTML representation and insert in
                    if ($e) {
                        $e->send(E_WARNING, 'Strategy_RemoveForeignElements: Foreign element to text');
                    }
                    $token = new HTMLPurifier_Token_Text(
                        $generator->generateFromToken($token)
                    );
                } else {
                    // check if we need to destroy all of the tag's children
                    // CAN BE GENERICIZED
                    if (isset($hidden_elements[$token->name])) {
                        if ($token instanceof HTMLPurifier_Token_Start) {
                            $remove_until = $token->name;
                        } elseif ($token instanceof HTMLPurifier_Token_Empty) {
                            // do nothing: we're still looking
                        } else {
                            // neither start nor empty tag: stop removing
                            $remove_until = false;
                        }
                        if ($e) {
                            $e->send(E_ERROR, 'Strategy_RemoveForeignElements: Foreign meta element removed');
                        }
                    } else {
                        if ($e) {
                            $e->send(E_ERROR, 'Strategy_RemoveForeignElements: Foreign element removed');
                        }
                    }
                    continue;
                }
            } elseif ($token instanceof HTMLPurifier_Token_Comment) {
                // textify comments in script tags when they are allowed
                if ($textify_comments !== false) {
                    $data = $token->data;
                    $token = new HTMLPurifier_Token_Text($data);
                } elseif ($trusted || $check_comments) {
                    // always cleanup comments
                    $trailing_hyphen = false;
                    if ($e) {
                        // perform check whether or not there's a trailing hyphen
                        // (only needed for the notice sent further down)
                        if (substr($token->data, -1) == '-') {
                            $trailing_hyphen = true;
                        }
                    }
                    $token->data = rtrim($token->data, '-');
                    $found_double_hyphen = false;
                    // repeat until no '--' is left: one pass can leave a new
                    // '--' behind when three or more hyphens were adjacent
                    while (strpos($token->data, '--') !== false) {
                        $found_double_hyphen = true;
                        $token->data = str_replace('--', '-', $token->data);
                    }
                    if ($trusted || !empty($comment_lookup[trim($token->data)]) ||
                        ($comment_regexp !== null && preg_match($comment_regexp, trim($token->data)))) {
                        // OK good
                        if ($e) {
                            if ($trailing_hyphen) {
                                $e->send(
                                    E_NOTICE,
                                    'Strategy_RemoveForeignElements: Trailing hyphen in comment removed'
                                );
                            }
                            if ($found_double_hyphen) {
                                $e->send(E_NOTICE, 'Strategy_RemoveForeignElements: Hyphens in comment collapsed');
                            }
                        }
                    } else {
                        if ($e) {
                            $e->send(E_NOTICE, 'Strategy_RemoveForeignElements: Comment removed');
                        }
                        continue;
                    }
                } else {
                    // strip comments
                    if ($e) {
                        $e->send(E_NOTICE, 'Strategy_RemoveForeignElements: Comment removed');
                    }
                    continue;
                }
            } elseif ($token instanceof HTMLPurifier_Token_Text) {
                // text tokens are kept as they are
            } else {
                // any other kind of token is dropped
                continue;
            }
            $result[] = $token;
        }
        if ($remove_until && $e) {
            // we removed tokens until the end, throw error
            $e->send(E_ERROR, 'Strategy_RemoveForeignElements: Token removed to end', $remove_until);
        }
        $context->destroy('CurrentToken');
        return $result;
    }
}





/**
 * Validate all attributes in the tokens.
 */

class HTMLPurifier_Strategy_ValidateAttributes extends HTMLPurifier_Strategy
{

    /**
     * Runs the attribute validator over every start and empty tag token
     * that is not armored against it. The token list itself is returned
     * as received; no token is added or removed here.
     *
     * While running, the token being examined is registered in the context
     * as 'CurrentToken'; the registration is destroyed before returning.
     *
     * @param HTMLPurifier_Token[] $tokens
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return HTMLPurifier_Token[]
     */
    public function execute($tokens, $config, $context)
    {
        // setup validator
        $validator = new HTMLPurifier_AttrValidator();

        $token = false;
        $context->register('CurrentToken', $token);

        foreach ($tokens as $key => $token) {

            // only process tokens that have attributes,
            //   namely start and empty tags
            if (!$token instanceof HTMLPurifier_Token_Start && !$token instanceof HTMLPurifier_Token_Empty) {
                continue;
            }

            // skip tokens that are armored
            if (!empty($token->armor['ValidateAttributes'])) {
                continue;
            }

            // note that we have no facilities here for removing tokens
            $validator->validateToken($token, $config, $context);
        }
        $context->destroy('CurrentToken');
        return $tokens;
    }
}
/**
 * Transforms FONT tags to the proper form (SPAN with CSS styling)
 *
 * This transformation takes the three proprietary attributes of FONT and
 * transforms them into their corresponding CSS attributes. These are color,
 * face, and size.
 *
 * @note Size is an interesting case because it doesn't map cleanly to CSS.
 *       Thanks to
 *       http://style.cleverchimp.com/font_size_intervals/altintervals.html
 *       for reasonable mappings.
 * @warning This doesn't work completely correctly; specifically, this
 *          TagTransform operates before well-formedness is enforced, so
 *          the "active formatting elements" algorithm doesn't get applied.
 */
class HTMLPurifier_TagTransform_Font extends HTMLPurifier_TagTransform
{
    /**
     * Name of the element FONT is rewritten to.
     * @type string
     */
    public $transform_to = 'span';

    /**
     * Maps a (normalized) FONT size value to a CSS font-size value.
     * Signed entries are relative sizes, unsigned entries absolute ones.
     * @type array
     */
    protected $_size_lookup = array(
        '0' => 'xx-small',
        '1' => 'xx-small',
        '2' => 'small',
        '3' => 'medium',
        '4' => 'large',
        '5' => 'x-large',
        '6' => 'xx-large',
        '7' => '300%',
        '-1' => 'smaller',
        '-2' => '60%',
        '+1' => 'larger',
        '+2' => '150%',
        '+3' => '200%',
        '+4' => '300%'
    );

    /**
     * Returns a renamed clone of $tag. For start and empty tags the color,
     * face and size attributes are removed and their CSS equivalents are
     * prepended to the style attribute; end tags are only renamed.
     *
     * @param HTMLPurifier_Token_Tag $tag
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return HTMLPurifier_Token_Tag Clone of $tag; the input is not modified.
     */
    public function transform($tag, $config, $context)
    {
        if ($tag instanceof HTMLPurifier_Token_End) {
            $new_tag = clone $tag;
            $new_tag->name = $this->transform_to;
            return $new_tag;
        }

        $attr = $tag->attr;
        $prepend_style = '';

        // handle color transform
        if (isset($attr['color'])) {
            $prepend_style .= 'color:' . $attr['color'] . ';';
            unset($attr['color']);
        }

        // handle face transform
        if (isset($attr['face'])) {
            $prepend_style .= 'font-family:' . $attr['face'] . ';';
            unset($attr['face']);
        }

        // handle size transform
        if (isset($attr['size'])) {
            // normalize large numbers: clamp out-of-range values to the
            // nearest entry that exists in the lookup table
            if ($attr['size'] !== '') {
                if ($attr['size'][0] == '+' || $attr['size'][0] == '-') {
                    // relative size
                    $size = (int)$attr['size'];
                    if ($size < -2) {
                        $attr['size'] = '-2';
                    }
                    if ($size > 4) {
                        $attr['size'] = '+4';
                    }
                } else {
                    // absolute size
                    $size = (int)$attr['size'];
                    if ($size > 7) {
                        $attr['size'] = '7';
                    }
                }
            }
            // values without a table entry produce no font-size at all
            if (isset($this->_size_lookup[$attr['size']])) {
                $prepend_style .= 'font-size:' .
                    $this->_size_lookup[$attr['size']] . ';';
            }
            unset($attr['size']);
        }

        if ($prepend_style) {
            // generated declarations go first, so declarations already in
            // the style attribute come later in the string
            $attr['style'] = isset($attr['style']) ?
                $prepend_style . $attr['style'] :
                $prepend_style;
        }

        $new_tag = clone $tag;
        $new_tag->name = $this->transform_to;
        $new_tag->attr = $attr;

        return $new_tag;
    }
}





/**
 * Simple transformation, just change tag name to something else,
 * and possibly add some styling. This will cover most of the deprecated
 * tag cases.
 */
class HTMLPurifier_TagTransform_Simple extends HTMLPurifier_TagTransform
{
    /**
     * CSS to prepend to the tag's style attribute, or null for none.
     * @type string
     */
    protected $style;

    /**
     * @param string $transform_to Tag name to transform to.
     * @param string $style CSS style to add to the tag
     */
    public function __construct($transform_to, $style = null)
    {
        $this->transform_to = $transform_to;
        $this->style = $style;
    }

    /**
     * Returns a renamed clone of $tag. When a style was configured and the
     * tag is a start or empty tag, that style is prepended to its attributes
     * through prependCSS(), which this class does not define itself
     * (presumably inherited from HTMLPurifier_TagTransform).
     *
     * @param HTMLPurifier_Token_Tag $tag
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return HTMLPurifier_Token_Tag Clone of $tag; the input is not modified.
     */
    public function transform($tag, $config, $context)
    {
        $new_tag = clone $tag;
        $new_tag->name = $this->transform_to;
        if (!is_null($this->style) &&
            ($new_tag instanceof HTMLPurifier_Token_Start || $new_tag instanceof HTMLPurifier_Token_Empty)
        ) {
            $this->prependCSS($new_tag->attr, $this->style);
        }
        return $new_tag;
    }
}





/**
 * Concrete comment token class. Generally will be ignored.
 */
class HTMLPurifier_Token_Comment extends HTMLPurifier_Token
{
    /**
     * Character data within comment.
     * @type string
     */
    public $data;

    /**
     * Comments are always flagged as whitespace.
     * @type bool
     */
    public $is_whitespace = true;

    /**
     * Transparent constructor.
     *
     * @param string $data String comment data.
     * @param int $line
     * @param int $col
     */
    public function __construct($data, $line = null, $col = null)
    {
        $this->data = $data;
        $this->line = $line;
        $this->col = $col;
    }

    /**
     * @return HTMLPurifier_Node_Comment Node carrying the same data and position.
     */
    public function toNode() {
        return new HTMLPurifier_Node_Comment($this->data, $this->line, $this->col);
    }
}





/**
 * Abstract class of a tag token (start, end or empty), and its behavior.
 */
abstract class HTMLPurifier_Token_Tag extends HTMLPurifier_Token
{
    /**
     * Static bool marker that indicates the class is a tag.
     *
     * This allows us to check objects with <tt>!empty($obj->is_tag)</tt>
     * without having to use a function call <tt>is_a()</tt>.
     * @type bool
     */
    public $is_tag = true;

    /**
     * The lower-case name of the tag, like 'a', 'b' or 'blockquote'.
     *
     * @note Strictly speaking, XML tags are case sensitive, so we shouldn't
     * be lower-casing them, but these tokens cater to HTML tags, which are
     * insensitive.
     * @type string
     */
    public $name;

    /**
     * Associative array of the tag's attributes.
     * @type array
     */
    public $attr = array();

    /**
     * Non-overloaded constructor, which lower-cases passed tag name.
     *
     * Attribute names are lower-cased as well. When an attribute is given
     * in more than one capitalization, an entry that is already lower-case
     * keeps its value; otherwise the first variant encountered wins.
     *
     * @param string $name String name.
     * @param array $attr Associative array of attributes.
     * @param int $line
     * @param int $col
     * @param array $armor
     */
    public function __construct($name, $attr = array(), $line = null, $col = null, $armor = array())
    {
        $this->name = ctype_lower($name) ? $name : strtolower($name);
        foreach ($attr as $key => $value) {
            // PHP stores numeric-string array keys as integers. Work on the
            // string form: ctype_lower() treats small integers as ASCII
            // codes (and integer arguments are deprecated since PHP 8.1),
            // and a strict comparison of the lower-cased string against an
            // integer key would always report a change and drop the entry.
            $key_str = (string)$key;
            // normalization only necessary when key is not lowercase
            if (!ctype_lower($key_str)) {
                $new_key = strtolower($key_str);
                if ($new_key !== $key_str) {
                    // key really changed: move the value unless the
                    // lower-case name is already taken, then drop the old key
                    if (!isset($attr[$new_key])) {
                        $attr[$new_key] = $attr[$key];
                    }
                    unset($attr[$key]);
                }
            }
        }
        $this->attr = $attr;
        $this->line = $line;
        $this->col = $col;
        $this->armor = $armor;
    }

    /**
     * @return HTMLPurifier_Node_Element Node with the same name, attributes,
     *         position and armor.
     */
    public function toNode() {
        return new HTMLPurifier_Node_Element($this->name, $this->attr, $this->line, $this->col, $this->armor);
    }
}
/**
 * Concrete empty token class.
 */
class HTMLPurifier_Token_Empty extends HTMLPurifier_Token_Tag
{
    /**
     * @return HTMLPurifier_Node_Element The parent's node with its empty flag set.
     */
    public function toNode() {
        $n = parent::toNode();
        $n->empty = true;
        return $n;
    }
}





/**
 * Concrete end token class.
 *
 * @warning This class accepts attributes even though end tags cannot. This
 * is for optimization reasons, as under normal circumstances, the Lexers
 * do not pass attributes.
 */
class HTMLPurifier_Token_End extends HTMLPurifier_Token_Tag
{
    /**
     * Token that started this node.
     * Added by MakeWellFormed. Please do not edit this!
     * @type HTMLPurifier_Token
     */
    public $start;

    /**
     * End tags have no node representation.
     *
     * @throws Exception Always.
     */
    public function toNode() {
        throw new Exception("HTMLPurifier_Token_End->toNode not supported!");
    }
}





/**
 * Concrete start token class.
 */
class HTMLPurifier_Token_Start extends HTMLPurifier_Token_Tag
{
}





/**
 * Concrete text token class.
 *
 * Text tokens comprise of regular parsed character data (PCDATA) and raw
 * character data (from the CDATA sections). Internally, their
 * data is parsed with all entities expanded. Surprisingly, the text token
 * does have a "tag name" called #PCDATA, which is how the DTD represents it
 * in permissible child nodes.
 */
class HTMLPurifier_Token_Text extends HTMLPurifier_Token
{

    /**
     * PCDATA tag name compatible with DTD.
     * @type string
     */
    public $name = '#PCDATA';

    /**
     * Parsed character data of text.
     * @type string
     */
    public $data;

    /**
     * Bool indicating if node is whitespace.
     * @type bool
     */
    public $is_whitespace;

    /**
     * Constructor, accepts data and determines if it is whitespace.
     * @param string $data String parsed character data.
     * @param int $line
     * @param int $col
     */
    public function __construct($data, $line = null, $col = null)
    {
        $this->data = $data;
        // note: ctype_space() is false for the empty string
        $this->is_whitespace = ctype_space($data);
        $this->line = $line;
        $this->col = $col;
    }

    /**
     * @return HTMLPurifier_Node_Text Node with the same data, whitespace flag
     *         and position.
     */
    public function toNode() {
        return new HTMLPurifier_Node_Text($this->data, $this->is_whitespace, $this->line, $this->col);
    }
}





/**
 * URI filter that rejects every URI whose host is not our own host or one
 * of its subdomains.
 */
class HTMLPurifier_URIFilter_DisableExternal extends HTMLPurifier_URIFilter
{
    /**
     * @type string
     */
    public $name = 'DisableExternal';

    /**
     * Labels of our own host in reverse order (top-level domain first), or
     * false when the URI definition has no host.
     * @type array|bool
     */
    protected $ourHostParts = false;

    /**
     * Reads our host from the URI definition and stores its labels reversed.
     *
     * @param HTMLPurifier_Config $config
     * @return void
     */
    public function prepare($config)
    {
        $our_host = $config->getDefinition('URI')->host;
        if ($our_host !== null) {
            $this->ourHostParts = array_reverse(explode('.', $our_host));
        }
    }

    /**
     * Accepts URIs without a host, and URIs whose host ends with all labels
     * of our own host. If our own host is unknown, every URI that carries a
     * host is rejected.
     *
     * @param HTMLPurifier_URI $uri Reference
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool True to keep the URI, false to reject it.
     */
    public function filter(&$uri, $config, $context)
    {
        if (is_null($uri->host)) {
            return true;
        }
        if ($this->ourHostParts === false) {
            return false;
        }
        $host_parts = array_reverse(explode('.', $uri->host));
        foreach ($this->ourHostParts as $i => $our_part) {
            // Compare strictly: with a loose comparison PHP compares
            // numeric-looking labels as numbers, so labels such as '1e3'
            // and '1000' or '01' and '1' would count as the same host.
            if (!isset($host_parts[$i]) || $host_parts[$i] !== $our_part) {
                return false;
            }
        }
        return true;
    }
}





/**
 * Variant of DisableExternal that only restricts embedded URIs.
 */
class HTMLPurifier_URIFilter_DisableExternalResources extends HTMLPurifier_URIFilter_DisableExternal
{
    /**
     * @type string
     */
    public $name = 'DisableExternalResources';

    /**
     * Lets every URI through unless the context flags it as an embedded
     * URI; embedded URIs are handed to the parent's host check.
     *
     * @param HTMLPurifier_URI $uri
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool True to keep the URI, false to reject it.
     */
    public function filter(&$uri, $config, $context)
    {
        if (!$context->get('EmbeddedURI', true)) {
            return true;
        }
        return parent::filter($uri, $config, $context);
    }
}





/**
 * URI filter that rejects every embedded URI.
 */
class HTMLPurifier_URIFilter_DisableResources extends HTMLPurifier_URIFilter
{
    /**
     * @type string
     */
    public $name = 'DisableResources';

    /**
     * @param HTMLPurifier_URI $uri
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool False when the context flags the URI as embedded,
     *         true otherwise.
     */
    public function filter(&$uri, $config, $context)
    {
        return !$context->get('EmbeddedURI', true);
    }
}
// It's not clear to me whether or not Punycode means that hostnames
// do not have canonical forms anymore. As far as I can tell, it's
// not a problem (punycoding should be identity when no Unicode
// points are involved), but I'm not 100% sure
class HTMLPurifier_URIFilter_HostBlacklist extends HTMLPurifier_URIFilter
{
    /**
     * @type string
     */
    public $name = 'HostBlacklist';

    /**
     * Host fragments taken from URI.HostBlacklist.
     * @type array
     */
    protected $blacklist = array();

    /**
     * @param HTMLPurifier_Config $config
     * @return bool
     */
    public function prepare($config)
    {
        $this->blacklist = $config->get('URI.HostBlacklist');
        return true;
    }

    /**
     * Rejects a URI when its host contains any blacklisted fragment as a
     * substring. URIs without a host are always kept.
     *
     * @param HTMLPurifier_URI $uri
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool True to keep the URI, false to reject it.
     */
    public function filter(&$uri, $config, $context)
    {
        // A URI without a host has nothing to match against; passing null
        // to strpos() is deprecated since PHP 8.1.
        if ($uri->host === null) {
            return true;
        }
        foreach ($this->blacklist as $blacklisted_host_fragment) {
            if (strpos($uri->host, $blacklisted_host_fragment) !== false) {
                return false;
            }
        }
        return true;
    }
}





// does not support network paths

/**
 * URI filter that resolves relative URIs against the base URI of the URI
 * definition.
 */
class HTMLPurifier_URIFilter_MakeAbsolute extends HTMLPurifier_URIFilter
{
    /**
     * @type string
     */
    public $name = 'MakeAbsolute';

    /**
     * Base URI taken from the URI definition in prepare(); null when none
     * is configured. Presumably an HTMLPurifier_URI -- confirm against
     * HTMLPurifier_URIDefinition.
     * @type
     */
    protected $base;

    /**
     * Path segments of the base URI without its last segment, already
     * collapsed.
     * @type array
     */
    protected $basePathStack = array();

    /**
     * Loads the base URI and precomputes its path stack. Without a base URI
     * a warning is triggered and false is returned.
     *
     * @param HTMLPurifier_Config $config
     * @return bool
     */
    public function prepare($config)
    {
        $def = $config->getDefinition('URI');
        $this->base = $def->base;
        if (is_null($this->base)) {
            trigger_error(
                'URI.MakeAbsolute is being ignored due to lack of ' .
                'value for URI.Base configuration',
                E_USER_WARNING
            );
            return false;
        }
        // note: this writes to the object held by the URI definition itself
        $this->base->fragment = null; // fragment is invalid for base URI
        $stack = explode('/', $this->base->path);
        array_pop($stack); // discard last segment
        $stack = $this->_collapseStack($stack); // do pre-parsing
        $this->basePathStack = $stack;
        return true;
    }

    /**
     * Rewrites $uri into an absolute URI where possible. URIs that already
     * have a host, or an explicit non-hierarchical scheme, are left alone.
     *
     * @param HTMLPurifier_URI $uri
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool False only when the URI has a scheme for which no scheme
     *         object can be obtained; true otherwise.
     */
    public function filter(&$uri, $config, $context)
    {
        if (is_null($this->base)) {
            return true;
        } // abort early
        if ($uri->path === '' && is_null($uri->scheme) &&
            is_null($uri->host) && is_null($uri->query) && is_null($uri->fragment)) {
            // reference to current document
            $uri = clone $this->base;
            return true;
        }
        if (!is_null($uri->scheme)) {
            // absolute URI already: don't change
            if (!is_null($uri->host)) {
                return true;
            }
            $scheme_obj = $uri->getSchemeObj($config, $context);
            if (!$scheme_obj) {
                // scheme not recognized
                return false;
            }
            if (!$scheme_obj->hierarchical) {
                // non-hierarchal URI with explicit scheme, don't change
                return true;
            }
            // special case: had a scheme but always is hierarchical and had no authority
        }
        if (!is_null($uri->host)) {
            // network path, don't bother
            return true;
        }
        if ($uri->path === '') {
            $uri->path = $this->base->path;
        } elseif ($uri->path[0] !== '/') {
            // relative path, needs more complicated processing
            $stack = explode('/', $uri->path);
            $new_stack = array_merge($this->basePathStack, $stack);
            // with a base host the result must start with a slash, i.e.
            // with an empty leading segment
            if ($new_stack[0] !== '' && !is_null($this->base->host)) {
                array_unshift($new_stack, '');
            }
            $new_stack = $this->_collapseStack($new_stack);
            $uri->path = implode('/', $new_stack);
        } else {
            // absolute path, but still we should collapse
            $uri->path = implode('/', $this->_collapseStack(explode('/', $uri->path)));
        }
        // re-combine
        // the scheme is always taken from the base; the other authority
        // parts only where the URI does not set them itself
        $uri->scheme = $this->base->scheme;
        if (is_null($uri->userinfo)) {
            $uri->userinfo = $this->base->userinfo;
        }
        if (is_null($uri->host)) {
            $uri->host = $this->base->host;
        }
        if (is_null($uri->port)) {
            $uri->port = $this->base->port;
        }
        return true;
    }

    /**
     * Resolve dots and double-dots in a path stack
     * @param array $stack List of path segments (zero-based, without gaps).
     * @return array Collapsed list of segments; a trailing empty segment is
     *         appended when the last input segment was '.' or '..'.
     */
    private function _collapseStack($stack)
    {
        $result = array();
        $is_folder = false;
        for ($i = 0; isset($stack[$i]); $i++) {
            $is_folder = false;
            // absorb an internally duplicated slash
            // (an empty segment that is neither first nor last)
            if ($stack[$i] == '' && $i && isset($stack[$i + 1])) {
                continue;
            }
            if ($stack[$i] == '..') {
                if (!empty($result)) {
                    $segment = array_pop($result);
                    if ($segment === '' && empty($result)) {
                        // error case: attempted to back out too far:
                        // restore the leading slash
                        $result[] = '';
                    } elseif ($segment === '..') {
                        // NOTE(review): only the popped '..' is put back, so
                        // two consecutive '..' segments leave a single '..'
                        // in the result -- confirm this is intended.
                        $result[] = '..'; // cannot remove .. with ..
                    }
                } else {
                    // relative path, preserve the double-dots
                    $result[] = '..';
                }
                $is_folder = true;
                continue;
            }
            if ($stack[$i] == '.') {
                // silently absorb
                $is_folder = true;
                continue;
            }
            $result[] = $stack[$i];
        }
        if ($is_folder) {
            $result[] = '';
        }
        return $result;
    }
}
$this->name); 21764 $this->parser = new HTMLPurifier_URIParser(); 21765 $this->doEmbed = $config->get('URI.MungeResources'); 21766 $this->secretKey = $config->get('URI.MungeSecretKey'); 21767 if ($this->secretKey && !function_exists('hash_hmac')) { 21768 throw new Exception("Cannot use %URI.MungeSecretKey without hash_hmac support."); 21769 } 21770 return true; 21771 } 21772 21773 /** 21774 * @param HTMLPurifier_URI $uri 21775 * @param HTMLPurifier_Config $config 21776 * @param HTMLPurifier_Context $context 21777 * @return bool 21778 */ 21779 public function filter(&$uri, $config, $context) 21780 { 21781 if ($context->get('EmbeddedURI', true) && !$this->doEmbed) { 21782 return true; 21783 } 21784 21785 $scheme_obj = $uri->getSchemeObj($config, $context); 21786 if (!$scheme_obj) { 21787 return true; 21788 } // ignore unknown schemes, maybe another postfilter did it 21789 if (!$scheme_obj->browsable) { 21790 return true; 21791 } // ignore non-browseable schemes, since we can't munge those in a reasonable way 21792 if ($uri->isBenign($config, $context)) { 21793 return true; 21794 } // don't redirect if a benign URL 21795 21796 $this->makeReplace($uri, $config, $context); 21797 $this->replace = array_map('rawurlencode', $this->replace); 21798 21799 $new_uri = strtr($this->target, $this->replace); 21800 $new_uri = $this->parser->parse($new_uri); 21801 // don't redirect if the target host is the same as the 21802 // starting host 21803 if ($uri->host === $new_uri->host) { 21804 return true; 21805 } 21806 $uri = $new_uri; // overwrite 21807 return true; 21808 } 21809 21810 /** 21811 * @param HTMLPurifier_URI $uri 21812 * @param HTMLPurifier_Config $config 21813 * @param HTMLPurifier_Context $context 21814 */ 21815 protected function makeReplace($uri, $config, $context) 21816 { 21817 $string = $uri->toString(); 21818 // always available 21819 $this->replace['%s'] = $string; 21820 $this->replace['%r'] = $context->get('EmbeddedURI', true); 21821 $token = 
$context->get('CurrentToken', true); 21822 $this->replace['%n'] = $token ? $token->name : null; 21823 $this->replace['%m'] = $context->get('CurrentAttr', true); 21824 $this->replace['%p'] = $context->get('CurrentCSSProperty', true); 21825 // not always available 21826 if ($this->secretKey) { 21827 $this->replace['%t'] = hash_hmac("sha256", $string, $this->secretKey); 21828 } 21829 } 21830} 21831 21832 21833 21834 21835 21836/** 21837 * Implements safety checks for safe iframes. 21838 * 21839 * @warning This filter is *critical* for ensuring that %HTML.SafeIframe 21840 * works safely. 21841 */ 21842class HTMLPurifier_URIFilter_SafeIframe extends HTMLPurifier_URIFilter 21843{ 21844 /** 21845 * @type string 21846 */ 21847 public $name = 'SafeIframe'; 21848 21849 /** 21850 * @type bool 21851 */ 21852 public $always_load = true; 21853 21854 /** 21855 * @type string 21856 */ 21857 protected $regexp = null; 21858 21859 // XXX: The not so good bit about how this is all set up now is we 21860 // can't check HTML.SafeIframe in the 'prepare' step: we have to 21861 // defer till the actual filtering. 
21862 /** 21863 * @param HTMLPurifier_Config $config 21864 * @return bool 21865 */ 21866 public function prepare($config) 21867 { 21868 $this->regexp = $config->get('URI.SafeIframeRegexp'); 21869 return true; 21870 } 21871 21872 /** 21873 * @param HTMLPurifier_URI $uri 21874 * @param HTMLPurifier_Config $config 21875 * @param HTMLPurifier_Context $context 21876 * @return bool 21877 */ 21878 public function filter(&$uri, $config, $context) 21879 { 21880 // check if filter not applicable 21881 if (!$config->get('HTML.SafeIframe')) { 21882 return true; 21883 } 21884 // check if the filter should actually trigger 21885 if (!$context->get('EmbeddedURI', true)) { 21886 return true; 21887 } 21888 $token = $context->get('CurrentToken', true); 21889 if (!($token && $token->name == 'iframe')) { 21890 return true; 21891 } 21892 // check if we actually have some whitelists enabled 21893 if ($this->regexp === null) { 21894 return false; 21895 } 21896 // actually check the whitelists 21897 return preg_match($this->regexp, $uri->toString()); 21898 } 21899} 21900 21901 21902 21903 21904 21905/** 21906 * Implements data: URI for base64 encoded images supported by GD. 
*/
class HTMLPurifier_URIScheme_data extends HTMLPurifier_URIScheme
{
    /**
     * @type bool
     */
    public $browsable = true;

    /**
     * Content types that may be emitted, as a lookup array.
     * @type array
     */
    public $allowed_types = array(
        // you better write validation code for other types if you
        // decide to allow them
        'image/jpeg' => true,
        'image/gif' => true,
        'image/png' => true,
    );
    // this is actually irrelevant since we only write out the path
    // component
    /**
     * @type bool
     */
    public $may_omit_host = true;

    /**
     * Validates a data: URI and rewrites it in canonical form.
     *
     * The payload is decoded and its real image type is detected from the
     * bytes. The URI is accepted only when that detected type is in
     * $allowed_types; a declared content type that disagrees with the
     * detected one is replaced by it. On success every component except the
     * path is cleared and the path becomes "<type>;base64,<payload>".
     *
     * @param HTMLPurifier_URI $uri URI to validate; modified in place
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool
     */
    public function doValidate(&$uri, $config, $context)
    {
        $result = explode(',', $uri->path, 2);
        $is_base64 = false;
        $content_type = null;
        if (count($result) == 2) {
            list($metadata, $data) = $result;
            // do some legwork on the metadata
            foreach (explode(';', $metadata) as $cur) {
                if ($cur == 'base64') {
                    // everything after the base64 marker is ignored
                    $is_base64 = true;
                    break;
                }
                if (substr($cur, 0, 8) == 'charset=') {
                    // doesn't match if there are arbitrary spaces; the
                    // charset is never used, since plaintext is not allowed
                    continue;
                }
                if ($content_type === null) {
                    // the first non-charset entry is the content type; any
                    // later ones are garbage
                    $content_type = $cur;
                }
            }
        } else {
            $data = $result[0];
        }
        if ($content_type !== null && empty($this->allowed_types[$content_type])) {
            return false;
        }
        $data = rawurldecode($data);
        $raw_data = $is_base64 ? base64_decode($data) : $data;
        if (strlen($raw_data) < 12) {
            // too short to be a real image; the original author notes that
            // exif_imagetype misbehaves on small files, and this likely
            // indicates a corrupt URI/failed parse anyway
            return false;
        }
        $image_code = $this->_detectImageType($raw_data);
        if ($image_code === false) {
            return false;
        }
        $real_content_type = image_type_to_mime_type($image_code);
        if ($real_content_type != $content_type) {
            // we're nice guys; if the content type is something else we
            // support, change it over
            if (empty($this->allowed_types[$real_content_type])) {
                return false;
            }
            $content_type = $real_content_type;
        }
        // ok, it's kosher, rewrite what we need
        $uri->userinfo = null;
        $uri->host = null;
        $uri->port = null;
        $uri->fragment = null;
        $uri->query = null;
        $uri->path = "$content_type;base64," . base64_encode($raw_data);
        return true;
    }

    /**
     * Detects the image type of a binary payload by writing it to a
     * temporary file and probing it with exif_imagetype() or, failing that,
     * getimagesize(). The temporary file is removed on every path.
     *
     * XXX probably want to refactor this into a general mechanism
     * for filtering arbitrary content types
     *
     * @param string $raw_data decoded payload
     * @return int|bool IMAGETYPE_* code, or false when the payload is not a
     *                  recognised image or could not be inspected
     */
    private function _detectImageType($raw_data)
    {
        $use_exif = function_exists('exif_imagetype');
        if (!$use_exif && !function_exists('getimagesize')) {
            // checked before touching the disk, so nothing is left behind
            trigger_error("could not find exif_imagetype or getimagesize functions", E_USER_ERROR);
            return false;
        }
        if (function_exists('sys_get_temp_dir')) {
            $file = tempnam(sys_get_temp_dir(), "");
        } else {
            $file = tempnam("/tmp", "");
        }
        if ($file === false) {
            return false;
        }
        if (file_put_contents($file, $raw_data) === false) {
            unlink($file);
            return false;
        }
        if ($use_exif) {
            $image_code = exif_imagetype($file);
        } else {
            set_error_handler(array($this, 'muteErrorHandler'));
            $info = getimagesize($file);
            restore_error_handler();
            $image_code = $info ? $info[2] : false;
        }
        unlink($file);
        return $image_code;
    }

    /**
     * No-op error handler, installed around getimagesize() to silence it.
     *
     * @param int $errno
     * @param string $errstr
     */
    public function muteErrorHandler($errno, $errstr)
    {
    }
}



/**
 * Validates file as defined by RFC 1630 and RFC 1738.
*/
class HTMLPurifier_URIScheme_file extends HTMLPurifier_URIScheme
{
    /**
     * Not browsable: file:// URLs are generally not reachable from most
     * machines, so using one as an img src would be incorrect.
     * @type bool
     */
    public $browsable = false;

    /**
     * The host may be left out. This is basically the *only* URI scheme
     * for which that holds, because referring to files on the local machine
     * is very common. Browsers on some operating systems do not understand
     * the authority at all, although it is reportedly used on Windows to
     * refer to network shares.
     * @type bool
     */
    public $may_omit_host = true;

    /**
     * Strips the components that carry no meaning for file: URIs.
     *
     * @param HTMLPurifier_URI $uri URI to normalise; modified in place
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool always true
     */
    public function doValidate(&$uri, $config, $context)
    {
        // A query string seems to work in Firefox, but it cannot have any
        // effect, so it goes.
        $uri->query = null;
        // file:// has no notion of a port for reaching the resource ...
        $uri->port = null;
        // ... nor of authentication.
        $uri->userinfo = null;
        return true;
    }
}





/**
 * Validates ftp (File Transfer Protocol) URIs as defined by generic RFC 1738.
*/
class HTMLPurifier_URIScheme_ftp extends HTMLPurifier_URIScheme
{
    /**
     * @type int
     */
    public $default_port = 21;

    /**
     * @type bool
     */
    public $browsable = true; // usually

    /**
     * @type bool
     */
    public $hierarchical = true;

    /**
     * Drops the query and normalises a trailing ";type=<code>" typecode.
     *
     * Only the text after the last semicolon is considered. It is kept as
     * ";type=a", ";type=i" or ";type=d" when it has exactly that form, is
     * discarded when the key is "type" but the code is something else, and
     * is otherwise glued back onto the path with its semicolon encoded.
     * Every other semicolon in the path is encoded as %3B.
     *
     * @param HTMLPurifier_URI $uri URI to normalise; modified in place
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool always true
     */
    public function doValidate(&$uri, $config, $context)
    {
        $uri->query = null;

        // typecode check: look for the last semicolon
        $split_at = strrpos($uri->path, ';');
        if ($split_at === false) {
            return true;
        }

        $suffix = substr($uri->path, $split_at + 1); // text after the semicolon
        $uri->path = substr($uri->path, 0, $split_at);

        $typecode_suffix = '';
        $pieces = explode('=', $suffix, 2);
        if (count($pieces) < 2 || $pieces[0] !== 'type') {
            // not a key=value pair, or not the "type" key: tack it back on
            // with the semicolon encoded
            $uri->path .= '%3B' . $suffix;
        } elseif (in_array($pieces[1], array('a', 'i', 'd'), true)) {
            $typecode_suffix = ";type={$pieces[1]}";
        }

        $uri->path = str_replace(';', '%3B', $uri->path);
        $uri->path .= $typecode_suffix;
        return true;
    }
}





/**
 * Validates http (HyperText Transfer Protocol) as defined by RFC 2616
 */
class HTMLPurifier_URIScheme_http extends HTMLPurifier_URIScheme
{
    /**
     * @type int
     */
    public $default_port = 80;

    /**
     * @type bool
     */
    public $browsable = true;

    /**
     * @type bool
     */
    public $hierarchical = true;

    /**
     * Removes the userinfo component; nothing else is checked here.
     *
     * @param HTMLPurifier_URI $uri URI to normalise; modified in place
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool always true
     */
    public function doValidate(&$uri, $config, $context)
    {
        $uri->userinfo = null;

        return true;
    }
}





/**
 * Validates https (Secure HTTP) according to http scheme.
 */
class HTMLPurifier_URIScheme_https extends HTMLPurifier_URIScheme_http
{
    /**
     * @type int
     */
    public $default_port = 443;

    /**
     * @type bool
     */
    public $secure = true;
}





// VERY RELAXED! Shouldn't cause problems, not even Firefox checks if the
// email is valid, but be careful!
22204 22205/** 22206 * Validates mailto (for E-mail) according to RFC 2368 22207 * @todo Validate the email address 22208 * @todo Filter allowed query parameters 22209 */ 22210 22211class HTMLPurifier_URIScheme_mailto extends HTMLPurifier_URIScheme 22212{ 22213 /** 22214 * @type bool 22215 */ 22216 public $browsable = false; 22217 22218 /** 22219 * @type bool 22220 */ 22221 public $may_omit_host = true; 22222 22223 /** 22224 * @param HTMLPurifier_URI $uri 22225 * @param HTMLPurifier_Config $config 22226 * @param HTMLPurifier_Context $context 22227 * @return bool 22228 */ 22229 public function doValidate(&$uri, $config, $context) 22230 { 22231 $uri->userinfo = null; 22232 $uri->host = null; 22233 $uri->port = null; 22234 // we need to validate path against RFC 2368's addr-spec 22235 return true; 22236 } 22237} 22238 22239 22240 22241 22242 22243/** 22244 * Validates news (Usenet) as defined by generic RFC 1738 22245 */ 22246class HTMLPurifier_URIScheme_news extends HTMLPurifier_URIScheme 22247{ 22248 /** 22249 * @type bool 22250 */ 22251 public $browsable = false; 22252 22253 /** 22254 * @type bool 22255 */ 22256 public $may_omit_host = true; 22257 22258 /** 22259 * @param HTMLPurifier_URI $uri 22260 * @param HTMLPurifier_Config $config 22261 * @param HTMLPurifier_Context $context 22262 * @return bool 22263 */ 22264 public function doValidate(&$uri, $config, $context) 22265 { 22266 $uri->userinfo = null; 22267 $uri->host = null; 22268 $uri->port = null; 22269 $uri->query = null; 22270 // typecode check needed on path 22271 return true; 22272 } 22273} 22274 22275 22276 22277 22278 22279/** 22280 * Validates nntp (Network News Transfer Protocol) as defined by generic RFC 1738 22281 */ 22282class HTMLPurifier_URIScheme_nntp extends HTMLPurifier_URIScheme 22283{ 22284 /** 22285 * @type int 22286 */ 22287 public $default_port = 119; 22288 22289 /** 22290 * @type bool 22291 */ 22292 public $browsable = false; 22293 22294 /** 22295 * @param HTMLPurifier_URI $uri 22296 * 
@param HTMLPurifier_Config $config 22297 * @param HTMLPurifier_Context $context 22298 * @return bool 22299 */ 22300 public function doValidate(&$uri, $config, $context) 22301 { 22302 $uri->userinfo = null; 22303 $uri->query = null; 22304 return true; 22305 } 22306} 22307 22308 22309 22310 22311 22312/** 22313 * Validates tel (for phone numbers). 22314 * 22315 * The relevant specifications for this protocol are RFC 3966 and RFC 5341, 22316 * but this class takes a much simpler approach: we normalize phone 22317 * numbers so that they only include (possibly) a leading plus, 22318 * and then any number of digits and x'es. 22319 */ 22320 22321class HTMLPurifier_URIScheme_tel extends HTMLPurifier_URIScheme 22322{ 22323 /** 22324 * @type bool 22325 */ 22326 public $browsable = false; 22327 22328 /** 22329 * @type bool 22330 */ 22331 public $may_omit_host = true; 22332 22333 /** 22334 * @param HTMLPurifier_URI $uri 22335 * @param HTMLPurifier_Config $config 22336 * @param HTMLPurifier_Context $context 22337 * @return bool 22338 */ 22339 public function doValidate(&$uri, $config, $context) 22340 { 22341 $uri->userinfo = null; 22342 $uri->host = null; 22343 $uri->port = null; 22344 22345 // Delete all non-numeric characters, non-x characters 22346 // from phone number, EXCEPT for a leading plus sign. 22347 $uri->path = preg_replace('/(?!^\+)[^\dx]/', '', 22348 // Normalize e(x)tension to lower-case 22349 str_replace('X', 'x', $uri->path)); 22350 22351 return true; 22352 } 22353} 22354 22355 22356 22357 22358 22359/** 22360 * Performs safe variable parsing based on types which can be used by 22361 * users. This may not be able to represent all possible data inputs, 22362 * however. 
*/
class HTMLPurifier_VarParser_Flexible extends HTMLPurifier_VarParser
{
    /**
     * Converts a loosely-typed value into the requested type.
     *
     * Scalars are converted only when the input unambiguously represents
     * the target type (digit strings for INT, numeric strings and ints for
     * FLOAT, 0/1 and on/off/true/false/1/0 for BOOL); anything else is
     * returned unchanged. Strings given for ALIST, HASH and LOOKUP are
     * split on commas (and on line breaks when the string contains any),
     * trimmed, and for HASH further split into key:value pairs.
     *
     * @param mixed $var value to convert
     * @param int $type one of the type constants referenced below
     * @param bool $allow_null whether a null input is returned as null
     * @return array|bool|float|int|mixed|null|string
     * @throws HTMLPurifier_VarParserException for an unrecognised BOOL string
     */
    protected function parseImplementation($var, $type, $allow_null)
    {
        if ($allow_null && $var === null) {
            return null;
        }
        switch ($type) {
            // Note: leaving the switch with "break" ends up in the
            // errorGeneric() call at the bottom. More specific errors are
            // raised directly inside the cases.
            case self::MIXED:
            case self::ISTRING:
            case self::STRING:
            case self::TEXT:
            case self::ITEXT:
                return $var;
            case self::INT:
                return (is_string($var) && ctype_digit($var)) ? (int)$var : $var;
            case self::FLOAT:
                $convertible = (is_string($var) && is_numeric($var)) || is_int($var);
                return $convertible ? (float)$var : $var;
            case self::BOOL:
                if (is_int($var) && ($var === 0 || $var === 1)) {
                    return (bool)$var;
                }
                if (!is_string($var)) {
                    return $var;
                }
                if ($var == 'on' || $var == 'true' || $var == '1') {
                    return true;
                }
                if ($var == 'off' || $var == 'false' || $var == '0') {
                    return false;
                }
                throw new HTMLPurifier_VarParserException("Unrecognized value '$var' for $type");
            case self::ALIST:
            case self::HASH:
            case self::LOOKUP:
                if (is_string($var)) {
                    // special case: strictly speaking this would be an
                    // array holding one empty string, but an empty array
                    // is the more intuitive result
                    if ($var == '') {
                        return array();
                    }
                    $single_line = strpos($var, "\n") === false && strpos($var, "\r") === false;
                    if ($single_line) {
                        // simplistic splitting that only works for simple
                        // lists of tag names or alphanumeric characters
                        $var = explode(',', $var);
                    } else {
                        $var = preg_split('/(,|[\n\r]+)/', $var);
                    }
                    // remove surrounding whitespace
                    foreach ($var as $index => $item) {
                        $var[$index] = trim($item);
                    }
                    if ($type === self::HASH) {
                        // key:value,key2:value2 -- entries without a colon
                        // are dropped
                        $pairs = array();
                        foreach ($var as $entry) {
                            $parts = explode(':', $entry, 2);
                            if (isset($parts[1])) {
                                $pairs[trim($parts[0])] = trim($parts[1]);
                            }
                        }
                        $var = $pairs;
                    }
                }
                if (!is_array($var)) {
                    break;
                }
                $keys = array_keys($var);
                if ($keys === array_keys($keys)) {
                    // keys are exactly 0..n-1: a plain list
                    if ($type == self::ALIST) {
                        return $var;
                    }
                    if ($type == self::LOOKUP) {
                        $lookup = array();
                        foreach ($var as $member) {
                            $lookup[$member] = true;
                        }
                        return $lookup;
                    }
                    // a plain list is not acceptable for a HASH
                    break;
                }
                if ($type === self::ALIST) {
                    trigger_error("Array list did not have consecutive integer indexes", E_USER_WARNING);
                    return array_values($var);
                }
                if ($type === self::LOOKUP) {
                    // warn about every non-true value, then force it to true
                    foreach ($var as $key => $value) {
                        if ($value !== true) {
                            trigger_error(
                                "Lookup array has non-true value at key '$key'; " .
                                "maybe your input array was not indexed numerically",
                                E_USER_WARNING
                            );
                        }
                        $var[$key] = true;
                    }
                }
                return $var;
            default:
                $this->errorInconsistent(__CLASS__, $type);
        }
        $this->errorGeneric($var, $type);
    }
}





/**
 * This variable parser uses PHP's internal code engine. Because it does
 * this, it can represent all inputs; however, it is dangerous and cannot
 * be used by users.
*/
class HTMLPurifier_VarParser_Native extends HTMLPurifier_VarParser
{

    /**
     * Evaluates $var as a PHP expression and returns the result; $type and
     * $allow_null are not consulted here.
     *
     * @param mixed $var PHP expression source
     * @param int $type
     * @param bool $allow_null
     * @return mixed
     * @throws HTMLPurifier_VarParserException
     */
    protected function parseImplementation($var, $type, $allow_null)
    {
        return $this->evalExpression($var);
    }

    /**
     * Evaluates a PHP expression and returns its value.
     *
     * SECURITY: the expression is executed with eval(); it must never be
     * derived from untrusted input.
     *
     * @param string $expr PHP expression, without a trailing semicolon
     * @return mixed
     * @throws HTMLPurifier_VarParserException when the expression cannot be
     *                                         parsed
     */
    protected function evalExpression($expr)
    {
        $var = null;
        try {
            $result = eval("\$var = $expr;");
        } catch (ParseError $e) {
            // As of PHP 7 a syntax error inside eval() raises ParseError
            // instead of making eval() return false; translate it so callers
            // still get the documented exception type. On PHP 5 the class
            // does not exist and this clause simply never matches.
            throw new HTMLPurifier_VarParserException("Fatal error in evaluated code");
        }
        if ($result === false) {
            // PHP 5 reports a parse error this way
            throw new HTMLPurifier_VarParserException("Fatal error in evaluated code");
        }
        return $var;
    }
}