<?php
// Copyright (C) 2016-2017 Combodo SARL
//
//   This file is part of iTop.
//
//   iTop is free software; you can redistribute it and/or modify
//   it under the terms of the GNU Affero General Public License as published by
//   the Free Software Foundation, either version 3 of the License, or
//   (at your option) any later version.
//
//   iTop is distributed in the hope that it will be useful,
//   but WITHOUT ANY WARRANTY; without even the implied warranty of
//   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//   GNU Affero General Public License for more details.
//
//   You should have received a copy of the GNU Affero General Public License
//   along with iTop. If not, see <http://www.gnu.org/licenses/>

/**
 * Base class for all possible implementations of HTML Sanitization
 *
 * Concrete sanitizers implement DoSanitize(); callers normally go through the
 * static Sanitize() entry point, which picks the implementation named by the
 * "html_sanitizer" configuration entry.
 */
abstract class HTMLSanitizer
{
	public function __construct()
	{
		// Do nothing..
	}

	/**
	 * Sanitizes the given HTML document
	 *
	 * @param string $sHTML
	 * @return string
	 */
	abstract public function DoSanitize($sHTML);

	/**
	 * Sanitize an HTML string with the configured sanitizer, falling back to HTMLDOMSanitizer
	 * in case of Exception or invalid configuration.
	 *
	 * Behaviour on failure:
	 * - the configured class does not exist or does not extend HTMLSanitizer: a warning is logged
	 *   and HTMLDOMSanitizer is used instead;
	 * - the configured (non HTMLDOMSanitizer) sanitizer throws an Exception: a warning is logged and
	 *   the string is sanitized again with HTMLDOMSanitizer (an exception thrown by that second
	 *   attempt is not caught here);
	 * - HTMLDOMSanitizer itself throws an Exception: an error is logged and the input is returned
	 *   as is, i.e. NOT sanitized.
	 *
	 * @param string $sHTML
	 * @return string
	 */
	public static function Sanitize($sHTML)
	{
		$sSanitizerClass = MetaModel::GetConfig()->Get('html_sanitizer');
		if (!class_exists($sSanitizerClass))
		{
			IssueLog::Warning('The configured "html_sanitizer" class "'.$sSanitizerClass.'" is not a valid class. Will use HTMLDOMSanitizer as the default sanitizer.');
			$sSanitizerClass = 'HTMLDOMSanitizer';
		}
		else if (!is_subclass_of($sSanitizerClass, 'HTMLSanitizer'))
		{
			IssueLog::Warning('The configured "html_sanitizer" class "'.$sSanitizerClass.'" is not a subclass of HTMLSanitizer. Will use HTMLDOMSanitizer as the default sanitizer.');
			$sSanitizerClass = 'HTMLDOMSanitizer';
		}

		try
		{
			$oSanitizer = new $sSanitizerClass();
			$sCleanHTML = $oSanitizer->DoSanitize($sHTML);
		}
		catch (Exception $e)
		{
			if ($sSanitizerClass !== 'HTMLDOMSanitizer')
			{
				IssueLog::Warning('Failed to sanitize an HTML string with "'.$sSanitizerClass.'". The following exception occured: '.$e->getMessage());
				IssueLog::Warning('Will try to sanitize with HTMLDOMSanitizer.');
				// try again with the HTMLDOMSanitizer
				$oSanitizer = new HTMLDOMSanitizer();
				$sCleanHTML = $oSanitizer->DoSanitize($sHTML);
			}
			else
			{
				IssueLog::Error('Failed to sanitize an HTML string with "HTMLDOMSanitizer". The following exception occured: '.$e->getMessage());
				IssueLog::Error('The HTML will NOT be sanitized.');
				$sCleanHTML = $sHTML;
			}
		}
		return $sCleanHTML;
	}
}

/**
 * Dummy HTMLSanitizer which does nothing at all!
 * Can be used if HTML Sanitization is not important
 * (for example when importing "safe" data during an on-boarding)
 * and performance is at stake
 *
 */
class HTMLNullSanitizer extends HTMLSanitizer
{
	/**
	 * Returns the input unchanged.
	 *
	 * @see HTMLSanitizer::DoSanitize()
	 * @param string $sHTML
	 * @return string
	 */
	public function DoSanitize($sHTML)
	{
		return $sHTML;
	}

}

/**
 * A standard-compliant HTMLSanitizer based on the HTMLPurifier library by Edward Z. Yang
 * Complete but quite slow
 * http://htmlpurifier.org
 */
/*
class HTMLPurifierSanitizer extends HTMLSanitizer
{
	protected static $oPurifier = null;

	public function __construct()
	{
		if (self::$oPurifier == null)
		{
			$sLibPath = APPROOT.'lib/htmlpurifier/HTMLPurifier.auto.php';
			if (!file_exists($sLibPath))
			{
				throw new Exception("Missing library '$sLibPath', cannot use HTMLPurifierSanitizer.");
			}
			require_once($sLibPath);

			$oPurifierConfig = HTMLPurifier_Config::createDefault();
			$oPurifierConfig->set('Core.Encoding', 'UTF-8'); // defaults to 'UTF-8'
			$oPurifierConfig->set('HTML.Doctype', 'XHTML 1.0 Strict'); // defaults to 'XHTML 1.0 Transitional'
			$oPurifierConfig->set('URI.AllowedSchemes', array (
				'http' => true,
				'https' => true,
				'data' => true, // This one is not present by default
			));
			$sPurifierCache = APPROOT.'data/HTMLPurifier';
			if (!is_dir($sPurifierCache))
			{
				mkdir($sPurifierCache);
			}
			if (!is_dir($sPurifierCache))
			{
				throw new Exception("Could not create the cache directory '$sPurifierCache'");
			}
			$oPurifierConfig->set('Cache.SerializerPath', $sPurifierCache); // no trailing slash
			self::$oPurifier = new HTMLPurifier($oPurifierConfig);
		}
	}

	public function DoSanitize($sHTML)
	{
		$sCleanHtml = self::$oPurifier->purify($sHTML);
		return $sCleanHtml;
	}
}
*/

/**
 * Whitelist based sanitizer built on DOMDocument.
 *
 * The HTML is loaded into a DOM tree, then every element, attribute and inline style
 * that is not explicitly whitelisted below is removed, as well as all comments.
 */
class HTMLDOMSanitizer extends HTMLSanitizer
{
	/** @var DOMDocument Document being sanitized, (re)created by each call to DoSanitize() */
	protected $oDoc;

	/**
	 * Allowed tags (keys) and, for each of them, the list of allowed attributes.
	 *
	 * @var array
	 * @see https://www.itophub.io/wiki/page?id=2_6_0%3Aadmin%3Arich_text_limitations
	 */
	protected static $aTagsWhiteList = array(
		'html' => array(),
		'body' => array(),
		'a' => array('href', 'name', 'style', 'target', 'title'),
		'p' => array('style'),
		'blockquote' => array('style'),
		'br' => array(),
		'span' => array('style'),
		'div' => array('style'),
		'b' => array(),
		'i' => array(),
		'u' => array(),
		'em' => array(),
		'strong' => array(),
		'img' => array('src', 'style', 'alt', 'title'),
		'ul' => array('style'),
		'ol' => array('style'),
		'li' => array('style'),
		'h1' => array('style'),
		'h2' => array('style'),
		'h3' => array('style'),
		'h4' => array('style'),
		'nav' => array('style'),
		'section' => array('style'),
		'code' => array('style'),
		'table' => array('style', 'width', 'summary', 'align', 'border', 'cellpadding', 'cellspacing'),
		'thead' => array('style'),
		'tbody' => array('style'),
		'tr' => array('style', 'colspan', 'rowspan'),
		'td' => array('style', 'colspan', 'rowspan'),
		'th' => array('style', 'colspan', 'rowspan'),
		'fieldset' => array('style'),
		'legend' => array('style'),
		'font' => array('face', 'color', 'style', 'size'),
		'big' => array(),
		'small' => array(),
		'tt' => array(),
		'kbd' => array(),
		'samp' => array(),
		'var' => array(),
		'del' => array(),
		's' => array(), // strikethrough
		'ins' => array(),
		'cite' => array(),
		'q' => array(),
		'hr' => array('style'),
		'pre' => array(),
	);

	/**
	 * Attribute name => regular expression that the attribute value must match.
	 * The 'href' entry is added by the constructor, from the configuration.
	 *
	 * @var array
	 */
	protected static $aAttrsWhiteList = array(
		'src' => '/^(http:|https:|data:)/i',
	);

	/**
	 * Names of the CSS properties allowed inside a "style" attribute.
	 *
	 * @var array
	 * @see https://www.itophub.io/wiki/page?id=2_6_0%3Aadmin%3Arich_text_limitations
	 */
	protected static $aStylesWhiteList = array(
		'background-color',
		'border',
		'border-collapse',
		'bordercolor',
		'cellpadding',
		'cellspacing',
		'color',
		'float',
		'font',
		'font-family',
		'font-size',
		'font-style',
		'height',
		'margin',
		'padding',
		'text-align',
		'vertical-align',
		'width',
		'white-space',
	);

	/**
	 * Registers, once per request, the validation pattern for the "href" attribute.
	 */
	public function __construct()
	{
		// Building href validation pattern from url and email validation patterns as the patterns are not used the same way in HTML content than in standard attributes value.
		// eg. "foo@bar.com" vs "mailto:foo@bar.com?subject=Title&body=Hello%20world"
		if (!array_key_exists('href', self::$aAttrsWhiteList))
		{
			// Regular urls
			$sUrlPattern = utils::GetConfig()->Get('url_validation_pattern');
			// Mailto urls
			$sMailtoPattern = '(mailto:(' . utils::GetConfig()->Get('email_validation_pattern') . ')(?:\?(?:subject|body)=([a-zA-Z0-9+\$_.-]*)(?:&(?:subject|body)=([a-zA-Z0-9+\$_.-]*))?)?)';

			$sPattern = $sUrlPattern . '|' . $sMailtoPattern;
			// "/" is used as the regexp delimiter, so it has to be escaped inside the pattern
			$sPattern = '/'.str_replace('/', '\/', $sPattern).'/i';
			self::$aAttrsWhiteList['href'] = $sPattern;
		}
	}

	/**
	 * Sanitizes an HTML fragment (or document) against the whitelists of this class.
	 *
	 * When the parsed document has a body, only the content of that body is returned
	 * (without the body tag itself); otherwise the whole document is serialized.
	 *
	 * @param string $sHTML
	 * @return string The sanitized HTML, an empty string for an empty input
	 */
	public function DoSanitize($sHTML)
	{
		$this->oDoc = new DOMDocument();
		// The property name is case sensitive: "preserveWhiteSpace" is the one declared by DOMDocument
		$this->oDoc->preserveWhiteSpace = true;

		// An empty string has nothing to clean; return it without involving the parser
		if (($sHTML === null) || ($sHTML === ''))
		{
			return '';
		}

		// MS outlook implements empty lines by the mean of <p><o:p></o:p></p>
		// We have to transform that into <p><br></p> (which is how Thunderbird implements empty lines)
		// Unfortunately, DOMDocument::loadHTML does not take the tag namespaces into account (once loaded there is no way to know if the tag did have a namespace)
		// therefore we have to do the transformation upfront
		$sHTML = preg_replace('@<o:p>\s*</o:p>@', '<br>', $sHTML);

		// Parsing arbitrary (often malformed) HTML produces libxml warnings: collect them internally
		// instead of silencing the call with "@", then put the error handling mode back as it was.
		$bPreviousErrorMode = libxml_use_internal_errors(true);
		// The XML prologue is prepended for loading HTML chunks where the character set is not specified
		$this->oDoc->loadHTML('<?xml encoding="UTF-8"?>'.$sHTML);
		if (!$bPreviousErrorMode)
		{
			// Nobody asked for those errors: do not let them pile up in the libxml buffer
			libxml_clear_errors();
		}
		libxml_use_internal_errors($bPreviousErrorMode);

		$this->CleanNode($this->oDoc);

		$oXPath = new DOMXPath($this->oDoc);
		$oBodyNodes = $oXPath->query('//body');

		if ($oBodyNodes->length == 0)
		{
			// No body, save the whole document
			return $this->oDoc->saveHTML();
		}

		// Export only the content of the body tag...
		$sCleanHtml = $this->oDoc->saveHTML($oBodyNodes->item(0));
		// ... and remove the body tag itself ("body" has no whitelisted attribute, so the opening tag is bare)
		return str_replace(array('<body>', '</body>'), '', $sCleanHtml);
	}

	/**
	 * Recursively cleans a node, in place:
	 * - removes the attributes which are not whitelisted for the tag, or whose value is invalid;
	 * - reduces "style" attributes to the whitelisted CSS properties;
	 * - removes the child elements with a non-whitelisted tag (and their content) and the comments;
	 * - passes each remaining "img" child to InlineImage::ProcessImageTag().
	 *
	 * @param DOMNode $oElement
	 * @return void
	 */
	protected function CleanNode(DOMNode $oElement)
	{
		if ($oElement->hasAttributes())
		{
			$sTagName = strtolower($oElement->tagName);
			// A tag missing from the whitelist is not allowed to keep any attribute
			$aAllowedAttrs = isset(self::$aTagsWhiteList[$sTagName]) ? self::$aTagsWhiteList[$sTagName] : array();

			// Gather the attributes to remove (removal is deferred, not to alter the list being iterated)
			$aAttrToRemove = array();
			foreach ($oElement->attributes as $oAttr)
			{
				$sAttr = strtolower($oAttr->name);
				if (!in_array($sAttr, $aAllowedAttrs, true))
				{
					// Forbidden (or unknown) attribute
					$aAttrToRemove[] = $oAttr->name;
				}
				else if (!$this->IsValidAttributeContent($sAttr, $oAttr->value))
				{
					// Invalid content
					$aAttrToRemove[] = $oAttr->name;
				}
				else if ($sAttr == 'style')
				{
					// Special processing for style attributes
					$sCleanStyle = $this->CleanStyle($oAttr->value);
					if ($sCleanStyle == '')
					{
						// Nothing allowed is left
						$aAttrToRemove[] = $oAttr->name;
					}
					else
					{
						$oElement->setAttribute($oAttr->name, $sCleanStyle);
					}
				}
			}
			// Now remove them
			foreach ($aAttrToRemove as $sName)
			{
				$oElement->removeAttribute($sName);
			}
		}

		if ($oElement->hasChildNodes())
		{
			// Gather the child nodes to remove (same deferred removal as for the attributes)
			$aChildElementsToRemove = array();
			foreach ($oElement->childNodes as $oNode)
			{
				$bIsElement = ($oNode instanceof DOMElement);
				if ($bIsElement && !array_key_exists(strtolower($oNode->tagName), self::$aTagsWhiteList))
				{
					$aChildElementsToRemove[] = $oNode;
				}
				else if ($oNode instanceof DOMComment)
				{
					$aChildElementsToRemove[] = $oNode;
				}
				else
				{
					// Recurse
					$this->CleanNode($oNode);
					if ($bIsElement && (strtolower($oNode->tagName) == 'img'))
					{
						InlineImage::ProcessImageTag($oNode);
					}
				}
			}
			// Now remove them
			foreach ($aChildElementsToRemove as $oDomElement)
			{
				$oElement->removeChild($oDomElement);
			}
		}
	}

	/**
	 * Filters the content of a "style" attribute, keeping only the declarations
	 * whose property name is listed in $aStylesWhiteList. The values are not checked.
	 *
	 * @param string $sStyle eg. "color:red; position:fixed"
	 * @return string The allowed declarations joined with ";", an empty string when none is allowed
	 */
	protected function CleanStyle($sStyle)
	{
		$aAllowedStyles = array();
		foreach (explode(';', $sStyle) as $sItem)
		{
			$sItem = trim($sItem);
			// The property name is whatever stands before the first ":"
			$aElements = explode(':', $sItem);
			$sProperty = strtolower(trim($aElements[0]));
			if (in_array($sProperty, static::$aStylesWhiteList, true))
			{
				$aAllowedStyles[] = $sItem;
			}
		}
		return implode(';', $aAllowedStyles);
	}

	/**
	 * Checks an attribute value against the pattern registered for this attribute, if any.
	 *
	 * @param string $sAttributeName Lowercase attribute name
	 * @param string $sValue
	 * @return bool true when no pattern is registered for the attribute or when the value matches it,
	 *              false otherwise (including when the pattern could not be evaluated)
	 */
	protected function IsValidAttributeContent($sAttributeName, $sValue)
	{
		if (array_key_exists($sAttributeName, self::$aAttrsWhiteList))
		{
			return preg_match(self::$aAttrsWhiteList[$sAttributeName], $sValue) === 1;
		}
		return true;
	}
}