<?php
/**
 * eGroupWare API: egw class to include (and configure (basic)) htmLawed by Santosh Patnaik
 *
 * @link http://www.egroupware.org
 * @license http://opensource.org/licenses/gpl-license.php GPL - GNU General Public License
 * @package api
 * @subpackage html
 * @author Klaus Leithoff <kl-AT-stylite.de>
 * @version $Id$
 */

namespace EGroupware\Api\Html;

use EGroupware\Api;

require_once(__DIR__.'/htmLawed/htmLawed.php');

/**
 * Thin wrapper around htmLawed: holds a default configuration / spec, filters <style> sections
 * separately and then hands the remaining html to htmLawed().
 *
 * The file also provides (as non class functions) the hook_tag callbacks hl_my_tag_transform and
 * hl_email_tag_transform, which do further tag / attribute validation.
 */
class HtmLawed
{
	/**
	 * Default htmLawed configuration, set in the constructor
	 *
	 * @var array
	 */
	public $Configuration;

	/**
	 * The $spec argument can be used to disallow an otherwise legal attribute for an element,
	 * or to restrict the attribute's values. This can also be helpful as a security measure
	 * (e.g., in certain versions of browsers, certain values can cause buffer overflows and
	 * denial of service attacks), or in enforcing admin policy compliance. $spec is specified
	 * as a string of text containing one or more rules, with multiple rules separated from each
	 * other by a semi-colon (;)
	 *
	 * @var string
	 */
	public $Spec;

	/**
	 * Constructor: sets the default configuration and spec used by run()
	 */
	function __construct()
	{
		// reference of the htmLawed options, may hold some Standard configuration
		/*
		$cfg = array(
			'abs_url'=>array('3', '0', 'absolute/relative URL conversion', '-1'),
			'and_mark'=>array('2', '0', 'mark original <em>&</em> chars', '0', 'd'=>1), // 'd' to disable
			'anti_link_spam'=>array('1', '0', 'modify <em>href</em> values as an anti-link spam measure', '0', array(array('30', '1', '', 'regex for extra <em>rel</em>'), array('30', '2', '', 'regex for no <em>href</em>'))),
			'anti_mail_spam'=>array('1', '0', 'replace <em>@</em> in <em>mailto:</em> URLs', '0', '8', 'NO@SPAM', 'replacement'),
			'balance'=>array('2', '1', 'fix nestings and balance tags', '0'),
			'base_url'=>array('', '', 'base URL', '25'),
			'cdata'=>array('4', 'nil', 'allow <em>CDATA</em> sections', 'nil'),
			'clean_ms_char'=>array('3', '0', 'replace bad characters introduced by Microsoft apps. like <em>Word</em>', '0'),
			'comment'=>array('4', 'nil', 'allow HTML comments', 'nil'),
			'css_expression'=>array('2', 'nil', 'allow dynamic expressions in CSS style properties', 'nil'),
			'deny_attribute'=>array('1', '0', 'denied attributes', '0', '50', '', 'these'),
			'direct_list_nest'=>array('2', 'nil', 'allow direct nesting of a list within another without requiring it to be a list item', 'nil'),
			'elements'=>array('', '', 'allowed elements', '50'),
			'hexdec_entity'=>array('3', '1', 'convert hexadecimal numeric entities to decimal ones, or vice versa', '0'),
			'hook'=>array('', '', 'name of hook function', '25'),
			'hook_tag'=>array('', '', 'name of custom function to further check attribute values', '25'),
			'keep_bad'=>array('7', '6', 'keep, or remove <em>bad</em> tag content', '0'),
			'lc_std_val'=>array('2', '1', 'lower-case std. attribute values like <em>radio</em>', '0'),
			'make_tag_strict'=>array('3', 'nil', 'transform deprecated elements', 'nil'), // 3 is a new own config value, to indicate that transformation is to be performed, but don't transform font as size transformation of numeric sizes to keywords alters the intended result too much
			'named_entity'=>array('2', '1', 'allow named entities, or convert numeric ones', '0'),
			'no_deprecated_attr'=>array('3', '1', 'allow deprecated attributes, or transform them', '0'),
			'parent'=>array('', 'div', 'name of parent element', '25'),
			'safe'=>array('2', '0', 'for most <em>safe</em> HTML', '0'),
			'schemes'=>array('', 'href: aim, feed, file, ftp, gopher, http, https, irc, mailto, news, nntp, sftp, ssh, telnet; *:file, http, https', 'allowed URL protocols', '50'),
			'show_setting'=>array('', 'htmLawed_setting', 'variable name to record <em>finalized</em> htmLawed settings', '25', 'd'=>1),
			'style_pass'=>array('2', 'nil', 'do not look at <em>style</em> attribute values', 'nil'),
			'tidy'=>array('3', '0', 'beautify/compact', '-1', '8', '1t1', 'format'),
			'unique_ids'=>array('2', '1', 'unique <em>id</em> values', '0', '8', 'my_', 'prefix'),
			'valid_xhtml'=>array('2', 'nil', 'auto-set various parameters for most valid XHTML', 'nil'),
			'xml:lang'=>array('3', 'nil', 'auto-add <em>xml:lang</em> attribute', '0'),
			'allow_for_inline' => array('table'),//block elements allowed for nesting when only inline is allowed; Example span does not allow block elements as table; table is the only element tested so far
		);
		*/

		$this->Configuration = array('comment'=>1, //remove comments
			'make_tag_strict'=>3,//3 is a new own config value, to indicate that transformation is to be performed, but don't transform font, as size transformation of numeric sizes to keywords alters the intended result too much
			'balance'=>0,//turn off tag-balancing (config['balance']=>0). That will not introduce any security risk; only standards-compliant tag nesting check/filtering will be turned off (basic tag-balance will remain; i.e., there won't be any unclosed tag, etc., after filtering)
			// tidy eats away even some wanted whitespace, so we switch it off;
			// we used it for its compacting and beautifying capabilities, which resulted in better html for further processing
			'tidy'=>0,
			'elements' => "* -script -meta -object",
			'deny_attribute' => 'on*',
			'schemes'=>'href: file, ftp, http, https, mailto, tel, phone; src: cid, data, file, ftp, http, https; *:file, http, https',
			'hook_tag' =>"hl_my_tag_transform",
		);
		$this->Spec = 'img=alt(noneof="image"/default="")';
	}

	/**
	 * Run htmLawed
	 *
	 * <style> sections are cut out of the input, filtered by self::getStyles() and prepended
	 * to the html returned by htmLawed().
	 *
	 * @param string $html2check text to check
	 * @param mixed $Config =null array is merged over $this->Configuration, empty uses $this->Configuration
	 * @param mixed $Spec =array() text or array, empty uses $this->Spec;
	 *	the '$spec' argument can be used to disallow an otherwise legal attribute for an element
	 * @return string cleaned/fixed html
	 */
	function run($html2check, $Config=null, $Spec=array())
	{
		if (is_array($Config) && is_array($this->Configuration)) $Config = array_merge($this->Configuration, $Config);
		if (empty($Config)) $Config = $this->Configuration;
		if (empty($Spec)) $Spec = $this->Spec;

		// we take out stuff in <style> stuff </style> tags and put it back in after purifying;
		// styles are processed for known security risks in self::getStyles
		// we allow filtered style sections now throughout egroupware
		$styles = self::getStyles($html2check);

		// mind our namespace when defining a function as hook. we handle our own defined hooks here.
		// hook_tag may be missing (eg. forced config) or $Config may be no array at all
		if (is_array($Config) && isset($Config['hook_tag']) &&
			in_array($Config['hook_tag'], array('hl_my_tag_transform', 'hl_email_tag_transform'), true))
		{
			$Config['hook_tag'] = __NAMESPACE__.'\\'.$Config['hook_tag'];
		}
		return ($styles?$styles:'').htmLawed($html2check, $Config, $Spec);
	}

	/**
	 * get all style tag definitions, <style> stuff </style> of the html passed in
	 * and remove it from input
	 *
	 * The collected sections (including their <style> tags) are filtered: url(http://...) calls,
	 * "javascript", "expression" and "-moz-binding" are removed, script tags are stripped via
	 * Api\Mail\Html::replaceTagsCompletley() and html comment markers are dropped.
	 *
	 * @author Leithoff, Klaus
	 * @param string &$html html to process, found style sections are removed from it
	 * @return string the filtered style css, empty string if there was none
	 */
	public static function getStyles(&$html)
	{
		$style = '';
		$style2buffer = '';
		$newStyle = null;
		$ct = 0;
		if (stripos($html,'<style')!==false) $ct = preg_match_all('#<style(?:\s.*)?>(.+)</style>#isU', $html, $newStyle);
		if ($ct>0)
		{
			$style2buffer = implode('',$newStyle[0]);
			// only replace what we have found, we use it here, as we use the same routine in Api\Mail\Html::replaceTagsCompletley
			// no need to do the extra routine
			$html = str_ireplace($newStyle[0],'',$html);
		}
		if ($style2buffer !== '')
		{
			// json_encode is used as utf-8 validity check: it returns false (very old PHP versions: "null")
			// for strings containing invalid utf-8 sequences
			$test = json_encode($style2buffer);
			if ($test === false || $test === 'null')
			{
				// this should not be needed, unless something fails with charset detection/ wrong charset passed
				error_log(__METHOD__.__LINE__.' Found Invalid sequence for utf-8 in CSS:'.$style2buffer.' Carset Detected:'.Api\Translation::detect_encoding($style2buffer));
				$style2buffer = utf8_encode($style2buffer);
			}
		}
		$style .= $style2buffer;
		// clean out comments and stuff
		$search = array(
			'@url\(http:\/\/[^\)].*?\)@si',	// url calls e.g. in style definitions
//			'@<!--[\s\S]*?[ \t\n\r]*-->@',	// Strip multi-line comments including CDATA
//			'@<!--[\s\S]*?[ \t\n\r]*--@',	// Strip broken multi-line comments including CDATA
		);
		$style = preg_replace($search,"",$style);

		// CSS Security
		// http://code.google.com/p/browsersec/wiki/Part1#Cascading_stylesheets
		// repeat until nothing is removed anymore, as a single pass would turn eg. "javajavascriptscript" into "javascript"
		$css = $style;
		do
		{
			$removed = 0;
			$css = preg_replace('/(javascript|expression|-moz-binding)/i','',$css,-1,$removed);
		}
		while ($removed > 0);
		if (stripos($css,'script')!==false) Api\Mail\Html::replaceTagsCompletley($css,'script'); // Strip out script that may be included
		// we need this, as styledefinitions are enclosed with curly brackets; and template stuff tries to replace everything between curly brackets that is having no horizontal whitespace
		// as the comments as <!-- styledefinition --> in stylesheet are outdated, and ck-editor does not understand it, we remove it
		$css_no_comment = str_replace(array(':','<!--','-->'),array(': ','',''),$css);
		// we already removed what we have found, above, as we used pretty much the same routine as in Api\Mail\Html::replaceTagsCompletley
		// no need to do the extra routine
		// TODO: we may have to strip urls and maybe comments and ifs
		return $css_no_comment;
	}

	/**
	 * Runs htmLawed over supplied html to remove malicious code
	 *
	 * Html consisting only of the (nearly) empty span carrying the users rich-text font
	 * preferences is treated as blank and returned as empty string.
	 *
	 * @param string $html
	 * @param array|string $config =null - config to influence the behavior of current purifying engine,
	 *	a string is json-decoded
	 * @param array|string $spec =array() - spec to influence the behavior of current purifying engine
	 *		The $spec argument can be used to disallow an otherwise legal attribute for an element,
	 *		or to restrict the attribute's values
	 * @param boolean $_force =false - true: use the config passed without merging it over array('valid_xhtml'=>1,'safe'=>1)
	 * @return string purified html
	 */
	public static function purify($html,$config=null,$spec=array(),$_force=false)
	{
		$defaultConfig = array('valid_xhtml'=>1,'safe'=>1);

		if (empty($html)) return $html;	// no need to process further
		if (!empty($config) && is_string($config))
		{
			$config = json_decode($config,true);
			if (is_null($config)) error_log(__METHOD__.__LINE__." decoding of config failed; standard will be applied");
		}

		// User preferences (might not be set at all)
		$font = isset($GLOBALS['egw_info']['user']['preferences']['common']['rte_font']) ?
			(string)$GLOBALS['egw_info']['user']['preferences']['common']['rte_font'] : '';
		$font_size = isset($GLOBALS['egw_info']['user']['preferences']['common']['rte_font_size']) ?
			(string)$GLOBALS['egw_info']['user']['preferences']['common']['rte_font_size'] : '';

		// Check for "blank" = just user preference span - for some reason we can't match on the entity, so approximate
		// preference values are quoted, to match them literally and to not break the pattern
		$regex = '#^<span style="[^"]*font-family:'.preg_quote($font,'#').'; font-size:'.preg_quote($font_size,'#').'pt;[^"]*">.?</span>$#us';
		if(preg_match($regex,$html))
		{
			return '';
		}
		$htmLawed = new HtmLawed();
		if (is_array($config) && $_force===false) $config = array_merge($defaultConfig, $config);
		if (empty($config)) $config = $defaultConfig;
		return $htmLawed->run($html,$config,$spec);
	}
}

/**
 * hl_my_tag_transform
 *
 * function to provide individual checks for element attribute pairs
 * implemented so far:	td	-background is removed, unless it contains the link to /index.php of this installation
 *						img	-checking for alt attribute == image; set this to empty; replacing @ in alt
 *						any	-checking for title, replacing @
 *						any	-removing "script" from style attributes
 *						blockquote -checking for cite, replacing @
 *						a	-rebuild local anchors processed by hl_email_tag_transform, empty denied javascript hrefs
 *
 * @param string $element name of the element
 * @param array|int $attribute_array =0 attribute name => value pairs of an opening tag, or a number for a closing tag
 * @return string the opening tag including its attributes, or the closing tag
 */
function hl_my_tag_transform($element, $attribute_array=0)
{
	// If second argument is not received, it means a closing tag is being handled
	if(is_numeric($attribute_array)){
		return "</$element>";
	}

	if ($element=='td' && isset($attribute_array['background']))
	{
		$internal = isset($GLOBALS['egw']) && is_object($GLOBALS['egw']) &&
			stripos($attribute_array['background'],$GLOBALS['egw']->link('/index.php'))!==false;

		if (!$internal) unset($attribute_array['background']);// only internal background images are allowed
	}
	if($element == 'img' && isset($attribute_array['alt']))
	{
		// Re-build 'alt'
		if ($attribute_array['alt'] === 'image') $attribute_array['alt'] = '';
		$attribute_array['alt'] = str_replace('@','(at)',$attribute_array['alt']);
	}
	if (isset($attribute_array['title']))
	{
		$attribute_array['title'] = str_replace('@','(at)',$attribute_array['title']);
	}
	if ($element == 'blockquote' && isset($attribute_array['cite']))
	{
		$attribute_array['cite'] = str_replace('@','(at)',$attribute_array['cite']);
	}
	if (isset($attribute_array['style']))
	{
		// repeat until nothing is removed anymore, as a single pass would turn "scrscriptipt" into "script"
		do
		{
			$stripped = 0;
			$attribute_array['style'] = str_ireplace('script','',$attribute_array['style'],$stripped);
		}
		while ($stripped > 0);
	}
	// an anchor does not necessarily have a href (eg. <a name="top">)
	if($element == 'a' && isset($attribute_array['href']))
	{
		// rebuild Anchors, if processed by hl_email_tag_transform
		if (strpos($attribute_array['href'],"denied:javascript:GoToAnchor('")===0)
		{
			$attribute_array['href']=str_ireplace("');",'',str_ireplace("denied:javascript:GoToAnchor('","#",$attribute_array['href']));
		}
		if (strpos($attribute_array['href'],"javascript:GoToAnchor('")===0)
		{
			$attribute_array['href']=str_ireplace("');",'',str_ireplace("javascript:GoToAnchor('","#",$attribute_array['href']));
		}
		if (strpos($attribute_array['href'],'denied:javascript')===0) $attribute_array['href']='';
	}

	// Build the attributes string
	$attributes = '';
	foreach($attribute_array as $k=>$v){
		$attributes .= " {$k}=\"{$v}\"";
	}

	// Return the opening tag with attributes
	static $empty_elements = array('area'=>1, 'br'=>1, 'col'=>1, 'embed'=>1, 'hr'=>1, 'img'=>1, 'input'=>1, 'isindex'=>1, 'param'=>1);
	return "<{$element}{$attributes}". (isset($empty_elements[$element]) ? ' /' : ''). '>';
}

/**
 * hl_email_tag_transform
 *
 * function to provide individual checks for element attribute pairs
 * implemented so far:	img	-checking for alt attribute == image; set this to empty
 *							-control for/on external Images and src-length
 *						td	-background is removed, unless it contains cid:
 *						a	-checking for title and href, replacing @ accordingly
 *							-navigate to local anchors without reloading the page
 *						blockquote -checking for cite, replacing @
 *						throwing away excess div elements, that carry no style or class or id info
 *
 * The function keeps state between calls (static $lastelement and $throwawaycounter) to track
 * consecutive attribute-less div elements.
 *
 * @param string $element name of the element
 * @param array|int $attribute_array =0 attribute name => value pairs of an opening tag, or a number for a closing tag
 * @return string the opening tag including its attributes, the closing tag, or '' for a thrown away div
 */
function hl_email_tag_transform($element, $attribute_array=0)
{
	static $lastelement = null;
	static $throwawaycounter = null;
	if (is_null($lastelement)) $lastelement='';
	if (is_null($throwawaycounter)) $throwawaycounter = 0;

	// attribute-less div directly following a div: count openings / closings and drop the excess ones
	if ($element=='div' && $element==$lastelement && ($attribute_array==0 || empty($attribute_array)))
	{
		if (is_array($attribute_array)) $throwawaycounter++;
		if ($attribute_array==0 && $throwawaycounter>0) $throwawaycounter--;
		if ($throwawaycounter>1) return '';
	}
	if ($lastelement=='div' && $element!=$lastelement && is_array($attribute_array)) $throwawaycounter = 0;
	if (is_array($attribute_array) && !empty($attribute_array) && $element=='div')
	{
		$lastelement = 'div_with_attr';
	}
	else
	{
		if (is_array($attribute_array)) $lastelement = $element;
	}
	// If second argument is not received, it means a closing tag is being handled
	if(is_numeric($attribute_array)){
		if($element==$lastelement) $lastelement='';
		return "</$element>";
	}

	if ($element=='td' && isset($attribute_array['background']))
	{
		// only cid style background images are allowed
		if (stripos($attribute_array['background'],'cid:')===false) unset($attribute_array['background']);
	}
	if($element == 'img')
	{
		// Re-build 'alt'
		if (isset($attribute_array['alt']))
		{
			if ($attribute_array['alt'] === 'image') $attribute_array['alt'] = '';
			$attribute_array['alt'] = str_replace('@','(at)',$attribute_array['alt']);
		}
		if (isset($attribute_array['src']))
		{
			$alt = isset($attribute_array['alt']) ? $attribute_array['alt'] : '';

			// only urls with more than 4 and less than 400 bytes are accepted
			// NOTE(review): this also blocks inline data: urls of 400 bytes or more, though the
			// configured schemes allow data: for src - confirm that this is wanted
			$src_length = strlen($attribute_array['src']);
			if (!($src_length > 4 && $src_length < 400))
			{
				$attribute_array['alt']= $alt.' [blocked (reason: url length):'.$attribute_array['src'].']';
				if (!isset($attribute_array['title'])) $attribute_array['title']=$attribute_array['alt'];
				$attribute_array['src']=Api\Image::find('api','error');
				$alt = $attribute_array['alt'];
			}
			if (!preg_match('/^cid:.*/',$attribute_array['src']))
			{
				$url = explode('/', preg_replace('/^(http|https):\/\//','',$attribute_array['src']));
				$mail_prefs = isset($GLOBALS['egw_info']['user']['preferences']['mail']) &&
					is_array($GLOBALS['egw_info']['user']['preferences']['mail']) ?
					$GLOBALS['egw_info']['user']['preferences']['mail'] : array();
				$domains = isset($mail_prefs['allowExternalDomains']) && is_array($mail_prefs['allowExternalDomains']) ?
					$mail_prefs['allowExternalDomains'] : array();
				$external_allowed = isset($mail_prefs['allowExternalIMGs']) && $mail_prefs['allowExternalIMGs'] == 1;

				// plain http: images are blocked in any case, others only if neither external images
				// in general nor the domain of the image are allowed
				if ((!$external_allowed && !in_array($url[0], $domains, true)) || substr($attribute_array['src'],0, 5) == 'http:')
				{
					//the own webserver url is not external, so it should be allowed
					$webserver_url = isset($GLOBALS['egw_info']['server']['webserver_url']) ?
						(string)$GLOBALS['egw_info']['server']['webserver_url'] : '';
					// literal prefix check, the url must not be interpreted as regular expression
					if ($webserver_url === '' || $webserver_url === '0' || strpos($attribute_array['src'],$webserver_url)!==0)
					{
						$attribute_array['alt']= $alt.' [blocked external image:'.$attribute_array['src'].']';
						if (!isset($attribute_array['title'])) $attribute_array['title']=$attribute_array['alt'];
						$attribute_array['src']=Api\Image::find('mail','no-image-shown');
						$attribute_array['border'] = 1;
						if (!empty($attribute_array['style']) && stripos($attribute_array['style'],'border')!==false)
						{
							$attribute_array['style'] = preg_replace('~border(:|-left:|-right:|-bottom:|-top:)+ (0px)+ (none)+;~si','',$attribute_array['style']);
						}
					}
				}
			}
		}
	}
	if (isset($attribute_array['style']))
	{
		// repeat until nothing is removed anymore, as a single pass would turn "scrscriptipt" into "script"
		do
		{
			$stripped = 0;
			$attribute_array['style'] = str_ireplace('script','',$attribute_array['style'],$stripped);
		}
		while ($stripped > 0);
	}
	if (isset($attribute_array['title']))
	{
		$attribute_array['title'] = str_replace('@','(at)',$attribute_array['title']);
	}
	if ($element == 'blockquote' && isset($attribute_array['cite']))
	{
		$attribute_array['cite'] = str_replace('@','(at)',$attribute_array['cite']);
	}
	if($element == 'a')
	{
		// an anchor does not necessarily have a href (eg. <a name="top">)
		if (isset($attribute_array['href']) && strpos($attribute_array['href'],'denied:javascript')===0) $attribute_array['href']='';
		if (isset($attribute_array['name']) && isset($attribute_array['id'])) $attribute_array['id'] = $attribute_array['name'];
		if (isset($attribute_array['href']))
		{
			$attribute_array['href'] = str_replace('@','%40',$attribute_array['href']);
			if (strpos($attribute_array['href'],'#')===0 && (isset(Api\Mail::$htmLawed_config['transform_anchor']) && Api\Mail::$htmLawed_config['transform_anchor']===true))
			{
				$attribute_array['href'] = "javascript:GoToAnchor('".trim(substr($attribute_array['href'],1))."');";
			}
		}
	}

	// Build the attributes string
	$attributes = '';
	foreach($attribute_array as $k=>$v){
		$attributes .= " {$k}=\"{$v}\"";
	}

	// Return the opening tag with attributes
	static $empty_elements = array('area'=>1, 'br'=>1, 'col'=>1, 'embed'=>1, 'hr'=>1, 'img'=>1, 'input'=>1, 'isindex'=>1, 'param'=>1);
	return "<{$element}{$attributes}". (isset($empty_elements[$element]) ? ' /' : ''). '>';
}