1<?php
2/**
3 * eGroupWare API: egw class to include (and configure (basic)) htmLawed by Santosh Patnaik
4 *
5 * @link http://www.egroupware.org
6 * @license http://opensource.org/licenses/gpl-license.php GPL - GNU General Public License
7 * @package api
8 * @subpackage html
9 * @author Klaus Leithoff <kl-AT-stylite.de>
10 * @version $Id$
11 */
12
13namespace EGroupware\Api\Html;
14
15use EGroupware\Api;
16
17require_once(__DIR__.'/htmLawed/htmLawed.php');
18
/**
 * Thin wrapper around htmLawed (by Santosh Patnaik) which supports autoloading and the egw namespace requirements.
 *
 * The class holds a default htmLawed configuration and spec, extracts and filters <style> sections
 * before the html is handed to htmLawed, and offers the static purify() entry point.
 * It reads the rich-text font preferences from $GLOBALS['egw_info'] and uses Api\Translation and
 * Api\Mail\Html in some code paths of getStyles().
 * The hook_tag functions used for further tag / attribute validation are defined (as non class
 * functions) in the same file.
 */
class HtmLawed
{
	/**
	 * Default htmLawed configuration, filled by the constructor
	 *
	 * @var array
	 */
	public $Configuration;

	/**
	 * The $spec argument can be used to disallow an otherwise legal attribute for an element,
	 * or to restrict the attribute's values. This can also be helpful as a security measure
	 * (e.g., in certain versions of browsers, certain values can cause buffer overflows and
	 * denial of service attacks), or in enforcing admin policy compliance. $spec is specified
	 * as a string of text containing one or more rules, with multiple rules separated from each
	 * other by a semi-colon (;)
	 *
	 * @var string
	 */
	public $Spec;

	/**
	 * Constructor, sets the default configuration and spec
	 */
	public function __construct()
	{
		// may hold some Standard configuration
		/*
		$cfg = array(
			'abs_url'=>array('3', '0', 'absolute/relative URL conversion', '-1'),
			'and_mark'=>array('2', '0', 'mark original <em>&amp;</em> chars', '0', 'd'=>1), // 'd' to disable
			'anti_link_spam'=>array('1', '0', 'modify <em>href</em> values as an anti-link spam measure', '0', array(array('30', '1', '', 'regex for extra <em>rel</em>'), array('30', '2', '', 'regex for no <em>href</em>'))),
			'anti_mail_spam'=>array('1', '0', 'replace <em>@</em> in <em>mailto:</em> URLs', '0', '8', 'NO@SPAM', 'replacement'),
			'balance'=>array('2', '1', 'fix nestings and balance tags', '0'),
			'base_url'=>array('', '', 'base URL', '25'),
			'cdata'=>array('4', 'nil', 'allow <em>CDATA</em> sections', 'nil'),
			'clean_ms_char'=>array('3', '0', 'replace bad characters introduced by Microsoft apps. like <em>Word</em>', '0'),
			'comment'=>array('4', 'nil', 'allow HTML comments', 'nil'),
			'css_expression'=>array('2', 'nil', 'allow dynamic expressions in CSS style properties', 'nil'),
			'deny_attribute'=>array('1', '0', 'denied attributes', '0', '50', '', 'these'),
			'direct_list_nest'=>array('2', 'nil', 'allow direct nesting of a list within another without requiring it to be a list item', 'nil'),
			'elements'=>array('', '', 'allowed elements', '50'),
			'hexdec_entity'=>array('3', '1', 'convert hexadecimal numeric entities to decimal ones, or vice versa', '0'),
			'hook'=>array('', '', 'name of hook function', '25'),
			'hook_tag'=>array('', '', 'name of custom function to further check attribute values', '25'),
			'keep_bad'=>array('7', '6', 'keep, or remove <em>bad</em> tag content', '0'),
			'lc_std_val'=>array('2', '1', 'lower-case std. attribute values like <em>radio</em>', '0'),
			'make_tag_strict'=>array('3', 'nil', 'transform deprecated elements', 'nil'), 3 is a new own config value, to indicate that transformation is to be performed, but don't transform font as size transformation of numeric sizes to keywords alters the intended result too much
			'named_entity'=>array('2', '1', 'allow named entities, or convert numeric ones', '0'),
			'no_deprecated_attr'=>array('3', '1', 'allow deprecated attributes, or transform them', '0'),
			'parent'=>array('', 'div', 'name of parent element', '25'),
			'safe'=>array('2', '0', 'for most <em>safe</em> HTML', '0'),
			'schemes'=>array('', 'href: aim, feed, file, ftp, gopher, http, https, irc, mailto, news, nntp, sftp, ssh, telnet; *:file, http, https', 'allowed URL protocols', '50'),
			'show_setting'=>array('', 'htmLawed_setting', 'variable name to record <em>finalized</em> htmLawed settings', '25', 'd'=>1),
			'style_pass'=>array('2', 'nil', 'do not look at <em>style</em> attribute values', 'nil'),
			'tidy'=>array('3', '0', 'beautify/compact', '-1', '8', '1t1', 'format'),
			'unique_ids'=>array('2', '1', 'unique <em>id</em> values', '0', '8', 'my_', 'prefix'),
			'valid_xhtml'=>array('2', 'nil', 'auto-set various parameters for most valid XHTML', 'nil'),
			'xml:lang'=>array('3', 'nil', 'auto-add <em>xml:lang</em> attribute', '0'),
			'allow_for_inline' => array('table'),//block elements allowed for nesting when only inline is allowed; Example span does not allow block elements as table; table is the only element tested so far
		);
		*/

		$this->Configuration = array('comment'=>1, //remove comments
			'make_tag_strict'=>3,//3 is a new own config value, to indicate that transformation is to be performed, but don't transform font, as size transformation of numeric sizes to keywords alters the intended result too much
			'balance'=>0,//turn off tag-balancing (config['balance']=>0). That will not introduce any security risk; only standards-compliant tag nesting check/filtering will be turned off (basic tag-balance will remain; i.e., there won't be any unclosed tag, etc., after filtering)
			// tidy eats away even some wanted whitespace, so we switch it off;
			// we used it for its compacting and beautifying capabilities, which resulted in better html for further processing
			'tidy'=>0,
			'elements' => "* -script -meta -object",
			'deny_attribute' => 'on*',
			'schemes'=>'href: file, ftp, http, https, mailto, tel, phone; src: cid, data, file, ftp, http, https; *:file, http, https',
			'hook_tag' =>"hl_my_tag_transform",
		);
		$this->Spec = 'img=alt(noneof="image"/default="")';
	}

	/**
	 * Run htmLawed
	 *
	 * <style> sections are taken out of the html first, filtered by self::getStyles() and
	 * prepended to the result of htmLawed.
	 *
	 * @param string $html2check text to check
	 * @param array|null $Config =null array is merged over the default configuration, empty uses the default
	 * @param array|string $Spec =array() empty uses the default spec; the spec can be used to disallow
	 *	an otherwise legal attribute for an element
	 * @return string cleaned/fixed html
	 */
	public function run($html2check, $Config=null, $Spec=array())
	{
		if (is_array($Config) && is_array($this->Configuration)) $Config = array_merge($this->Configuration, $Config);
		if (empty($Config)) $Config = $this->Configuration;
		if (empty($Spec)) $Spec = $this->Spec;

		// we allow filtered style sections throughout egroupware: take them out here and
		// put them back in front of the purified html; getStyles removes them from $html2check
		$styles = self::getStyles($html2check);

		// mind our namespace when defining a function as hook. we handle our own defined hooks here.
		if (is_array($Config) && isset($Config['hook_tag']) &&
			in_array($Config['hook_tag'], array('hl_my_tag_transform', 'hl_email_tag_transform'), true))
		{
			$Config['hook_tag'] = __NAMESPACE__.'\\'.$Config['hook_tag'];
		}
		return ($styles ? $styles : '').htmLawed($html2check, $Config, $Spec);
	}

	/**
	 * Get all style tag definitions, <style> stuff </style>, of the html passed in
	 * and remove them from the input
	 *
	 * The collected style sections are filtered: url(http://...) calls, "javascript",
	 * "expression" and "-moz-binding" are removed, script tags are stripped, html comment
	 * markers are removed and every ":" gets a trailing space.
	 *
	 * @author Leithoff, Klaus
	 * @param string &$html html to process, found style sections are removed from it
	 * @return string the filtered style sections (including their <style> tags), '' if there are none
	 */
	public static function getStyles(&$html)
	{
		$style2buffer = '';
		$newStyle = null;
		if (stripos((string)$html, '<style') !== false &&
			preg_match_all('#<style(?:\s.*)?>(.+)</style>#isU', $html, $newStyle) > 0)
		{
			$style2buffer = implode('', $newStyle[0]);
			// only replace what we have found, we use it here, as we use the same routine in Api\Mail\Html::replaceTagsCompletley
			// no need to do the extra routine
			$html = str_ireplace($newStyle[0], '', $html);
		}
		// an invalid utf-8 sequence makes preg_match with the u modifier fail;
		// this should not be needed, unless something fails with charset detection/ wrong charset passed
		if ($style2buffer !== '' && preg_match('//u', $style2buffer) !== 1)
		{
			error_log(__METHOD__.__LINE__.' Found Invalid sequence for utf-8 in CSS:'.$style2buffer.' Carset Detected:'.Api\Translation::detect_encoding($style2buffer));
			// utf8_encode is deprecated as of PHP 8.2, both calls convert from ISO-8859-1
			$style2buffer = function_exists('mb_convert_encoding') ?
				mb_convert_encoding($style2buffer, 'UTF-8', 'ISO-8859-1') : utf8_encode($style2buffer);
		}
		// clean out url calls e.g. in style definitions
		$style = (string)preg_replace('@url\(http:\/\/[^\)].*?\)@si', '', $style2buffer);

		// CSS Security
		// http://code.google.com/p/browsersec/wiki/Part1#Cascading_stylesheets
		// repeat until nothing changes, so removing one keyword can not assemble another one
		$css = $style;
		do
		{
			$before = $css;
			$css = preg_replace('/(javascript|expression|-moz-binding)/i', '', $before);
		}
		while ($css !== null && $css !== $before);
		if ($css === null) $css = '';	// regular expression failed: rather drop the styles than pass them unfiltered

		if (stripos($css, 'script') !== false) Api\Mail\Html::replaceTagsCompletley($css, 'script'); // Strip out script that may be included
		// we need this, as styledefinitions are enclosed with curly brackets; and template stuff tries to replace everything between curly brackets that is having no horizontal whitespace
		// as the comments as <!-- styledefinition --> in stylesheet are outdated, and ck-editor does not understand it, we remove it
		// TODO: we may have to strip urls and maybe comments and ifs
		return str_replace(array(':', '<!--', '-->'), array(': ', '', ''), $css);
	}

	/**
	 * Runs htmLawed over supplied html to remove malicious code
	 *
	 * Html consisting only of the span carrying the users font preferences (with at most one
	 * character in it) is treated as blank and returned as empty string.
	 *
	 * @param string $html
	 * @param array|string $config =null - config to influence the behavior of current purifying engine,
	 *	a string is expected to be a json encoded array
	 * @param array|string $spec =array() - spec to influence the behavior of current purifying engine
	 *		The $spec argument can be used to disallow an otherwise legal attribute for an element,
	 *		or to restrict the attribute's values
	 * @param boolean $_force =false - true: do not merge the config passed with array('valid_xhtml'=>1,'safe'=>1)
	 * @return string purified html
	 */
	public static function purify($html,$config=null,$spec=array(),$_force=false)
	{
		$defaultConfig = array('valid_xhtml'=>1,'safe'=>1);

		if (empty($html)) return $html;	// no need to process further
		if (!empty($config) && is_string($config))
		{
			$config = json_decode($config, true);
			if (!is_array($config))
			{
				error_log(__METHOD__.__LINE__." decoding of config failed; standard will be applied");
				$config = null;
			}
		}

		// User preferences
		$font = isset($GLOBALS['egw_info']['user']['preferences']['common']['rte_font']) ?
			(string)$GLOBALS['egw_info']['user']['preferences']['common']['rte_font'] : '';
		$font_size = isset($GLOBALS['egw_info']['user']['preferences']['common']['rte_font_size']) ?
			(string)$GLOBALS['egw_info']['user']['preferences']['common']['rte_font_size'] : '';

		// Check for "blank" = just user preference span - for some reason we can't match on the entity, so approximate
		// preferences are quoted, so characters special to regular expressions are matched literally
		$regex = '#^<span style="[^"]*font-family:'.preg_quote($font, '#').'; font-size:'.preg_quote($font_size, '#').'pt;[^"]*">.?</span>$#us';
		if (preg_match($regex, $html))
		{
			return '';
		}
		$htmLawed = new HtmLawed();
		if (is_array($config) && $_force===false) $config = array_merge($defaultConfig, $config);
		if (empty($config)) $config = $defaultConfig;

		return $htmLawed->run($html, $config, $spec);
	}
}
227
/**
 * hl_my_tag_transform
 *
 * htmLawed hook_tag function to provide individual checks for element attribute pairs
 * implemented so far:	td checking for background, only urls containing the EGroupware index.php link are kept
 * 						img checking for alt attribute == image; set this to empty; replacing @ in alt
 * 						title attribute of any element, replacing @
 * 						blockquote checking for cite, replacing @
 * 						style attribute of any element, removing "script"
 * 						a rebuilding local anchors from javascript:GoToAnchor('...') and emptying denied javascript hrefs
 *
 * @param string $element name of the element
 * @param array|int $attribute_array =0 attribute name => value pairs for an opening tag,
 *	a numeric value (the default) means a closing tag is being handled
 * @return string the opening tag with its attributes, or the closing tag
 */
function hl_my_tag_transform($element, $attribute_array=0)
{
	// If second argument is not received, it means a closing tag is being handled
	if (is_numeric($attribute_array))
	{
		return "</$element>";
	}

	if ($element == 'td' && isset($attribute_array['background']))
	{
		// only internal background images are allowed
		$is_internal = isset($GLOBALS['egw']) && is_object($GLOBALS['egw']) &&
			stripos($attribute_array['background'], $GLOBALS['egw']->link('/index.php')) !== false;
		if (!$is_internal)
		{
			unset($attribute_array['background']);
		}
	}
	if ($element == 'img' && isset($attribute_array['alt']))
	{
		// Re-build 'alt': the placeholder text "image" is dropped
		if ($attribute_array['alt'] == 'image') $attribute_array['alt'] = '';
		$attribute_array['alt'] = str_replace('@', '(at)', $attribute_array['alt']);
	}
	if (isset($attribute_array['title']))
	{
		$attribute_array['title'] = str_replace('@', '(at)', $attribute_array['title']);
	}
	if ($element == 'blockquote' && isset($attribute_array['cite']))
	{
		$attribute_array['cite'] = str_replace('@', '(at)', $attribute_array['cite']);
	}
	if (isset($attribute_array['style']) && stripos($attribute_array['style'], 'script') !== false)
	{
		$attribute_array['style'] = str_ireplace('script', '', $attribute_array['style']);
	}
	// anchors without href (e.g. <a name="...">) have nothing to rebuild
	if ($element == 'a' && isset($attribute_array['href']))
	{
		// rebuild Anchors, if processed by hl_email_tag_transform
		foreach(array("denied:javascript:GoToAnchor('", "javascript:GoToAnchor('") as $prefix)
		{
			if (strpos($attribute_array['href'], $prefix) === 0)
			{
				$attribute_array['href'] = str_ireplace("');", '', str_ireplace($prefix, '#', $attribute_array['href']));
			}
		}
		if (strpos($attribute_array['href'], 'denied:javascript') === 0) $attribute_array['href'] = '';
	}

	// Build the attributes string
	$attributes = '';
	foreach($attribute_array as $k => $v)
	{
		$attributes .= " {$k}=\"{$v}\"";
	}

	// Return the opening tag with attributes, empty elements are self-closed
	static $empty_elements = array('area'=>1, 'br'=>1, 'col'=>1, 'embed'=>1, 'hr'=>1, 'img'=>1, 'input'=>1, 'isindex'=>1, 'param'=>1);
	return "<{$element}{$attributes}". (isset($empty_elements[$element]) ? ' /' : ''). '>';
}
330
/**
 * hl_email_tag_transform
 *
 * htmLawed hook_tag function to provide individual checks for element attribute pairs
 * implemented so far:	td -only cid: background images are allowed
 * 						img -checking for alt attribute == image; set this to empty
 *							-control for/on external Images and src-length
 * 						a -checking for title and href, replacing @ accordingly
 *						  -navigate to local anchors without reloading the page
 * 						blockquote -checking for cite, replacing @
 * 						throwing away excess div elements, that carry no style or class or id info
 *
 * The function keeps state between calls (last opened element and a counter of nested
 * attribute-less divs) in static variables.
 *
 * @param string $element name of the element
 * @param array|int $attribute_array =0 attribute name => value pairs for an opening tag,
 *	a numeric value (the default) means a closing tag is being handled
 * @return string the opening tag with its attributes, the closing tag, or '' for a div thrown away
 */
function hl_email_tag_transform($element, $attribute_array=0)
{
	static $lastelement = null;
	static $throwawaycounter = null;
	if (is_null($lastelement)) $lastelement='';
	if (is_null($throwawaycounter)) $throwawaycounter = 0;

	// attribute-less div directly following another attribute-less div: count the nesting
	// and return nothing once more than one of them is open
	if ($element=='div' && $element==$lastelement && ($attribute_array==0 || empty($attribute_array)))
	{
		if (is_array($attribute_array)) $throwawaycounter++;
		if ($attribute_array==0 && $throwawaycounter>0) $throwawaycounter--;
		if ($throwawaycounter>1) return '';
	}
	if ($lastelement=='div' && $element!=$lastelement && is_array($attribute_array)) $throwawaycounter = 0;
	if (is_array($attribute_array) && !empty($attribute_array) && $element=='div')
	{
		$lastelement = 'div_with_attr';
	}
	else
	{
		if (is_array($attribute_array)) $lastelement = $element;
	}
	// If second argument is not received, it means a closing tag is being handled
	if (is_numeric($attribute_array))
	{
		if ($element==$lastelement) $lastelement='';
		return "</$element>";
	}

	if ($element=='td' && isset($attribute_array['background']) &&
		stripos($attribute_array['background'], 'cid:') === false)
	{
		unset($attribute_array['background']);// only cid style background images are allowed
	}
	if ($element == 'img')
	{
		// Re-build 'alt': the placeholder text "image" is dropped
		if (isset($attribute_array['alt']))
		{
			if ($attribute_array['alt'] == 'image') $attribute_array['alt'] = '';
			$attribute_array['alt'] = str_replace('@', '(at)', $attribute_array['alt']);
		}
		if (isset($attribute_array['src']))
		{
			// src has to be longer than 4 and shorter than 400 bytes; data: urls (an allowed scheme for src
			// in the default configuration of HtmLawed) carry the image itself and are exempt from the upper limit
			$src_length = strlen($attribute_array['src']);
			$is_data_url = stripos($attribute_array['src'], 'data:') === 0;
			if ($src_length <= 4 || ($src_length >= 400 && !$is_data_url))
			{
				$attribute_array['alt'] = (isset($attribute_array['alt']) ? $attribute_array['alt'] : '').
					' [blocked (reason: url length):'.$attribute_array['src'].']';
				if (!isset($attribute_array['title'])) $attribute_array['title'] = $attribute_array['alt'];
				$attribute_array['src'] = Api\Image::find('api','error');
			}
			// everything but cid: urls is checked against the external image preferences
			if (strpos($attribute_array['src'], 'cid:') !== 0)
			{
				$mail_prefs = isset($GLOBALS['egw_info']['user']['preferences']['mail']) &&
					is_array($GLOBALS['egw_info']['user']['preferences']['mail']) ?
					$GLOBALS['egw_info']['user']['preferences']['mail'] : array();
				$domains = isset($mail_prefs['allowExternalDomains']) && is_array($mail_prefs['allowExternalDomains']) ?
					$mail_prefs['allowExternalDomains'] : array();
				$external_allowed = isset($mail_prefs['allowExternalIMGs']) && $mail_prefs['allowExternalIMGs'] == 1;

				$url = explode('/', preg_replace('/^(http|https):\/\//', '', $attribute_array['src']));
				$is_plain_http = substr($attribute_array['src'], 0, 5) == 'http:';

				// plain http: images are blocked regardless of the preferences
				if ((!$external_allowed && !in_array($url[0], $domains, true)) || $is_plain_http)
				{
					//the own webserver url is not external, so it should be allowed
					$webserver_url = isset($GLOBALS['egw_info']['server']['webserver_url']) ?
						(string)$GLOBALS['egw_info']['server']['webserver_url'] : '';
					if ($webserver_url === '' || strpos($attribute_array['src'], $webserver_url) !== 0)
					{
						$attribute_array['alt'] = (isset($attribute_array['alt']) ? $attribute_array['alt'] : '').
							' [blocked external image:'.$attribute_array['src'].']';
						if (!isset($attribute_array['title'])) $attribute_array['title'] = $attribute_array['alt'];
						$attribute_array['src'] = Api\Image::find('mail','no-image-shown');
						$attribute_array['border'] = 1;
						// make the border set above visible, if the style switches it off
						if (!empty($attribute_array['style']) && stripos($attribute_array['style'], 'border') !== false)
						{
							$attribute_array['style'] = preg_replace('~border(:|-left:|-right:|-bottom:|-top:)+ (0px)+ (none)+;~si','',$attribute_array['style']);
						}
					}
				}
			}
		}
	}
	if (isset($attribute_array['style']) && stripos($attribute_array['style'], 'script') !== false)
	{
		$attribute_array['style'] = str_ireplace('script', '', $attribute_array['style']);
	}
	if (isset($attribute_array['title']))
	{
		$attribute_array['title'] = str_replace('@', '(at)', $attribute_array['title']);
	}
	if ($element == 'blockquote' && isset($attribute_array['cite']))
	{
		$attribute_array['cite'] = str_replace('@', '(at)', $attribute_array['cite']);
	}
	if ($element == 'a')
	{
		if (isset($attribute_array['name']) && isset($attribute_array['id'])) $attribute_array['id'] = $attribute_array['name'];

		// anchors without href (e.g. <a name="...">) have no href to check
		if (isset($attribute_array['href']))
		{
			if (strpos($attribute_array['href'], 'denied:javascript') === 0) $attribute_array['href'] = '';
			$attribute_array['href'] = str_replace('@', '%40', $attribute_array['href']);
			// navigate to local anchors without reloading the page
			if (strpos($attribute_array['href'], '#') === 0 &&
				isset(Api\Mail::$htmLawed_config['transform_anchor']) && Api\Mail::$htmLawed_config['transform_anchor'] === true)
			{
				$attribute_array['href'] = "javascript:GoToAnchor('".trim(substr($attribute_array['href'], 1))."');";
			}
		}
	}

	// Build the attributes string
	$attributes = '';
	foreach($attribute_array as $k => $v)
	{
		$attributes .= " {$k}=\"{$v}\"";
	}

	// Return the opening tag with attributes, empty elements are self-closed
	static $empty_elements = array('area'=>1, 'br'=>1, 'col'=>1, 'embed'=>1, 'hr'=>1, 'img'=>1, 'input'=>1, 'isindex'=>1, 'param'=>1);
	return "<{$element}{$attributes}". (isset($empty_elements[$element]) ? ' /' : ''). '>';
}
460
461