1<?php
2// Copyright (C) 2016-2017 Combodo SARL
3//
4//   This file is part of iTop.
5//
6//   iTop is free software; you can redistribute it and/or modify
7//   it under the terms of the GNU Affero General Public License as published by
8//   the Free Software Foundation, either version 3 of the License, or
9//   (at your option) any later version.
10//
11//   iTop is distributed in the hope that it will be useful,
12//   but WITHOUT ANY WARRANTY; without even the implied warranty of
13//   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14//   GNU Affero General Public License for more details.
15//
16//   You should have received a copy of the GNU Affero General Public License
17//   along with iTop. If not, see <http://www.gnu.org/licenses/>
/**
 * Common ancestor of every HTML sanitization strategy.
 *
 * Concrete sanitizers implement DoSanitize(); the static Sanitize() entry point
 * instantiates the implementation named by the "html_sanitizer" configuration entry.
 */
abstract class HTMLSanitizer
{
	/**
	 * Nothing to initialize at this level.
	 */
	public function __construct()
	{
	}

	/**
	 * Cleans up the supplied HTML document
	 * @param string $sHTML The HTML to clean
	 * @return string The cleaned HTML
	 */
	abstract public function DoSanitize($sHTML);

	/**
	 * Runs the sanitizer named in the configuration ("html_sanitizer") on an HTML string.
	 *
	 * HTMLDOMSanitizer is used instead when the configured name is not an existing class
	 * or not a subclass of HTMLSanitizer, and is also tried as a second chance when the
	 * configured sanitizer throws an Exception. When HTMLDOMSanitizer was the sanitizer
	 * that threw, the input is returned as is (and an error is logged).
	 *
	 * @param string $sHTML
	 * @return string
	 */
	public static function Sanitize($sHTML)
	{
		$sClass = MetaModel::GetConfig()->Get('html_sanitizer');

		// Check the configured class name; any rejection is logged and the default class takes over
		$sRejection = null;
		if (!class_exists($sClass))
		{
			$sRejection = 'The configured "html_sanitizer" class "'.$sClass.'" is not a valid class. Will use HTMLDOMSanitizer as the default sanitizer.';
		}
		elseif (!is_subclass_of($sClass, 'HTMLSanitizer'))
		{
			$sRejection = 'The configured "html_sanitizer" class "'.$sClass.'" is not a subclass of HTMLSanitizer. Will use HTMLDOMSanitizer as the default sanitizer.';
		}
		if ($sRejection !== null)
		{
			IssueLog::Warning($sRejection);
			$sClass = 'HTMLDOMSanitizer';
		}

		// First attempt: leave right away on success
		try
		{
			$oPrimary = new $sClass();
			return $oPrimary->DoSanitize($sHTML);
		}
		catch(Exception $e)
		{
			$sReason = $e->getMessage();
		}

		if ($sClass == 'HTMLDOMSanitizer')
		{
			// The default sanitizer itself failed: nothing left to try
			IssueLog::Error('Failed to sanitize an HTML string with "HTMLDOMSanitizer". The following exception occured: '.$sReason);
			IssueLog::Error('The HTML will NOT be sanitized.');
			return $sHTML;
		}

		// Second attempt with the default sanitizer (an exception raised here is not caught)
		IssueLog::Warning('Failed to sanitize an HTML string with "'.$sClass.'". The following exception occured: '.$sReason);
		IssueLog::Warning('Will try to sanitize with HTMLDOMSanitizer.');
		$oFallback = new HTMLDOMSanitizer();
		return $oFallback->DoSanitize($sHTML);
	}
}
79
/**
 * Pass-through HTMLSanitizer: the input is handed back untouched.
 * Suitable when HTML sanitization does not matter
 * (e.g. while importing "safe" data during an on-boarding)
 * and speed is the priority
 */
class HTMLNullSanitizer extends HTMLSanitizer
{
	/**
	 * Returns the given HTML without altering it
	 * @see HTMLSanitizer::DoSanitize()
	 * @param string $sHTML
	 * @return string The very same string that was passed in
	 */
	public function DoSanitize($sHTML)
	{
		$sUntouched = $sHTML;

		return $sUntouched;
	}
}
99
100/**
101 * A standard-compliant HTMLSanitizer based on the HTMLPurifier library by Edward Z. Yang
102 * Complete but quite slow
103 * http://htmlpurifier.org
104 */
105/*
106class HTMLPurifierSanitizer extends HTMLSanitizer
107{
108	protected static $oPurifier = null;
109
110	public function __construct()
111	{
112		if (self::$oPurifier == null)
113		{
114			$sLibPath = APPROOT.'lib/htmlpurifier/HTMLPurifier.auto.php';
115			if (!file_exists($sLibPath))
116			{
117				throw new Exception("Missing library '$sLibPath', cannot use HTMLPurifierSanitizer.");
118			}
119			require_once($sLibPath);
120
121			$oPurifierConfig = HTMLPurifier_Config::createDefault();
122			$oPurifierConfig->set('Core.Encoding', 'UTF-8'); // defaults to 'UTF-8'
123			$oPurifierConfig->set('HTML.Doctype', 'XHTML 1.0 Strict'); // defaults to 'XHTML 1.0 Transitional'
124			$oPurifierConfig->set('URI.AllowedSchemes', array (
125				'http' => true,
126				'https' => true,
127				'data' => true, // This one is not present by default
128			));
129			$sPurifierCache = APPROOT.'data/HTMLPurifier';
130			if (!is_dir($sPurifierCache))
131			{
132				mkdir($sPurifierCache);
133			}
134			if (!is_dir($sPurifierCache))
135			{
136				throw new Exception("Could not create the cache directory '$sPurifierCache'");
137			}
138			$oPurifierConfig->set('Cache.SerializerPath', $sPurifierCache); // no trailing slash
139			self::$oPurifier = new HTMLPurifier($oPurifierConfig);
140		}
141	}
142
143	public function DoSanitize($sHTML)
144	{
145		$sCleanHtml = self::$oPurifier->purify($sHTML);
146		return $sCleanHtml;
147	}
148}
149*/
150
/**
 * HTMLSanitizer implementation built on PHP's DOM extension.
 *
 * The HTML is loaded into a DOMDocument, then every element, attribute and inline style
 * declaration that is not explicitly white-listed is removed before the document
 * (or the content of its body tag) is serialized back to a string.
 */
class HTMLDOMSanitizer extends HTMLSanitizer
{
	/**
	 * @var DOMDocument Document being processed, (re)created on each call to DoSanitize()
	 */
	protected $oDoc;

	/**
	 * Allowed tags (keys, lowercase) and, for each of them, the list of allowed attributes
	 * @var array
	 * @see https://www.itophub.io/wiki/page?id=2_6_0%3Aadmin%3Arich_text_limitations
	 */
	protected static $aTagsWhiteList = array(
		'html' => array(),
		'body' => array(),
		'a' => array('href', 'name', 'style', 'target', 'title'),
		'p' => array('style'),
		'blockquote' => array('style'),
		'br' => array(),
		'span' => array('style'),
		'div' => array('style'),
		'b' => array(),
		'i' => array(),
		'u' => array(),
		'em' => array(),
		'strong' => array(),
		'img' => array('src', 'style', 'alt', 'title'),
		'ul' => array('style'),
		'ol' => array('style'),
		'li' => array('style'),
		'h1' => array('style'),
		'h2' => array('style'),
		'h3' => array('style'),
		'h4' => array('style'),
		'nav' => array('style'),
		'section' => array('style'),
		'code' => array('style'),
		'table' => array('style', 'width', 'summary', 'align', 'border', 'cellpadding', 'cellspacing'),
		'thead' => array('style'),
		'tbody' => array('style'),
		'tr' => array('style', 'colspan', 'rowspan'),
		'td' => array('style', 'colspan', 'rowspan'),
		'th' => array('style', 'colspan', 'rowspan'),
		'fieldset' => array('style'),
		'legend' => array('style'),
		'font' => array('face', 'color', 'style', 'size'),
		'big' => array(),
		'small' => array(),
		'tt' => array(),
		'kbd' => array(),
		'samp' => array(),
		'var' => array(),
		'del' => array(),
		's' => array(), // strikethrough
		'ins' => array(),
		'cite' => array(),
		'q' => array(),
		'hr' => array('style'),
		'pre' => array(),
	);

	/**
	 * Regular expressions (values) that the content of a given attribute (keys) must match.
	 * Attributes without an entry here are not checked for their content.
	 * The 'href' entry is added by the constructor, as it is built from the configuration.
	 * @var array
	 */
	protected static $aAttrsWhiteList = array(
		'src' => '/^(http:|https:|data:)/i',
	);

	/**
	 * Allowed CSS property names inside a "style" attribute
	 * @var array
	 * @see https://www.itophub.io/wiki/page?id=2_6_0%3Aadmin%3Arich_text_limitations
	 */
	protected static $aStylesWhiteList = array(
		'background-color',
		'border',
		'border-collapse',
		'bordercolor',
		'cellpadding',
		'cellspacing',
		'color',
		'float',
		'font',
		'font-family',
		'font-size',
		'font-style',
		'height',
		'margin',
		'padding',
		'text-align',
		'vertical-align',
		'width',
		'white-space',
	);

	/**
	 * Builds, once per process, the validation pattern for the "href" attribute.
	 *
	 * The pattern is assembled from the 'url_validation_pattern' and 'email_validation_pattern'
	 * configuration entries, as the patterns are not used the same way in HTML content than in
	 * standard attributes value.
	 * eg. "foo@bar.com" vs "mailto:foo@bar.com?subject=Title&body=Hello%20world"
	 */
	public function __construct()
	{
		if (array_key_exists('href', self::$aAttrsWhiteList))
		{
			// Already built by a previous instance
			return;
		}

		// Regular urls
		$sUrlPattern = utils::GetConfig()->Get('url_validation_pattern');
		// Mailto urls
		$sMailtoPattern = '(mailto:(' . utils::GetConfig()->Get('email_validation_pattern') . ')(?:\?(?:subject|body)=([a-zA-Z0-9+\$_.-]*)(?:&(?:subject|body)=([a-zA-Z0-9+\$_.-]*))?)?)';

		// The value must START with an allowed URL (leading whitespace tolerated): without the anchor,
		// any value merely containing a URL would pass, eg. "javascript:alert(1)//http://example.com".
		// The end is deliberately left open: the mailto pattern does not cover every character
		// that may legitimately appear in a query string (eg. percent-escapes).
		$sPattern = '^\s*(?:' . $sUrlPattern . '|' . $sMailtoPattern . ')';
		self::$aAttrsWhiteList['href'] = '/'.str_replace('/', '\/', $sPattern).'/i';
	}

	/**
	 * Sanitizes the given HTML: non white-listed tags, attributes and styles as well as comments are removed
	 * @param string $sHTML
	 * @return string The content of the body tag when there is one, the whole serialized document otherwise
	 */
	public function DoSanitize($sHTML)
	{
		$this->oDoc = new DOMDocument();
		// Note: property names are case sensitive, the DOMDocument property is "preserveWhiteSpace"
		$this->oDoc->preserveWhiteSpace = true;

		// MS outlook implements empty lines by the mean of <p><o:p></o:p></p>
		// We have to transform that into <p><br></p> (which is how Thunderbird implements empty lines)
		// Unfortunately, DOMDocument::loadHTML does not take the tag namespaces into account (once loaded there is no way to know if the tag did have a namespace)
		// therefore we have to do the transformation upfront
		$sHTML = preg_replace('@<o:p>\s*</o:p>@', '<br>', $sHTML);

		// Parsing problems are expected on arbitrary HTML: collect them silently instead of raising warnings,
		// then put the libxml error handling back in the state it was found in
		$bPreviousErrorMode = libxml_use_internal_errors(true);
		$this->oDoc->loadHTML('<?xml encoding="UTF-8"?>'.$sHTML); // For loading HTML chunks where the character set is not specified
		if (!$bPreviousErrorMode)
		{
			// Nobody asked for those errors, do not let them pile up
			libxml_clear_errors();
		}
		libxml_use_internal_errors($bPreviousErrorMode);

		$this->CleanNode($this->oDoc);

		$oXPath = new DOMXPath($this->oDoc);
		$oBodyNodes = $oXPath->query("//body");

		if ($oBodyNodes->length == 0)
		{
			// No body, save the whole document
			return $this->oDoc->saveHTML();
		}

		// Export only the content of the body tag...
		$sCleanHtml = $this->oDoc->saveHTML($oBodyNodes->item(0));
		// ... without the body tag itself
		return str_replace(array('<body>', '</body>'), '', $sCleanHtml);
	}

	/**
	 * Recursively cleans a node: removes its forbidden or invalid attributes, then removes
	 * its forbidden child elements (along with everything they contain) and its comments.
	 * Allowed "img" elements are additionally handed over to InlineImage::ProcessImageTag().
	 * @param DOMNode $oElement
	 * @return void
	 */
	protected function CleanNode(DOMNode $oElement)
	{
		if ($oElement->hasAttributes())
		{
			$sTagName = strtolower($oElement->tagName);
			// Unknown tag => no attribute allowed at all
			$aAllowedAttrs = isset(self::$aTagsWhiteList[$sTagName]) ? self::$aTagsWhiteList[$sTagName] : array();

			// Gather the attributes to remove (they are removed after the loop, not while iterating)
			$aAttrToRemove = array();
			foreach($oElement->attributes as $oAttr)
			{
				$sAttr = strtolower($oAttr->name);
				if (!in_array($sAttr, $aAllowedAttrs, true))
				{
					// Forbidden (or unknown) attribute
					$aAttrToRemove[] = $oAttr->name;
				}
				else if (!$this->IsValidAttributeContent($sAttr, $oAttr->value))
				{
					// Invalid content
					$aAttrToRemove[] = $oAttr->name;
				}
				else if ($sAttr == 'style')
				{
					// Special processing for style attributes: keep only the allowed declarations
					$sCleanStyle = $this->CleanStyle($oAttr->value);
					if ($sCleanStyle == '')
					{
						// Nothing left
						$aAttrToRemove[] = $oAttr->name;
					}
					else
					{
						$oElement->setAttribute($oAttr->name, $sCleanStyle);
					}
				}
			}
			// Now remove them
			foreach($aAttrToRemove as $sName)
			{
				$oElement->removeAttribute($sName);
			}
		}

		if ($oElement->hasChildNodes())
		{
			// Gather the child nodes to remove (they are removed after the loop, not while iterating)
			$aChildElementsToRemove = array();
			foreach($oElement->childNodes as $oNode)
			{
				$bIsElement = ($oNode instanceof DOMElement);
				if ($bIsElement && !array_key_exists(strtolower($oNode->tagName), self::$aTagsWhiteList))
				{
					// Forbidden tag
					$aChildElementsToRemove[] = $oNode;
				}
				else if ($oNode instanceof DOMComment)
				{
					$aChildElementsToRemove[] = $oNode;
				}
				else
				{
					// Recurse
					$this->CleanNode($oNode);
					if ($bIsElement && (strtolower($oNode->tagName) == 'img'))
					{
						InlineImage::ProcessImageTag($oNode);
					}
				}
			}
			// Now remove them
			foreach($aChildElementsToRemove as $oDomElement)
			{
				$oElement->removeChild($oDomElement);
			}
		}
	}

	/**
	 * Filters the content of a "style" attribute, keeping only the declarations
	 * whose property name is listed in $aStylesWhiteList. The values are kept untouched.
	 * @param string $sStyle eg. "color: red; position: fixed"
	 * @return string The remaining declarations separated by ';' (empty string if none is left)
	 */
	protected function CleanStyle($sStyle)
	{
		$aAllowedStyles = array();
		foreach(explode(';', $sStyle) as $sItem)
		{
			$sItem = trim($sItem);
			// The property name is whatever stands before the first colon
			$aElements = explode(':', $sItem);
			if (in_array(trim(strtolower($aElements[0])), static::$aStylesWhiteList, true))
			{
				$aAllowedStyles[] = $sItem;
			}
		}
		return implode(';', $aAllowedStyles);
	}

	/**
	 * Checks the value of an attribute against the pattern registered for it in $aAttrsWhiteList
	 * @param string $sAttributeName Lowercase name of the attribute
	 * @param string $sValue
	 * @return bool true if the value matches the pattern or if no pattern is registered for this attribute
	 */
	protected function IsValidAttributeContent($sAttributeName, $sValue)
	{
		if (!array_key_exists($sAttributeName, self::$aAttrsWhiteList))
		{
			return true;
		}
		// preg_match yields 1 on match, 0 otherwise and false on error: only a match is acceptable
		return (preg_match(self::$aAttrsWhiteList[$sAttributeName], $sValue) === 1);
	}
}