1<?php
2
3namespace Wikimedia\RemexHtml\Serializer;
4
5use Wikimedia\RemexHtml\DOM\DOMFormatter;
6use Wikimedia\RemexHtml\DOM\DOMUtils;
7use Wikimedia\RemexHtml\HTMLData;
8
/**
 * A formatter which follows the HTML 5 fragment serialization algorithm.
 *
 * The class offers two families of methods: those which receive
 * SerializerNode objects and pre-serialized contents (characters(),
 * element(), comment(), ...), and those which serialize a PHP DOM tree
 * directly (formatDOMNode(), formatDOMElement()).
 */
class HtmlFormatter implements Formatter, DOMFormatter {
	/**
	 * The elements for which a closing tag is omitted.
	 *
	 * Keys are HTML element names; the value is always true so that
	 * membership can be tested with isset().
	 *
	 * @var array<string,bool>
	 */
	protected $voidElements = [
		'area' => true,
		'base' => true,
		'basefont' => true,
		'bgsound' => true,
		'br' => true,
		'col' => true,
		'embed' => true,
		'frame' => true,
		'hr' => true,
		'img' => true,
		'input' => true,
		'keygen' => true,
		'link' => true,
		'menuitem' => true,
		'meta' => true,
		'param' => true,
		'source' => true,
		'track' => true,
		'wbr' => true,
	];

	/**
	 * The elements which need a leading newline in their contents to be
	 * duplicated, since the parser strips a leading newline.
	 *
	 * @var array<string,bool>
	 */
	protected $prefixLfElements = [
		'pre' => true,
		'textarea' => true,
		'listing' => true
	];

	/**
	 * The elements which have unescaped contents.
	 *
	 * The constructor adds 'noscript' to this list when the scriptingFlag
	 * option is enabled.
	 *
	 * @var array<string,bool>
	 */
	protected $rawTextElements = [
		'style' => true,
		'script' => true,
		'xmp' => true,
		'iframe' => true,
		'noembed' => true,
		'noframes' => true,
		'plaintext' => true,
	];

	/**
	 * The escape table for attribute values, in the form expected by
	 * strtr(). "\xc2\xa0" is the UTF-8 encoding of U+00A0 (no-break space).
	 *
	 * @var array<string,string>
	 */
	protected $attributeEscapes = [
		'&' => '&amp;',
		"\xc2\xa0" => '&nbsp;',
		'"' => '&quot;',
	];

	/**
	 * The escape table for text nodes, in the form expected by strtr().
	 * "\xc2\xa0" is the UTF-8 encoding of U+00A0 (no-break space).
	 *
	 * @var array<string,string>
	 */
	protected $textEscapes = [
		'&' => '&amp;',
		"\xc2\xa0" => '&nbsp;',
		'<' => '&lt;',
		'>' => '&gt;',
	];

	/**
	 * Attribute namespaces which have unqualified local names.
	 *
	 * formatDOMElement() consults this list for the namespace of the element
	 * itself: elements in these namespaces are written without a prefix.
	 *
	 * @var array<string,bool>
	 */
	protected $unqualifiedNamespaces = [
		HTMLData::NS_HTML => true,
		HTMLData::NS_MATHML => true,
		HTMLData::NS_SVG => true,
	];

	/**
	 * Whether formatDOMNode() emits the doctype node found in the document
	 * instead of a fixed "<!DOCTYPE html>".
	 *
	 * @var bool
	 */
	protected $useSourceDoctype;

	/**
	 * Whether element and attribute names taken from a DOM node are passed
	 * through DOMUtils::uncoerceName() before being written.
	 *
	 * @var bool
	 */
	protected $reverseCoercion;

	/**
	 * Constructor.
	 *
	 * @param array $options An associative array of options:
	 *   - scriptingFlag : Set this to false to disable scripting. True by default.
	 *     When true, the contents of <noscript> are emitted unescaped.
	 *   - useSourceDoctype : Emit the doctype used in the source. If this is
	 *     false or absent, an HTML doctype will be used.
	 *   - reverseCoercion : When formatting a DOM node, reverse the encoding
	 *     of invalid names. False by default.
	 */
	public function __construct( $options = [] ) {
		// Array union keeps caller-supplied values and only fills in the gaps.
		$options += [
			'scriptingFlag' => true,
			'useSourceDoctype' => false,
			'reverseCoercion' => false,
		];
		if ( $options['scriptingFlag'] ) {
			$this->rawTextElements['noscript'] = true;
		}
		$this->useSourceDoctype = $options['useSourceDoctype'];
		$this->reverseCoercion = $options['reverseCoercion'];
	}

	/**
	 * Get the text emitted at the start of the output.
	 *
	 * This always returns the HTML doctype; neither the parameters nor the
	 * useSourceDoctype option are consulted here.
	 *
	 * @param string|null $fragmentNamespace Unused
	 * @param string|null $fragmentName Unused
	 * @return string
	 */
	public function startDocument( $fragmentNamespace, $fragmentName ) {
		return "<!DOCTYPE html>";
	}

	/**
	 * Serialize a run of character data.
	 *
	 * The text is escaped with $textEscapes unless the parent is an HTML
	 * element listed in $rawTextElements.
	 *
	 * @param SerializerNode $parent The parent of the text
	 * @param string $text The string containing the character data
	 * @param int $start The byte offset of the data within $text
	 * @param int $length The byte length of the data
	 * @return string
	 */
	public function characters( SerializerNode $parent, $text, $start, $length ) {
		$text = substr( $text, $start, $length );
		if ( $parent->namespace !== HTMLData::NS_HTML
			|| !isset( $this->rawTextElements[$parent->name] )
		) {
			$text = strtr( $text, $this->textEscapes );
		}
		return $text;
	}

	/**
	 * Serialize an element together with its already-serialized contents.
	 *
	 * @param SerializerNode $parent The parent of the element (unused)
	 * @param SerializerNode $node The element to serialize
	 * @param string|null $contents The serialized children of the element
	 * @return string
	 */
	public function element( SerializerNode $parent, SerializerNode $node, $contents ) {
		$name = $node->name;
		$s = "<$name";
		foreach ( $node->attrs->getValues() as $attrName => $attrValue ) {
			$encValue = strtr( $attrValue, $this->attributeEscapes );
			$s .= " $attrName=\"$encValue\"";
		}
		$s .= '>';
		if ( $node->namespace === HTMLData::NS_HTML ) {
			if ( isset( $contents[0] ) && $contents[0] === "\n"
				&& isset( $this->prefixLfElements[$name] )
			) {
				// Double the leading newline so that one survives reparsing
				$s .= "\n$contents</$name>";
			} elseif ( !isset( $this->voidElements[$name] ) ) {
				$s .= "$contents</$name>";
			}
			// HTML void elements get neither contents nor an end tag
		} else {
			// Foreign elements are never treated as void
			$s .= "$contents</$name>";
		}
		return $s;
	}

	/**
	 * Serialize a comment. The text is not escaped.
	 *
	 * @param SerializerNode $parent The parent of the comment (unused)
	 * @param string $text The comment text
	 * @return string
	 */
	public function comment( SerializerNode $parent, $text ) {
		return "<!--$text-->";
	}

	/**
	 * Serialize a doctype. This always returns an empty string, since
	 * startDocument() already supplies a doctype.
	 *
	 * @param string $name Unused
	 * @param string $public Unused
	 * @param string $system Unused
	 * @return string
	 */
	public function doctype( $name, $public, $system ) {
		return '';
	}

	/**
	 * Recursively serialize a DOM node and its descendants.
	 *
	 * @param \DOMNode $node
	 * @return string
	 */
	public function formatDOMNode( \DOMNode $node ) {
		// Serialize the children first; each node type decides below how (and
		// whether) to use the result.
		$contents = '';
		if ( $node->firstChild ) {
			foreach ( $node->childNodes as $child ) {
				$contents .= $this->formatDOMNode( $child );
			}
		}

		switch ( $node->nodeType ) {
		case XML_ELEMENT_NODE:
			'@phan-var \DOMElement $node'; /** @var \DOMElement $node */
			return $this->formatDOMElement( $node, $contents );

		case XML_DOCUMENT_NODE:
			if ( !$this->useSourceDoctype ) {
				return "<!DOCTYPE html>" . $contents;
			} else {
				// The doctype child node, if any, has already contributed
				// to $contents.
				return $contents;
			}

		case XML_DOCUMENT_FRAG_NODE:
			return $contents;

		case XML_TEXT_NODE:
			'@phan-var \DOMCharacterData $node'; /** @var \DOMCharacterData $node */
			$text = $node->data;
			$parent = $node->parentNode;
			// A detached text node has no parent; treat it like ordinary
			// (escaped) text instead of dereferencing null.
			$isRawText = $parent !== null
				&& $parent->namespaceURI === HTMLData::NS_HTML
				&& isset( $this->rawTextElements[$parent->nodeName] );
			if ( !$isRawText ) {
				$text = strtr( $text, $this->textEscapes );
			}
			return $text;

		case XML_CDATA_SECTION_NODE:
			'@phan-var \DOMCdataSection $node'; /** @var \DOMCdataSection $node */
			$parent = $node->parentNode;
			if ( $parent !== null && $parent->namespaceURI === HTMLData::NS_HTML ) {
				// CDATA is not allowed in HTML nodes
				return $node->data;
			} else {
				return "<![CDATA[{$node->data}]]>";
			}

		case XML_PI_NODE:
			'@phan-var \DOMProcessingInstruction $node'; /** @var \DOMProcessingInstruction $node */
			return "<?{$node->target} {$node->data}>";

		case XML_COMMENT_NODE:
			'@phan-var \DOMComment $node'; /** @var \DOMComment $node */
			return "<!--{$node->data}-->";

		case XML_DOCUMENT_TYPE_NODE:
			'@phan-var \DOMDocumentType $node'; /** @var \DOMDocumentType $node */
			if ( $this->useSourceDoctype ) {
				return "<!DOCTYPE {$node->name}>";
			} else {
				// The document node case supplies the doctype instead
				return '';
			}

		default:
			return '';
		}
	}

	/**
	 * Serialize a DOM element, given its already-serialized contents.
	 *
	 * @param \DOMElement $node The element to serialize
	 * @param string $contents The serialized children of the element
	 * @return string
	 */
	public function formatDOMElement( \DOMElement $node, $contents ) {
		$ns = $node->namespaceURI;
		$prefix = $node->prefix;
		// DOMNode::$prefix is an empty string, not null, when the element has
		// no prefix. Both are treated as "no prefix" so that an unprefixed
		// element in some other namespace is not written as ":name".
		if ( $ns === null
			|| isset( $this->unqualifiedNamespaces[$ns] )
			|| $prefix === null
			|| $prefix === ''
		) {
			$name = $node->localName;
		} else {
			$name = $prefix . ':' . $node->localName;
		}
		if ( $this->reverseCoercion ) {
			$name = DOMUtils::uncoerceName( $name );
		}

		$s = '<' . $name;
		foreach ( $node->attributes as $attr ) {
			switch ( $attr->namespaceURI ) {
			case HTMLData::NS_XML:
				$attrName = 'xml:' . $attr->localName;
				break;
			case HTMLData::NS_XMLNS:
				if ( $attr->localName === 'xmlns' ) {
					$attrName = 'xmlns';
				} else {
					$attrName = 'xmlns:' . $attr->localName;
				}
				break;
			case HTMLData::NS_XLINK:
				$attrName = 'xlink:' . $attr->localName;
				break;
			default:
				// The cast keeps a null prefix from reaching a string
				// comparison; an absent prefix means an unqualified name.
				if ( (string)$attr->prefix !== '' ) {
					$attrName = $attr->prefix . ':' . $attr->localName;
				} else {
					$attrName = $attr->localName;
				}
			}
			if ( $this->reverseCoercion ) {
				$attrName = DOMUtils::uncoerceName( $attrName );
			}
			$encValue = strtr( $attr->value, $this->attributeEscapes );
			$s .= " $attrName=\"$encValue\"";
		}
		$s .= '>';
		if ( $ns === HTMLData::NS_HTML ) {
			if ( isset( $contents[0] ) && $contents[0] === "\n"
				&& isset( $this->prefixLfElements[$name] )
			) {
				// Double the leading newline so that one survives reparsing
				$s .= "\n$contents</$name>";
			} elseif ( !isset( $this->voidElements[$name] ) ) {
				$s .= "$contents</$name>";
			}
			// HTML void elements get neither contents nor an end tag
		} else {
			// Foreign elements are never treated as void
			$s .= "$contents</$name>";
		}
		return $s;
	}
}
280
// Backwards compatibility: keep the class reachable under its old
// namespace, RemexHtml\Serializer.
\class_alias(
	HtmlFormatter::class,
	'RemexHtml\Serializer\HtmlFormatter'
);
283