1<?php
2
3namespace Wikimedia\RemexHtml\DOM;
4
5use Wikimedia\RemexHtml\HTMLData;
6use Wikimedia\RemexHtml\Tokenizer\Attributes;
7use Wikimedia\RemexHtml\TreeBuilder\Element;
8use Wikimedia\RemexHtml\TreeBuilder\TreeBuilder;
9use Wikimedia\RemexHtml\TreeBuilder\TreeHandler;
10
/**
 * A TreeHandler which constructs a DOMDocument.
 *
 * Note that this class permits third-party `DOMImplementation`s
 * (documents other than `\DOMDocument`, nodes other than `\DOMNode`,
 * etc) and so no enforced PHP type hints are used which name these
 * classes directly.  For the sake of static type checking, the
 * types *in comments* are given as if the standard PHP `\DOM*`
 * classes are being used but at runtime everything is duck-typed.
 */
class DOMBuilder implements TreeHandler {

	/** @var string|null The name of the input document type */
	public $doctypeName;

	/** @var string|null The public ID */
	public $public;

	/** @var string|null The system ID */
	public $system;

	/**
	 * @var int The quirks mode. May be either TreeBuilder::NO_QUIRKS,
	 *   TreeBuilder::LIMITED_QUIRKS or TreeBuilder::QUIRKS to indicate
	 *   no-quirks mode, limited-quirks mode or quirks mode respectively.
	 */
	public $quirks;

	/** @var \DOMDocument The document under construction; assigned by startDocument() */
	private $doc;

	/** @var callable|null Called with ( $text, $pos ) for every reported parse error */
	private $errorCallback;

	/** @var bool Whether HTML-namespace elements are created without a namespace */
	private $suppressHtmlNamespace;

	/** @var bool Whether the nonstandard setIdAttribute() call is skipped */
	private $suppressIdAttribute;

	/** @var \DOMImplementation */
	private $domImplementation;

	/** @var class-string Exceptions of this class trigger the name-coercion fallbacks */
	private $domExceptionClass;

	/** @var bool True when startDocument() was given a non-null fragment namespace */
	private $isFragment = false;

	/** @var bool True once any name had to be coerced, or an empty doctype name was seen */
	private $coerced = false;

	/**
	 * @param array $options An associative array of options:
	 *   - errorCallback : A function which is called on parse errors
	 *   - suppressHtmlNamespace : omit the namespace when creating HTML
	 *     elements. False by default.
	 *   - suppressIdAttribute : don't call the nonstandard
	 *     DOMElement::setIdAttribute() method while constructing elements.
	 *     False by default (this method is needed for efficient
	 *     DOMDocument::getElementById() calls).  Set to true if you are
	 *     using a W3C spec-compliant DOMImplementation and wish to avoid
	 *     nonstandard calls.
	 *   - domImplementation: The DOMImplementation object to use.  If this
	 *     parameter is missing or null, a new DOMImplementation object will
	 *     be constructed using the `domImplementationClass` option value.
	 *     You can use a third-party DOM implementation by passing in an
	 *     appropriately duck-typed object here.
	 *   - domImplementationClass: The string name of the DOMImplementation
	 *     class to use.  Defaults to `\DOMImplementation::class` but
	 *     you can use a third-party DOM implementation by passing
	 *     an alternative class name here.
	 *   - domExceptionClass: The string name of the DOMException
	 *     class to use.  Defaults to `\DOMException::class` but
	 *     you can use a third-party DOM implementation by passing
	 *     an alternative class name here.
	 */
	public function __construct( $options = [] ) {
		// Array union keeps caller-supplied keys and only fills in missing ones.
		$options += [
			'suppressHtmlNamespace' => false,
			'suppressIdAttribute' => false,
			'errorCallback' => null,
			'domImplementation' => null,
			'domImplementationClass' => \DOMImplementation::class,
			'domExceptionClass' => \DOMException::class,
		];
		$this->errorCallback = $options['errorCallback'];
		$this->suppressHtmlNamespace = $options['suppressHtmlNamespace'];
		$this->suppressIdAttribute = $options['suppressIdAttribute'];
		$this->domImplementation = $options['domImplementation'] ??
			new $options['domImplementationClass'];
		$this->domExceptionClass = $options['domExceptionClass'];
	}

	/**
	 * Return normally if $t is an instance of the configured DOM exception
	 * class (so that the caller can run its fallback), otherwise rethrow it.
	 *
	 * @param \Throwable $t
	 * @throws \Throwable $t itself, when it is not a DOM exception
	 */
	private function rethrowIfNotDomException( \Throwable $t ) {
		if ( is_a( $t, $this->domExceptionClass, false ) ) {
			return;
		}
		throw $t;
	}

	/**
	 * Get the constructed document or document fragment. In the fragment case,
	 * a DOMElement is returned, and the caller is expected to extract its
	 * inner contents, ignoring the wrapping element. This convention is
	 * convenient because the wrapping element gives libxml somewhere to put
	 * its namespace declarations. If we copied the children into a
	 * DOMDocumentFragment, libxml would invent new prefixes for the orphaned
	 * namespaces.
	 *
	 * @return \DOMNode
	 */
	public function getFragment() {
		if ( $this->isFragment ) {
			return $this->doc->documentElement;
		} else {
			return $this->doc;
		}
	}

	/**
	 * Returns true if the document was coerced due to libxml limitations. We
	 * follow HTML 5.1 § 8.2.7 "Coercing an HTML DOM into an infoset".
	 *
	 * @return bool
	 */
	public function isCoerced() {
		return $this->coerced;
	}

	/**
	 * Begin a new parse: remember whether this is a fragment parse (a non-null
	 * fragment namespace was given) and start with a fresh document that has
	 * no doctype.
	 *
	 * @inheritDoc
	 */
	public function startDocument( $fragmentNamespace, $fragmentName ) {
		$this->isFragment = $fragmentNamespace !== null;
		$this->doc = $this->createDocument();
	}

	/**
	 * Create an empty document via the configured DOMImplementation.
	 *
	 * A doctype node is only created when $doctypeName is a non-empty string.
	 * For an empty-string name no doctype node is created and the document is
	 * flagged as coerced; for null the document simply has no doctype.
	 *
	 * @param string|null $doctypeName
	 * @param string|null $public
	 * @param string|null $system
	 * @return \DOMDocument
	 * @suppress PhanTypeMismatchArgumentInternalReal
	 *   Null args to DOMImplementation::createDocument
	 */
	protected function createDocument(
		?string $doctypeName = null,
		?string $public = null,
		?string $system = null
	) {
		$impl = $this->domImplementation;
		if ( $doctypeName === '' ) {
			$this->coerced = true;
			$doc = $impl->createDocument( null, '' );
		} elseif ( $doctypeName === null ) {
			$doc = $impl->createDocument( null, '' );
		} else {
			$doctype = $impl->createDocumentType( $doctypeName, $public, $system );
			$doc = $impl->createDocument( null, '', $doctype );
		}
		$doc->encoding = 'UTF-8';
		return $doc;
	}

	/**
	 * Nothing to finalise: the document is complete as soon as the last
	 * node has been inserted.
	 *
	 * @inheritDoc
	 */
	public function endDocument( $pos ) {
	}

	/**
	 * Translate a preposition / reference element pair into the parent node
	 * and reference node to be used with insertBefore().
	 *
	 * @param mixed $preposition TreeBuilder::ROOT or TreeBuilder::BEFORE; any
	 *   other value is treated as "under $refElement" (cf. TreeBuilder::UNDER)
	 * @param Element|null $refElement Not read when $preposition is ROOT
	 * @return array A two-element list: the parent node, then the node to
	 *   insert before, or null to append as the parent's last child
	 */
	private function getInsertionPoint( $preposition, $refElement ) {
		if ( $preposition === TreeBuilder::ROOT ) {
			return [ $this->doc, null ];
		}
		if ( $preposition === TreeBuilder::BEFORE ) {
			$refNode = $refElement->userData;
			return [ $refNode->parentNode, $refNode ];
		}
		return [ $refElement->userData, null ];
	}

	/**
	 * Insert an already-created DOM node at the position described by
	 * $preposition and $refElement.
	 *
	 * @param mixed $preposition See getInsertionPoint()
	 * @param Element|null $refElement
	 * @param \DOMNode $node
	 */
	private function insertNode( $preposition, $refElement, $node ) {
		[ $parent, $refNode ] = $this->getInsertionPoint( $preposition, $refElement );
		// @phan-suppress-next-line PhanTypeMismatchArgumentInternal
		$parent->insertBefore( $node, $refNode );
	}

	/**
	 * Replace unsupported characters with a code of the form U123456.
	 *
	 * Sets the coerced flag whenever the name actually had to be changed.
	 *
	 * @param string $name
	 * @return string
	 */
	private function coerceName( $name ) {
		$coercedName = DOMUtils::coerceName( $name );
		if ( $name !== $coercedName ) {
			$this->coerced = true;
		}
		return $coercedName;
	}

	/**
	 * Create the DOM element for $element, copy its attributes onto it and
	 * store the new node in $element->userData.
	 *
	 * Whenever the DOM implementation rejects an element or attribute name
	 * with a DOM exception, the operation is retried with a coerced name.
	 *
	 * @param Element $element
	 * @return \DOMElement
	 */
	protected function createNode( Element $element ) {
		$noNS = $this->suppressHtmlNamespace && $element->namespace === HTMLData::NS_HTML;
		try {
			if ( $noNS ) {
				$node = $this->doc->createElement( $element->name );
			} else {
				$node = $this->doc->createElementNS(
					$element->namespace,
					$element->name );
			}
		} catch ( \Throwable $e ) {
			$this->rethrowIfNotDomException( $e );
			'@phan-var \DOMException $e'; /** @var \DOMException $e */
			// Attempt to escape the name so that it is more acceptable
			if ( $noNS ) {
				$node = $this->doc->createElement(
					$this->coerceName( $element->name )
				);
			} else {
				$node = $this->doc->createElementNS(
					$element->namespace,
					$this->coerceName( $element->name ) );
			}
		}

		foreach ( $element->attrs->getObjects() as $attr ) {
			if ( $attr->namespaceURI === null
				&& strpos( $attr->localName, ':' ) !== false
			) {
				// FIXME: this apparently works to create a prefixed localName
				// in the null namespace, but this is probably taking advantage
				// of a bug in PHP's DOM library, and screws up in various
				// interesting ways. For example, attributes created in this
				// way can't be discovered via hasAttribute() or hasAttributeNS().
				$attrNode = $this->doc->createAttribute( $attr->localName );
				$attrNode->value = $attr->value;
				try {
					$node->setAttributeNodeNS( $attrNode );
				} catch ( \Throwable $e ) {
					$this->rethrowIfNotDomException( $e );
					'@phan-var \DOMException $e'; /** @var \DOMException $e */
					$node->setAttributeNS(
						$attr->namespaceURI,
						$this->coerceName( $attr->qualifiedName ),
						$attr->value );
				}
			} else {
				try {
					$node->setAttributeNS(
						$attr->namespaceURI,
						$attr->qualifiedName,
						$attr->value );
				} catch ( \Throwable $e ) {
					$this->rethrowIfNotDomException( $e );
					'@phan-var \DOMException $e'; /** @var \DOMException $e */
					$node->setAttributeNS(
						$attr->namespaceURI,
						$this->coerceName( $attr->qualifiedName ),
						$attr->value );
				}
			}
		}
		if ( ( !$this->suppressIdAttribute ) && $node->hasAttribute( 'id' ) ) {
			// This is a call to a non-standard DOM method required by PHP in
			// order to implement DOMDocument::getElementById() efficiently.
			$node->setIdAttribute( 'id', true );
		}
		$element->userData = $node;
		return $node;
	}

	/**
	 * Insert the text $text[$start .. $start + $length) at the given position,
	 * appending to an immediately preceding text node when there is one.
	 * Text addressed directly at the document node is dropped.
	 *
	 * @inheritDoc
	 */
	public function characters( $preposition, $refElement, $text, $start, $length,
		$sourceStart, $sourceLength
	) {
		[ $parent, $refNode ] = $this->getInsertionPoint( $preposition, $refElement );
		// https://html.spec.whatwg.org/#insert-a-character
		// If the adjusted insertion location is in a Document node, then
		// return.
		if ( $parent === $this->doc ) {
			return;
		}
		$data = substr( $text, $start, $length );
		// If there is a Text node immediately before the adjusted insertion
		// location, then append data to that Text node's data.
		if ( $refNode === null ) {
			$prev = $parent->lastChild;
		} else {
			/** @var \DOMNode $refNode */
			$prev = $refNode->previousSibling;
		}
		if ( $prev !== null && $prev->nodeType === XML_TEXT_NODE ) {
			'@phan-var \DOMCharacterData $prev'; /** @var \DOMCharacterData $prev */
			$prev->appendData( $data );
		} else {
			$node = $this->doc->createTextNode( $data );
			// @phan-suppress-next-line PhanTypeMismatchArgumentNullableInternal
			$parent->insertBefore( $node, $refNode );
		}
	}

	/**
	 * Insert the DOM node for $element at the given position. The node is
	 * created on first use; an element that already carries a node in
	 * userData has that same node inserted again.
	 *
	 * @inheritDoc
	 */
	public function insertElement( $preposition, $refElement, Element $element, $void,
		$sourceStart, $sourceLength
	) {
		if ( $element->userData ) {
			$node = $element->userData;
		} else {
			$node = $this->createNode( $element );
		}
		$this->insertNode( $preposition, $refElement, $node );
	}

	/**
	 * End tags need no action: the tree structure is already fixed by the
	 * insertion calls.
	 *
	 * @inheritDoc
	 */
	public function endTag( Element $element, $sourceStart, $sourceLength ) {
	}

	/**
	 * Record the doctype details in the public properties. If nothing has been
	 * inserted into the document yet, the document is replaced by a new one
	 * created with this doctype; otherwise the existing document is kept.
	 *
	 * @inheritDoc
	 */
	public function doctype( $name, $public, $system, $quirks, $sourceStart, $sourceLength ) {
		if ( !$this->doc->firstChild ) {
			$this->doc = $this->createDocument( $name, $public, $system );
		}
		$this->doctypeName = $name;
		$this->public = $public;
		$this->system = $system;
		$this->quirks = $quirks;
	}

	/**
	 * Create a comment node holding $text and insert it at the given position.
	 *
	 * @inheritDoc
	 */
	public function comment( $preposition, $refElement, $text, $sourceStart, $sourceLength ) {
		$node = $this->doc->createComment( $text );
		$this->insertNode( $preposition, $refElement, $node );
	}

	/**
	 * Forward a parse error to the errorCallback option, if one was given.
	 *
	 * @inheritDoc
	 */
	public function error( $text, $pos ) {
		if ( $this->errorCallback ) {
			( $this->errorCallback )( $text, $pos );
		}
	}

	/**
	 * Add the attributes in $attrs to the existing node of $element. An
	 * attribute which is already present on the node keeps its current value.
	 *
	 * @inheritDoc
	 */
	public function mergeAttributes( Element $element, Attributes $attrs, $sourceStart ) {
		$node = $element->userData;
		'@phan-var \DOMElement $node'; /** @var \DOMElement $node */
		foreach ( $attrs->getObjects() as $attr ) {
			if ( $attr->namespaceURI === null
				&& strpos( $attr->localName, ':' ) !== false
			) {
				// As noted in createNode(), we can't use hasAttribute() here.
				// However, we can use the return value of setAttributeNodeNS()
				// instead.
				$attrNode = $this->doc->createAttribute( $attr->localName );
				$attrNode->value = $attr->value;
				try {
					$replaced = $node->setAttributeNodeNS( $attrNode );
				} catch ( \Throwable $e ) {
					$this->rethrowIfNotDomException( $e );
					'@phan-var \DOMException $e'; /** @var \DOMException $e */
					$attrNode = $this->doc->createAttribute(
						$this->coerceName( $attr->localName ) );
					$attrNode->value = $attr->value;
					$replaced = $node->setAttributeNodeNS( $attrNode );
				}
				if ( $replaced ) {
					// Put it back how it was
					$node->setAttributeNodeNS( $replaced );
				}
			} elseif ( $attr->namespaceURI === null ) {
				try {
					if ( !$node->hasAttribute( $attr->localName ) ) {
						$node->setAttribute( $attr->localName, $attr->value );
					}
				} catch ( \Throwable $e ) {
					$this->rethrowIfNotDomException( $e );
					'@phan-var \DOMException $e'; /** @var \DOMException $e */
					$coercedName = $this->coerceName( $attr->localName );
					if ( !$node->hasAttribute( $coercedName ) ) {
						$node->setAttribute( $coercedName, $attr->value );
					}
				}
			} else {
				// NOTE(review): createNode() passes $attr->qualifiedName to
				// setAttributeNS(), whereas this branch passes localName, so
				// a prefix is not forwarded here. Confirm whether intended.
				try {
					if ( !$node->hasAttributeNS( $attr->namespaceURI, $attr->localName ) ) {
						$node->setAttributeNS( $attr->namespaceURI,
							$attr->localName, $attr->value );
					}
				} catch ( \Throwable $e ) {
					$this->rethrowIfNotDomException( $e );
					'@phan-var \DOMException $e'; /** @var \DOMException $e */
					$coercedName = $this->coerceName( $attr->localName );
					if ( !$node->hasAttributeNS( $attr->namespaceURI, $coercedName ) ) {
						$node->setAttributeNS( $attr->namespaceURI, $coercedName, $attr->value );
					}
				}
			}
		}
	}

	/**
	 * Detach the node of $element from its parent node.
	 *
	 * @inheritDoc
	 */
	public function removeNode( Element $element, $sourceStart ) {
		$node = $element->userData;
		$node->parentNode->removeChild( $node );
	}

	/**
	 * Insert $newParent as the last child of $element, then move every child
	 * of $element that precedes it into $newParent, preserving their order.
	 *
	 * @inheritDoc
	 */
	public function reparentChildren( Element $element, Element $newParent, $sourceStart ) {
		$this->insertElement( TreeBuilder::UNDER, $element, $newParent, false, $sourceStart, 0 );
		$node = $element->userData;
		/** @var \DOMElement $newParentNode */
		$newParentNode = $newParent->userData;
		// The new parent was appended last, so once it has become the first
		// child, every pre-existing child has been moved.
		while ( $node->firstChild !== $newParentNode ) {
			$newParentNode->appendChild( $node->firstChild );
		}
	}
}
422
// Backwards compatibility: keep this class reachable under its old namespace.
\class_alias( DOMBuilder::class, 'RemexHtml\DOM\DOMBuilder' );
425