1<?php
2declare( strict_types = 1 );
3
4namespace Wikimedia\Parsoid\Utils;
5
6use DOMAttr;
7use DOMCharacterData;
8use DOMDocument;
9use DOMDocumentFragment;
10use DOMElement;
11use DOMNode;
12use DOMNodeList;
13use DOMText;
14use RemexHtml\DOM\DOMBuilder;
15use RemexHtml\HTMLData;
16use RemexHtml\Tokenizer\Tokenizer;
17use RemexHtml\TreeBuilder\Dispatcher;
18use RemexHtml\TreeBuilder\TreeBuilder;
19use Wikimedia\Assert\Assert;
20use Wikimedia\Parsoid\Utils\DOMCompat\TokenList;
21use Wikimedia\Parsoid\Wt2Html\XMLSerializer;
22use Wikimedia\Zest\Zest;
23
/**
 * Helper class that provides missing DOM level 3 methods for the PHP DOM classes.
 * For a DOM method $node->foo( $bar ) the equivalent helper is DOMCompat::foo( $node, $bar ).
 * For a DOM property $node->foo there is a DOMCompat::getFoo( $node ) and
 * DOMCompat::setFoo( $node, $value ).
 * Only implements the methods that are actually used by Parsoid.
 */
class DOMCompat {

	/**
	 * Tab, CR, FF, LF, space
	 * @see https://infra.spec.whatwg.org/#ascii-whitespace
	 */
	private const ASCII_WHITESPACE = "\t\r\f\n ";

	/**
	 * Find the first element child of the document element whose node name
	 * is one of the given names.
	 *
	 * Non-element children (text, comments, processing instructions, ...) are
	 * skipped even if their node name happens to match; a processing
	 * instruction's node name is its target, so `<?body ...?>` would otherwise
	 * be mistaken for the body element.
	 *
	 * @param DOMDocument $document
	 * @param string[] $names Node names to look for
	 * @return DOMElement|null Null if the document has no document element
	 *   or no matching element child.
	 */
	private static function findRootChildElement( DOMDocument $document, array $names ): ?DOMElement {
		$root = $document->documentElement;
		if ( $root === null ) {
			// Empty document: there is nothing to search.
			return null;
		}
		foreach ( $root->childNodes as $child ) {
			if ( $child instanceof DOMElement && in_array( $child->nodeName, $names, true ) ) {
				return $child;
			}
		}
		return null;
	}

	/**
	 * Get document body.
	 * Unlike the spec we return it as a native PHP DOM object.
	 * The result is the first `body` or `frameset` element child of the
	 * document element; once found it is cached on the document object.
	 * @param DOMDocument $document
	 * @return DOMElement|null Null if the document has no document element
	 *   or no body/frameset child.
	 * @see https://html.spec.whatwg.org/multipage/dom.html#dom-document-body
	 * @suppress PhanUndeclaredProperty
	 */
	public static function getBody( DOMDocument $document ): ?DOMElement {
		// Use an undeclared dynamic property as a cache.
		// WARNING: this will not be updated if (for some reason) the
		// document body changes.
		if ( isset( $document->body ) ) {
			return $document->body;
		}
		$body = self::findRootChildElement( $document, [ 'body', 'frameset' ] );
		if ( $body !== null ) {
			$document->body = $body; // Caching!
		}
		return $body;
	}

	/**
	 * Get document head.
	 * Unlike the spec we return it as a native PHP DOM object.
	 * The result is the first `head` element child of the document element;
	 * once found it is cached on the document object.
	 * @param DOMDocument $document
	 * @return DOMElement|null Null if the document has no document element
	 *   or no head child.
	 * @see https://html.spec.whatwg.org/multipage/dom.html#dom-document-head
	 * @suppress PhanUndeclaredProperty
	 */
	public static function getHead( DOMDocument $document ): ?DOMElement {
		// Use an undeclared dynamic property as a cache.
		// WARNING: this will not be updated if (for some reason) the
		// document head changes.
		if ( isset( $document->head ) ) {
			return $document->head;
		}
		$head = self::findRootChildElement( $document, [ 'head' ] );
		if ( $head !== null ) {
			$document->head = $head; // Caching!
		}
		return $head;
	}

	/**
	 * Get document title.
	 * Returns the text content of the first `title` element matched in the
	 * document, with ASCII whitespace stripped and collapsed.
	 * @param DOMDocument $document
	 * @return string The title, or the empty string if there is no title element.
	 * @see https://html.spec.whatwg.org/multipage/dom.html#document.title
	 */
	public static function getTitle( DOMDocument $document ): string {
		$titleElement = self::querySelector( $document, 'title' );
		return $titleElement ? self::stripAndCollapseASCIIWhitespace( $titleElement->textContent ) : '';
	}

	/**
	 * Set document title.
	 * If there is no `title` element yet, one is created and appended to the
	 * document head. If there is neither a title nor a head element, this
	 * method does nothing.
	 * @param DOMDocument $document
	 * @param string $title
	 * @see https://html.spec.whatwg.org/multipage/dom.html#document.title
	 */
	public static function setTitle( DOMDocument $document, string $title ): void {
		$titleElement = self::querySelector( $document, 'title' );
		if ( !$titleElement ) {
			$headElement = self::getHead( $document );
			if ( $headElement ) {
				$titleElement = $document->createElement( 'title' );
				$headElement->appendChild( $titleElement );
			}
		}
		if ( $titleElement ) {
			$titleElement->textContent = $title;
		}
	}

	/**
	 * Return the parent element, or null if the parent is not an element.
	 * @param DOMNode $node
	 * @return DOMElement|null
	 * @see https://dom.spec.whatwg.org/#dom-node-parentelement
	 */
	public static function getParentElement( DOMNode $node ): ?DOMElement {
		$parent = $node->parentNode;
		if ( $parent && $parent->nodeType === XML_ELEMENT_NODE ) {
			/** @var DOMElement $parent */
			return $parent;
		}
		return null;
	}

	/**
	 * Return the descendant with the specified ID.
	 * Workaround for https://bugs.php.net/bug.php?id=77686 and other issues related to
	 * inconsistent indexing behavior.
	 * @param DOMDocument|DOMDocumentFragment $node
	 * @param string $id
	 * @return DOMElement|null The first match reported by Zest, or null if there is none.
	 * @see https://dom.spec.whatwg.org/#dom-nonelementparentnode-getelementbyid
	 */
	public static function getElementById( DOMNode $node, string $id ): ?DOMElement {
		Assert::parameterType( 'DOMDocument|DOMDocumentFragment', $node, '$node' );
		$elements = Zest::getElementsById( $node, $id );
		return $elements[0] ?? null;
	}

	/**
	 * Workaround bug in PHP's Document::getElementById() which doesn't
	 * actually index the 'id' attribute unless you use the non-standard
	 * `DOMElement::setIdAttribute` method after the attribute is set;
	 * see https://www.php.net/manual/en/domdocument.getelementbyid.php
	 * for more details.
	 *
	 * @param DOMElement $element
	 * @param string $id The desired value for the `id` attribute on $element.
	 * @see https://phabricator.wikimedia.org/T232390
	 */
	public static function setIdAttribute( DOMElement $element, string $id ): void {
		$element->setAttribute( 'id', $id );
		$element->setIdAttribute( 'id', true );// phab:T232390
	}

	/**
	 * Workaround bug in PHP's DOMElement::$attributes that fails to enumerate
	 * attributes named `xmlns`.
	 *
	 * If the element has an `xmlns` attribute, the first entry of the result
	 * is a newly created DOMAttr carrying its value, followed by the
	 * attributes enumerated by DOMElement::$attributes.
	 *
	 * @param DOMElement $element
	 * @return DOMAttr[]
	 * @see https://phabricator.wikimedia.org/T235295
	 */
	public static function attributes( DOMElement $element ): array {
		$result = [];
		// The 'xmlns' attribute is "invisible" T235295
		if ( $element->hasAttribute( 'xmlns' ) ) {
			// $element->getAttributeNode actually returns a DOMNameSpaceNode
			// This is read-only, unlike the other \DOMAttr objects
			$attr = $element->ownerDocument->createAttributeNS(
				'http://www.w3.org/2000/xmlns/', 'xmlns'
			);
			$attr->value = $element->getAttribute( 'xmlns' );
			$result[] = $attr;
		}
		foreach ( $element->attributes as $attr ) {
			// These are \DOMAttr objects
			$result[] = $attr;
		}
		return $result;
	}

	/**
	 * Workaround bug in PHP's DOMElement::hasAttributes() that fails to
	 * enumerate attributes named `xmlns`.
	 *
	 * @param DOMElement $element
	 * @return bool True if the element has any attributes
	 * @see https://phabricator.wikimedia.org/T235295
	 */
	public static function hasAttributes( DOMElement $element ): bool {
		// The 'xmlns' attribute is "invisible" T235295
		return $element->hasAttributes() || $element->hasAttribute( 'xmlns' );
	}

	/**
	 * Return all descendants with the specified tag name.
	 * Workaround for PHP's getElementsByTagName being inexplicably slow in some situations
	 * and the lack of DOMElement::getElementsByTagName().
	 * @param DOMDocument|DOMElement $node
	 * @param string $tagName
	 * @return DOMNodeList
	 * @see https://dom.spec.whatwg.org/#dom-document-getelementsbytagname
	 * @see https://dom.spec.whatwg.org/#dom-element-getelementsbytagname
	 * @note NOTE(review): the declared return type is enforced at runtime, so this
	 *   relies on Zest::getElementsByTagName() returning a DOMNodeList -- confirm
	 *   against the Zest version in use.
	 */
	public static function getElementsByTagName( DOMNode $node, string $tagName ): DOMNodeList {
		Assert::parameterType( 'DOMDocument|DOMElement', $node, '$node' );
		return Zest::getElementsByTagName( $node, $tagName );
	}

	/**
	 * Return the last child of the node that is an Element, or null otherwise.
	 * @param DOMDocument|DOMDocumentFragment|DOMElement $node
	 * @return DOMElement|null
	 * @see https://dom.spec.whatwg.org/#dom-parentnode-lastelementchild
	 */
	public static function getLastElementChild( DOMNode $node ): ?DOMElement {
		Assert::parameterType( 'DOMDocument|DOMDocumentFragment|DOMElement', $node, '$node' );
		// Walk backwards from the last child until an element is found
		// or the children are exhausted.
		$lastChild = $node->lastChild;
		while ( $lastChild && $lastChild->nodeType !== XML_ELEMENT_NODE ) {
			$lastChild = $lastChild->previousSibling;
		}
		return $lastChild;
	}

	/**
	 * Return the first element matching the selector, or null if none matches.
	 * @param DOMDocument|DOMDocumentFragment|DOMElement $node
	 * @param string $selector
	 * @return DOMElement|null
	 * @see https://dom.spec.whatwg.org/#dom-parentnode-queryselector
	 */
	public static function querySelector( DOMNode $node, string $selector ): ?DOMElement {
		return self::querySelectorAll( $node, $selector )[0] ?? null;
	}

	/**
	 * Return all elements matching the selector, as found by Zest.
	 * @param DOMDocument|DOMDocumentFragment|DOMElement $node
	 * @param string $selector
	 * @return DOMElement[]
	 * @see https://dom.spec.whatwg.org/#dom-parentnode-queryselectorall
	 * @note Note that unlike the spec this method does not return a DOMNodeList
	 *   (which cannot be freely constructed in PHP), just an array of DOMElements.
	 */
	public static function querySelectorAll( DOMNode $node, string $selector ): array {
		Assert::parameterType( 'DOMDocument|DOMDocumentFragment|DOMElement', $node, '$node' );
		return Zest::find( $selector, $node );
	}

	/**
	 * Return the last preceding sibling of the node that is an element, or null otherwise.
	 * @param DOMElement|DOMCharacterData $node
	 * @return DOMElement|null
	 * @see https://dom.spec.whatwg.org/#dom-nondocumenttypechildnode-previouselementsibling
	 */
	public static function getPreviousElementSibling( DOMNode $node ): ?DOMElement {
		Assert::parameterType( 'DOMElement|DOMCharacterData', $node, '$node' );
		$previousSibling = $node->previousSibling;
		while ( $previousSibling && $previousSibling->nodeType !== XML_ELEMENT_NODE ) {
			$previousSibling = $previousSibling->previousSibling;
		}
		return $previousSibling;
	}

	/**
	 * Return the first following sibling of the node that is an element, or null otherwise.
	 * @param DOMElement|DOMCharacterData $node
	 * @return DOMElement|null
	 * @see https://dom.spec.whatwg.org/#dom-nondocumenttypechildnode-nextelementsibling
	 */
	public static function getNextElementSibling( DOMNode $node ): ?DOMElement {
		Assert::parameterType( 'DOMElement|DOMCharacterData', $node, '$node' );
		$nextSibling = $node->nextSibling;
		while ( $nextSibling && $nextSibling->nodeType !== XML_ELEMENT_NODE ) {
			$nextSibling = $nextSibling->nextSibling;
		}
		return $nextSibling;
	}

	/**
	 * Removes the node from its parent. Does nothing if the node has no parent.
	 * @param DOMElement|DOMCharacterData $node
	 * @see https://dom.spec.whatwg.org/#dom-childnode-remove
	 */
	public static function remove( DOMNode $node ): void {
		Assert::parameterType( 'DOMElement|DOMCharacterData', $node, '$node' );
		if ( $node->parentNode ) {
			$node->parentNode->removeChild( $node );
		}
	}

	/**
	 * Get innerHTML.
	 * Serializes via XMLSerializer with the `innerXML` option and returns
	 * the `html` entry of the result.
	 * @param DOMElement $element
	 * @return string
	 * @see https://w3c.github.io/DOM-Parsing/#dom-innerhtml-innerhtml
	 */
	public static function getInnerHTML( DOMElement $element ): string {
		return XMLSerializer::serialize( $element, [ 'innerXML' => true ] )['html'];
	}

	/**
	 * Set innerHTML.
	 * Parses $html as a fragment in the context of $element's tag name,
	 * removes all existing children of $element and appends the parsed nodes.
	 * @see https://w3c.github.io/DOM-Parsing/#dom-innerhtml-innerhtml
	 * @param DOMElement $element
	 * @param string $html
	 */
	public static function setInnerHTML( DOMElement $element, string $html ): void {
		$domBuilder = new DOMBuilder( [ 'suppressHtmlNamespace' => true ] );
		$treeBuilder = new TreeBuilder( $domBuilder );
		$dispatcher = new Dispatcher( $treeBuilder );
		$tokenizer = new Tokenizer( $dispatcher, $html, [ 'ignoreErrors' => true ] );
		$tokenizer->execute( [
			'fragmentNamespace' => HTMLData::NS_HTML,
			'fragmentName' => $element->tagName,
		] );
		// Remex returns the document fragment wrapped into a DOMElement
		// because libxml fragment handling is not great.
		// FIXME life would be simpler if we could make DOMBuilder use an existing document
		$documentFragmentWrapper = $element->ownerDocument->importNode(
			$domBuilder->getFragment(), true );

		// Drop the current content of the element.
		while ( $element->firstChild ) {
			$element->removeChild( $element->firstChild );
		}
		// Use an iteration method that's not affected by the tree being modified during iteration
		while ( $documentFragmentWrapper->firstChild ) {
			$element->appendChild( $documentFragmentWrapper->firstChild );
		}
	}

	/**
	 * Get outerHTML.
	 * Serializes via XMLSerializer without a doctype and returns the `html`
	 * entry of the result.
	 * @param DOMElement $element
	 * @return string
	 * @see https://w3c.github.io/DOM-Parsing/#dom-element-outerhtml
	 */
	public static function getOuterHTML( DOMElement $element ): string {
		return XMLSerializer::serialize( $element, [ 'addDoctype' => false ] )['html'];
	}

	/**
	 * Return the class list of this element.
	 * @param DOMElement $node
	 * @return TokenList
	 * @see https://dom.spec.whatwg.org/#dom-element-classlist
	 */
	public static function getClassList( DOMElement $node ): TokenList {
		return new TokenList( $node );
	}

	/**
	 * Trim leading/trailing ASCII whitespace and replace every internal run
	 * of ASCII whitespace with a single space.
	 * @param string $text
	 * @return string
	 * @see https://infra.spec.whatwg.org/#strip-and-collapse-ascii-whitespace
	 */
	private static function stripAndCollapseASCIIWhitespace( string $text ): string {
		$ws = self::ASCII_WHITESPACE;
		return preg_replace( "/[$ws]+/", ' ', trim( $text, $ws ) );
	}

	/**
	 * Recursively remove every empty text node from the subtree rooted at $e.
	 * @param DOMElement $e
	 */
	private static function stripEmptyTextNodes( DOMElement $e ): void {
		$c = $e->firstChild;
		while ( $c ) {
			// Grab the next sibling first: $c may be removed below.
			$next = $c->nextSibling;
			if ( $c instanceof DOMText ) {
				if ( $c->nodeValue === '' ) {
					$e->removeChild( $c );
				}
			} elseif ( $c instanceof DOMElement ) {
				self::stripEmptyTextNodes( $c );
			}
			$c = $next;
		}
	}

	/**
	 * Normalize the tree rooted at $elt and strip any empty text nodes
	 * left behind by PHP's own normalization.
	 * @param DOMElement $elt root of the DOM tree that needs to be normalized
	 */
	public static function normalize( DOMElement $elt ): void {
		$elt->normalize();

		// Now traverse the tree rooted at $elt and remove any stray empty text nodes
		// Unlike what https://www.w3.org/TR/DOM-Level-2-Core/core.html#ID-normalize says,
		// the PHP DOM's normalization leaves behind up to 1 empty text node.
		// See https://bugs.php.net/bug.php?id=78221
		self::stripEmptyTextNodes( $elt );
	}
}
404