<?php
declare( strict_types = 1 );

namespace Wikimedia\Parsoid\Utils;

use DOMAttr;
use DOMCharacterData;
use DOMDocument;
use DOMDocumentFragment;
use DOMElement;
use DOMNode;
use DOMNodeList;
use DOMText;
use RemexHtml\DOM\DOMBuilder;
use RemexHtml\HTMLData;
use RemexHtml\Tokenizer\Tokenizer;
use RemexHtml\TreeBuilder\Dispatcher;
use RemexHtml\TreeBuilder\TreeBuilder;
use Wikimedia\Assert\Assert;
use Wikimedia\Parsoid\Utils\DOMCompat\TokenList;
use Wikimedia\Parsoid\Wt2Html\XMLSerializer;
use Wikimedia\Zest\Zest;

/**
 * Helper class that provides missing DOM level 3 methods for the PHP DOM classes.
 * For a DOM method $node->foo( $bar ) the equivalent helper is DOMCompat::foo( $node, $bar ).
 * For a DOM property $node->foo there is a DOMCompat::getFoo( $node ) and
 * DOMCompat::setFoo( $node, $value ).
 * Only implements the methods that are actually used by Parsoid.
 */
class DOMCompat {

	/**
	 * Tab, LF, FF, CR, space
	 * @see https://infra.spec.whatwg.org/#ascii-whitespace
	 */
	private static $ASCII_WHITESPACE = "\t\r\f\n ";

	/**
	 * Get document body.
	 * Unlike the spec we return it as a native PHP DOM object.
	 *
	 * The result is the first child of the document element whose node name
	 * is `body` or `frameset`. A document without a document element has no
	 * body, so null is returned for it.
	 *
	 * @param DOMDocument $document
	 * @return DOMElement|null
	 * @see https://html.spec.whatwg.org/multipage/dom.html#dom-document-body
	 * @suppress PhanUndeclaredProperty
	 */
	public static function getBody( DOMDocument $document ): ?DOMElement {
		// Use an undeclared dynamic property as a cache.
		// WARNING: this will not be updated if (for some reason) the
		// document body changes.
		if ( isset( $document->body ) ) {
			return $document->body;
		}
		// An empty document has no root element; bail out instead of
		// dereferencing null and iterating over a non-iterable value.
		$root = $document->documentElement;
		if ( $root === null ) {
			return null;
		}
		foreach ( $root->childNodes as $element ) {
			/** @var DOMElement $element */
			if ( $element->nodeName === 'body' || $element->nodeName === 'frameset' ) {
				$document->body = $element; // Caching!
				return $element;
			}
		}
		return null;
	}

	/**
	 * Get document head.
	 * Unlike the spec we return it as a native PHP DOM object.
	 *
	 * The result is the first child of the document element whose node name
	 * is `head`. A document without a document element has no head, so null
	 * is returned for it.
	 *
	 * @param DOMDocument $document
	 * @return DOMElement|null
	 * @see https://html.spec.whatwg.org/multipage/dom.html#dom-document-head
	 * @suppress PhanUndeclaredProperty
	 */
	public static function getHead( DOMDocument $document ): ?DOMElement {
		// Use an undeclared dynamic property as a cache.
		// WARNING: this will not be updated if (for some reason) the
		// document head changes.
		if ( isset( $document->head ) ) {
			return $document->head;
		}
		// An empty document has no root element; bail out instead of
		// dereferencing null and iterating over a non-iterable value.
		$root = $document->documentElement;
		if ( $root === null ) {
			return null;
		}
		foreach ( $root->childNodes as $element ) {
			/** @var DOMElement $element */
			if ( $element->nodeName === 'head' ) {
				$document->head = $element; // Caching!
				return $element;
			}
		}
		return null;
	}

	/**
	 * Get document title.
	 * Returns the whitespace-stripped and -collapsed text content of the first
	 * element matching the `title` selector, or the empty string if there is none.
	 * @param DOMDocument $document
	 * @return string
	 * @see https://html.spec.whatwg.org/multipage/dom.html#document.title
	 */
	public static function getTitle( DOMDocument $document ): string {
		$titleElement = self::querySelector( $document, 'title' );
		return $titleElement ? self::stripAndCollapseASCIIWhitespace( $titleElement->textContent ) : '';
	}

	/**
	 * Set document title.
	 * If the document has no `title` element, one is created and appended to
	 * the head. If there is no head either, nothing is changed.
	 * @param DOMDocument $document
	 * @param string $title
	 * @see https://html.spec.whatwg.org/multipage/dom.html#document.title
	 */
	public static function setTitle( DOMDocument $document, string $title ): void {
		$titleElement = self::querySelector( $document, 'title' );
		if ( !$titleElement ) {
			$headElement = self::getHead( $document );
			if ( $headElement ) {
				$titleElement = $document->createElement( 'title' );
				$headElement->appendChild( $titleElement );
			}
		}
		if ( $titleElement ) {
			$titleElement->textContent = $title;
		}
	}

	/**
	 * Return the parent element, or null if the parent is not an element.
	 * @param DOMNode $node
	 * @return DOMElement|null
	 * @see https://dom.spec.whatwg.org/#dom-node-parentelement
	 */
	public static function getParentElement( DOMNode $node ): ?DOMElement {
		$parent = $node->parentNode;
		if ( $parent && $parent->nodeType === XML_ELEMENT_NODE ) {
			/** @var DOMElement $parent */
			return $parent;
		}
		return null;
	}

	/**
	 * Return the descendant with the specified ID.
	 * Workaround for https://bugs.php.net/bug.php?id=77686 and other issues related to
	 * inconsistent indexing behavior.
	 * @param DOMDocument|DOMDocumentFragment $node
	 * @param string $id
	 * @return DOMElement|null
	 * @see https://dom.spec.whatwg.org/#dom-nonelementparentnode-getelementbyid
	 */
	public static function getElementById( DOMNode $node, string $id ): ?DOMElement {
		Assert::parameterType( 'DOMDocument|DOMDocumentFragment', $node, '$node' );
		$elements = Zest::getElementsById( $node, $id );
		return $elements[0] ?? null;
	}

	/**
	 * Workaround bug in PHP's Document::getElementById() which doesn't
	 * actually index the 'id' attribute unless you use the non-standard
	 * `DOMElement::setIdAttribute` method after the attribute is set;
	 * see https://www.php.net/manual/en/domdocument.getelementbyid.php
	 * for more details.
	 *
	 * @param DOMElement $element
	 * @param string $id The desired value for the `id` attribute on $element.
	 * @see https://phabricator.wikimedia.org/T232390
	 */
	public static function setIdAttribute( DOMElement $element, string $id ): void {
		$element->setAttribute( 'id', $id );
		$element->setIdAttribute( 'id', true );// phab:T232390
	}

	/**
	 * Workaround bug in PHP's DOMElement::$attributes that fails to enumerate
	 * attributes named `xmlns`.
	 *
	 * When the element has an `xmlns` attribute, a freshly created attribute
	 * node carrying its value is placed first in the result, followed by the
	 * entries of DOMElement::$attributes.
	 *
	 * @param DOMElement $element
	 * @return DOMAttr[]
	 * @see https://phabricator.wikimedia.org/T235295
	 */
	public static function attributes( DOMElement $element ): array {
		$result = [];
		// The 'xmlns' attribute is "invisible" T235295
		if ( $element->hasAttribute( 'xmlns' ) ) {
			// $element->getAttributeNode actually returns a DOMNameSpaceNode
			// This is read-only, unlike the other \DOMAttr objects
			$attr = $element->ownerDocument->createAttributeNS(
				'http://www.w3.org/2000/xmlns/', 'xmlns'
			);
			$attr->value = $element->getAttribute( 'xmlns' );
			$result[] = $attr;
		}
		foreach ( $element->attributes as $attr ) {
			// These are \DOMAttr objects
			$result[] = $attr;
		}
		return $result;
	}

	/**
	 * Workaround bug in PHP's DOMElement::hasAttributes() that fails to
	 * enumerate attributes named `xmlns`.
	 *
	 * @param DOMElement $element
	 * @return bool True if the element has any attributes
	 * @see https://phabricator.wikimedia.org/T235295
	 */
	public static function hasAttributes( DOMElement $element ): bool {
		// The 'xmlns' attribute is "invisible" T235295
		return $element->hasAttributes() || $element->hasAttribute( 'xmlns' );
	}

	/**
	 * Return all descendants with the specified tag name.
	 * Workaround for PHP's getElementsByTagName being inexplicably slow in some situations
	 * and the lack of DOMElement::getElementsByTagName().
	 * @param DOMDocument|DOMElement $node
	 * @param string $tagName
	 * @return DOMNodeList
	 * @see https://dom.spec.whatwg.org/#dom-document-getelementsbytagname
	 * @see https://dom.spec.whatwg.org/#dom-element-getelementsbytagname
	 * @note Note that unlike the spec this method is not guaranteed to return a DOMNodeList
	 *  (which cannot be freely constructed in PHP), just a traversable containing DOMElements.
	 *  NOTE(review): this conflicts with the declared DOMNodeList return type; confirm what
	 *  Zest::getElementsByTagName() actually returns in the version in use.
	 */
	public static function getElementsByTagName( DOMNode $node, string $tagName ): DOMNodeList {
		Assert::parameterType( 'DOMDocument|DOMElement', $node, '$node' );
		return Zest::getElementsByTagName( $node, $tagName );
	}

	/**
	 * Return the last child of the node that is an Element, or null otherwise.
	 * @param DOMDocument|DOMDocumentFragment|DOMElement $node
	 * @return DOMElement|null
	 * @see https://dom.spec.whatwg.org/#dom-parentnode-lastelementchild
	 */
	public static function getLastElementChild( DOMNode $node ): ?DOMElement {
		Assert::parameterType( 'DOMDocument|DOMDocumentFragment|DOMElement', $node, '$node' );
		// Walk backwards from the last child until an element (or the start) is reached.
		$lastChild = $node->lastChild;
		while ( $lastChild && $lastChild->nodeType !== XML_ELEMENT_NODE ) {
			$lastChild = $lastChild->previousSibling;
		}
		return $lastChild;
	}

	/**
	 * Return the first match of querySelectorAll() for the selector, or null
	 * if nothing matches.
	 * @param DOMDocument|DOMDocumentFragment|DOMElement $node
	 * @param string $selector
	 * @return DOMElement|null
	 * @see https://dom.spec.whatwg.org/#dom-parentnode-queryselector
	 */
	public static function querySelector( DOMNode $node, string $selector ): ?DOMElement {
		return self::querySelectorAll( $node, $selector )[0] ?? null;
	}

	/**
	 * Return the elements Zest finds for the selector under the given node.
	 * @param DOMDocument|DOMDocumentFragment|DOMElement $node
	 * @param string $selector
	 * @return DOMElement[]
	 * @see https://dom.spec.whatwg.org/#dom-parentnode-queryselectorall
	 * @note Note that unlike the spec this method does not return a DOMNodeList
	 *  (which cannot be freely constructed in PHP), just an array of DOMElements.
	 */
	public static function querySelectorAll( DOMNode $node, string $selector ): array {
		Assert::parameterType( 'DOMDocument|DOMDocumentFragment|DOMElement', $node, '$node' );
		return Zest::find( $selector, $node );
	}

	/**
	 * Return the last preceding sibling of the node that is an element, or null otherwise.
	 * @param DOMElement|DOMCharacterData $node
	 * @return DOMElement|null
	 * @see https://dom.spec.whatwg.org/#dom-nondocumenttypechildnode-previouselementsibling
	 */
	public static function getPreviousElementSibling( DOMNode $node ): ?DOMElement {
		Assert::parameterType( 'DOMElement|DOMCharacterData', $node, '$node' );
		$previousSibling = $node->previousSibling;
		while ( $previousSibling && $previousSibling->nodeType !== XML_ELEMENT_NODE ) {
			$previousSibling = $previousSibling->previousSibling;
		}
		return $previousSibling;
	}

	/**
	 * Return the first following sibling of the node that is an element, or null otherwise.
	 * @param DOMElement|DOMCharacterData $node
	 * @return DOMElement|null
	 * @see https://dom.spec.whatwg.org/#dom-nondocumenttypechildnode-nextelementsibling
	 */
	public static function getNextElementSibling( DOMNode $node ): ?DOMElement {
		Assert::parameterType( 'DOMElement|DOMCharacterData', $node, '$node' );
		$nextSibling = $node->nextSibling;
		while ( $nextSibling && $nextSibling->nodeType !== XML_ELEMENT_NODE ) {
			$nextSibling = $nextSibling->nextSibling;
		}
		return $nextSibling;
	}

	/**
	 * Removes the node from its parent. Does nothing if the node has no parent.
	 * @param DOMElement|DOMCharacterData $node
	 * @see https://dom.spec.whatwg.org/#dom-childnode-remove
	 */
	public static function remove( DOMNode $node ): void {
		Assert::parameterType( 'DOMElement|DOMCharacterData', $node, '$node' );
		if ( $node->parentNode ) {
			$node->parentNode->removeChild( $node );
		}
	}

	/**
	 * Get innerHTML.
	 * @param DOMElement $element
	 * @return string
	 * @see https://w3c.github.io/DOM-Parsing/#dom-innerhtml-innerhtml
	 */
	public static function getInnerHTML( DOMElement $element ): string {
		return XMLSerializer::serialize( $element, [ 'innerXML' => true ] )['html'];
	}

	/**
	 * Set innerHTML.
	 * Parses $html as a fragment in the context of $element's tag name, then
	 * replaces all existing children of $element with the parsed nodes.
	 * @see https://w3c.github.io/DOM-Parsing/#dom-innerhtml-innerhtml
	 * @param DOMElement $element
	 * @param string $html
	 */
	public static function setInnerHTML( DOMElement $element, string $html ): void {
		$domBuilder = new DOMBuilder( [ 'suppressHtmlNamespace' => true ] );
		$treeBuilder = new TreeBuilder( $domBuilder );
		$dispatcher = new Dispatcher( $treeBuilder );
		$tokenizer = new Tokenizer( $dispatcher, $html, [ 'ignoreErrors' => true ] );
		$tokenizer->execute( [
			'fragmentNamespace' => HTMLData::NS_HTML,
			'fragmentName' => $element->tagName,
		] );
		// Remex returns the document fragment wrapped into a DOMElement
		// because libxml fragment handling is not great.
		// FIXME life would be simpler if we could make DOMBuilder use an existing document
		$documentFragmentWrapper = $element->ownerDocument->importNode(
			$domBuilder->getFragment(), true );

		// Drop the current content before moving the new nodes in.
		while ( $element->firstChild ) {
			$element->removeChild( $element->firstChild );
		}
		// Use an iteration method that's not affected by the tree being modified during iteration
		while ( $documentFragmentWrapper->firstChild ) {
			$element->appendChild( $documentFragmentWrapper->firstChild );
		}
	}

	/**
	 * Get outerHTML.
	 * @param DOMElement $element
	 * @return string
	 * @see https://w3c.github.io/DOM-Parsing/#dom-element-outerhtml
	 */
	public static function getOuterHTML( DOMElement $element ): string {
		return XMLSerializer::serialize( $element, [ 'addDoctype' => false ] )['html'];
	}

	/**
	 * Return the class list of this element.
	 * @param DOMElement $node
	 * @return TokenList
	 * @see https://dom.spec.whatwg.org/#dom-element-classlist
	 */
	public static function getClassList( DOMElement $node ): TokenList {
		return new TokenList( $node );
	}

	/**
	 * Trim ASCII whitespace from both ends of the text and replace every
	 * remaining run of ASCII whitespace with a single space.
	 * @param string $text
	 * @return string
	 * @see https://infra.spec.whatwg.org/#strip-and-collapse-ascii-whitespace
	 */
	private static function stripAndCollapseASCIIWhitespace( string $text ): string {
		$ws = self::$ASCII_WHITESPACE;
		return preg_replace( "/[$ws]+/", ' ', trim( $text, $ws ) );
	}

	/**
	 * Recursively remove text nodes whose value is the empty string from the
	 * subtree rooted at the given element.
	 * @param DOMElement $e
	 */
	private static function stripEmptyTextNodes( DOMElement $e ): void {
		$c = $e->firstChild;
		while ( $c ) {
			// Grab the sibling first: $c may be detached below.
			$next = $c->nextSibling;
			if ( $c instanceof DOMText ) {
				if ( $c->nodeValue === '' ) {
					$e->removeChild( $c );
				}
			} elseif ( $c instanceof DOMElement ) {
				self::stripEmptyTextNodes( $c );
			}
			$c = $next;
		}
	}

	/**
	 * Normalize the tree rooted at the element and strip leftover empty text nodes.
	 * @param DOMElement $elt root of the DOM tree that needs to be normalized
	 */
	public static function normalize( DOMElement $elt ): void {
		$elt->normalize();

		// Now traverse the tree rooted at $elt and remove any stray empty text nodes
		// Unlike what https://www.w3.org/TR/DOM-Level-2-Core/core.html#ID-normalize says,
		// the PHP DOM's normalization leaves behind up to 1 empty text node.
		// See https://bugs.php.net/bug.php?id=78221
		self::stripEmptyTextNodes( $elt );
	}
}