<?php

namespace Wikimedia\RemexHtml\DOM;

use Wikimedia\RemexHtml\HTMLData;
use Wikimedia\RemexHtml\Tokenizer\Attributes;
use Wikimedia\RemexHtml\TreeBuilder\Element;
use Wikimedia\RemexHtml\TreeBuilder\TreeBuilder;
use Wikimedia\RemexHtml\TreeBuilder\TreeHandler;

/**
 * A TreeHandler which constructs a DOMDocument.
 *
 * Note that this class permits third-party `DOMImplementation`s
 * (documents other than `\DOMDocument`, nodes other than `\DOMNode`,
 * etc) and so no enforced PHP type hints are used which name these
 * classes directly.  For the sake of static type checking, the
 * types *in comments* are given as if the standard PHP `\DOM*`
 * classes are being used but at runtime everything is duck-typed.
 */
class DOMBuilder implements TreeHandler {

	/** @var string|null The name of the input document type */
	public $doctypeName;

	/** @var string|null The public ID */
	public $public;

	/** @var string|null The system ID */
	public $system;

	/**
	 * @var int The quirks mode. May be either TreeBuilder::NO_QUIRKS,
	 *   TreeBuilder::LIMITED_QUIRKS or TreeBuilder::QUIRKS to indicate
	 *   no-quirks mode, limited-quirks mode or quirks mode respectively.
	 */
	public $quirks;

	/** @var \DOMDocument */
	private $doc;

	/** @var callable|null */
	private $errorCallback;

	/** @var bool */
	private $suppressHtmlNamespace;

	/** @var bool */
	private $suppressIdAttribute;

	/** @var \DOMImplementation */
	private $domImplementation;

	/** @var class-string */
	private $domExceptionClass;

	/** @var bool */
	private $isFragment;

	/** @var bool */
	private $coerced = false;

	/**
	 * @param array $options An associative array of options:
	 *   - errorCallback : A function which is called on parse errors
	 *   - suppressHtmlNamespace : omit the namespace when creating HTML
	 *     elements. False by default.
	 *   - suppressIdAttribute : don't call the nonstandard
	 *     DOMElement::setIdAttribute() method while constructing elements.
	 *     False by default (this method is needed for efficient
	 *     DOMDocument::getElementById() calls).  Set to true if you are
	 *     using a W3C spec-compliant DOMImplementation and wish to avoid
	 *     nonstandard calls.
	 *   - domImplementation: The DOMImplementation object to use.  If this
	 *     parameter is missing or null, a new DOMImplementation object will
	 *     be constructed using the `domImplementationClass` option value.
	 *     You can use a third-party DOM implementation by passing in an
	 *     appropriately duck-typed object here.
	 *   - domImplementationClass: The string name of the DOMImplementation
	 *     class to use.  Defaults to `\DOMImplementation::class` but
	 *     you can use a third-party DOM implementation by passing
	 *     an alternative class name here.
	 *   - domExceptionClass: The string name of the DOMException
	 *     class to use.  Defaults to `\DOMException::class` but
	 *     you can use a third-party DOM implementation by passing
	 *     an alternative class name here.
	 */
	public function __construct( $options = [] ) {
		// Array union keeps caller-supplied keys and fills in the rest.
		$options += [
			'suppressHtmlNamespace' => false,
			'suppressIdAttribute' => false,
			'errorCallback' => null,
			'domImplementation' => null,
			'domImplementationClass' => \DOMImplementation::class,
			'domExceptionClass' => \DOMException::class,
		];
		$this->errorCallback = $options['errorCallback'];
		$this->suppressHtmlNamespace = $options['suppressHtmlNamespace'];
		$this->suppressIdAttribute = $options['suppressIdAttribute'];
		$this->domImplementation = $options['domImplementation'] ??
			new $options['domImplementationClass'];
		$this->domExceptionClass = $options['domExceptionClass'];
	}

	/**
	 * Return normally if the throwable is an instance of the configured
	 * DOM exception class; otherwise rethrow it.
	 *
	 * This lets callers write `catch ( \Throwable $e )` and still only
	 * handle the (possibly third-party) DOM exception type.
	 *
	 * @param \Throwable $t
	 * @throws \Throwable If $t is not an instance of the DOM exception class
	 */
	private function rethrowIfNotDomException( \Throwable $t ) {
		if ( is_a( $t, $this->domExceptionClass, false ) ) {
			return;
		}
		throw $t;
	}

	/**
	 * Get the constructed document or document fragment. In the fragment case,
	 * a DOMElement is returned, and the caller is expected to extract its
	 * inner contents, ignoring the wrapping element. This convention is
	 * convenient because the wrapping element gives libxml somewhere to put
	 * its namespace declarations. If we copied the children into a
	 * DOMDocumentFragment, libxml would invent new prefixes for the orphaned
	 * namespaces.
	 *
	 * @return \DOMNode
	 */
	public function getFragment() {
		if ( $this->isFragment ) {
			return $this->doc->documentElement;
		} else {
			return $this->doc;
		}
	}

	/**
	 * Returns true if the document was coerced due to libxml limitations. We
	 * follow HTML 5.1 § 8.2.7 "Coercing an HTML DOM into an infoset".
	 *
	 * @return bool
	 */
	public function isCoerced() {
		return $this->coerced;
	}

	/**
	 * Begin a new document: record whether a fragment is being built and
	 * create an empty document with no doctype.
	 *
	 * @param string|null $fragmentNamespace Non-null when building a fragment
	 * @param string|null $fragmentName Not used by this handler
	 */
	public function startDocument( $fragmentNamespace, $fragmentName ) {
		$this->isFragment = $fragmentNamespace !== null;
		$this->doc = $this->createDocument();
	}

	/**
	 * Create a new, empty document, optionally with a doctype.
	 *
	 * An empty-string doctype name produces a document without a doctype
	 * and marks the result as coerced. A null name produces a document
	 * without a doctype.
	 *
	 * @param string|null $doctypeName
	 * @param string|null $public
	 * @param string|null $system
	 * @return \DOMDocument
	 * @suppress PhanTypeMismatchArgumentInternalReal
	 *   Null args to DOMImplementation::createDocument
	 */
	protected function createDocument(
		?string $doctypeName = null,
		?string $public = null,
		?string $system = null
	) {
		$impl = $this->domImplementation;
		if ( $doctypeName === '' ) {
			$this->coerced = true;
			$doc = $impl->createDocument( null, '' );
		} elseif ( $doctypeName === null ) {
			$doc = $impl->createDocument( null, '' );
		} else {
			// createDocumentType() takes non-nullable string IDs, so a
			// missing public/system ID is passed as the empty string.
			$doctype = $impl->createDocumentType(
				$doctypeName,
				$public ?? '',
				$system ?? ''
			);
			$doc = $impl->createDocument( null, '', $doctype );
		}
		$doc->encoding = 'UTF-8';
		return $doc;
	}

	/**
	 * Called at the end of the document. This handler does nothing here.
	 *
	 * @param int $pos
	 */
	public function endDocument( $pos ) {
	}

	/**
	 * Translate a TreeBuilder insertion location into the arguments
	 * needed for insertBefore().
	 *
	 * @param int $preposition TreeBuilder::ROOT, TreeBuilder::BEFORE, or
	 *   any other value, which is treated as "append under $refElement"
	 * @param Element|null $refElement The reference element; its node is
	 *   read from userData. Not read when $preposition is ROOT.
	 * @return array A two-element list: the parent node, and the node to
	 *   insert before (null to append as the last child)
	 */
	private function resolveInsertionPoint( $preposition, $refElement ): array {
		if ( $preposition === TreeBuilder::ROOT ) {
			return [ $this->doc, null ];
		}
		if ( $preposition === TreeBuilder::BEFORE ) {
			return [ $refElement->userData->parentNode, $refElement->userData ];
		}
		return [ $refElement->userData, null ];
	}

	/**
	 * Insert a node at the location described by $preposition and
	 * $refElement.
	 *
	 * @param int $preposition
	 * @param Element|null $refElement
	 * @param \DOMNode $node
	 */
	private function insertNode( $preposition, $refElement, $node ) {
		[ $parent, $refNode ] = $this->resolveInsertionPoint( $preposition, $refElement );
		// @phan-suppress-next-line PhanTypeMismatchArgumentInternal
		$parent->insertBefore( $node, $refNode );
	}

	/**
	 * Replace unsupported characters with a code of the form U123456.
	 *
	 * Sets the coerced flag if the name had to be changed.
	 *
	 * @param string $name
	 * @return string
	 */
	private function coerceName( $name ) {
		$coercedName = DOMUtils::coerceName( $name );
		if ( $name !== $coercedName ) {
			$this->coerced = true;
		}
		return $coercedName;
	}

	/**
	 * Create the DOM node for an Element, copy the Element's attributes
	 * onto it, and store the node in $element->userData.
	 *
	 * If the DOM implementation rejects an element or attribute name with
	 * its DOM exception class, the operation is retried with a coerced
	 * name.
	 *
	 * @param Element $element
	 * @return \DOMElement The newly created node
	 */
	protected function createNode( Element $element ) {
		$noNS = $this->suppressHtmlNamespace && $element->namespace === HTMLData::NS_HTML;
		try {
			if ( $noNS ) {
				$node = $this->doc->createElement( $element->name );
			} else {
				$node = $this->doc->createElementNS(
					$element->namespace,
					$element->name );
			}
		} catch ( \Throwable $e ) {
			$this->rethrowIfNotDomException( $e );
			'@phan-var \DOMException $e'; /** @var \DOMException $e */
			// Attempt to escape the name so that it is more acceptable
			if ( $noNS ) {
				$node = $this->doc->createElement(
					$this->coerceName( $element->name )
				);
			} else {
				$node = $this->doc->createElementNS(
					$element->namespace,
					$this->coerceName( $element->name ) );
			}
		}

		foreach ( $element->attrs->getObjects() as $attr ) {
			if ( $attr->namespaceURI === null
				&& strpos( $attr->localName, ':' ) !== false
			) {
				// FIXME: this apparently works to create a prefixed localName
				// in the null namespace, but this is probably taking advantage
				// of a bug in PHP's DOM library, and screws up in various
				// interesting ways. For example, attributes created in this
				// way can't be discovered via hasAttribute() or hasAttributeNS().
				$attrNode = $this->doc->createAttribute( $attr->localName );
				$attrNode->value = $attr->value;
				try {
					$node->setAttributeNodeNS( $attrNode );
				} catch ( \Throwable $e ) {
					$this->rethrowIfNotDomException( $e );
					'@phan-var \DOMException $e'; /** @var \DOMException $e */
					$node->setAttributeNS(
						$attr->namespaceURI,
						$this->coerceName( $attr->qualifiedName ),
						$attr->value );
				}
			} else {
				try {
					$node->setAttributeNS(
						$attr->namespaceURI,
						$attr->qualifiedName,
						$attr->value );
				} catch ( \Throwable $e ) {
					$this->rethrowIfNotDomException( $e );
					'@phan-var \DOMException $e'; /** @var \DOMException $e */
					$node->setAttributeNS(
						$attr->namespaceURI,
						$this->coerceName( $attr->qualifiedName ),
						$attr->value );
				}
			}
		}
		if ( ( !$this->suppressIdAttribute ) && $node->hasAttribute( 'id' ) ) {
			// This is a call to a non-standard DOM method required by PHP in
			// order to implement DOMDocument::getElementById() efficiently.
			$node->setIdAttribute( 'id', true );
		}
		$element->userData = $node;
		return $node;
	}

	/**
	 * Insert character data at the given location, appending to an
	 * immediately preceding text node if there is one.
	 *
	 * @param int $preposition
	 * @param Element|null $refElement
	 * @param string $text The buffer containing the data
	 * @param int $start Byte offset of the data within $text
	 * @param int $length Byte length of the data
	 * @param int $sourceStart Not used by this handler
	 * @param int $sourceLength Not used by this handler
	 */
	public function characters( $preposition, $refElement, $text, $start, $length,
		$sourceStart, $sourceLength
	) {
		[ $parent, $refNode ] = $this->resolveInsertionPoint( $preposition, $refElement );
		// https://html.spec.whatwg.org/#insert-a-character
		// If the adjusted insertion location is in a Document node, then
		// return.
		if ( $parent === $this->doc ) {
			return;
		}
		$data = substr( $text, $start, $length );
		// If there is a Text node immediately before the adjusted insertion
		// location, then append data to that Text node's data.
		if ( $refNode === null ) {
			$prev = $parent->lastChild;
		} else {
			/** @var \DOMNode $refNode */
			$prev = $refNode->previousSibling;
		}
		if ( $prev !== null && $prev->nodeType === XML_TEXT_NODE ) {
			'@phan-var \DOMCharacterData $prev'; /** @var \DOMCharacterData $prev */
			$prev->appendData( $data );
		} else {
			$node = $this->doc->createTextNode( $data );
			// @phan-suppress-next-line PhanTypeMismatchArgumentNullableInternal
			$parent->insertBefore( $node, $refNode );
		}
	}

	/**
	 * Insert an element at the given location. The element's existing
	 * node (userData) is reused if it has one; otherwise a node is
	 * created for it.
	 *
	 * @param int $preposition
	 * @param Element|null $refElement
	 * @param Element $element
	 * @param bool $void Not used by this handler
	 * @param int $sourceStart Not used by this handler
	 * @param int $sourceLength Not used by this handler
	 */
	public function insertElement( $preposition, $refElement, Element $element, $void,
		$sourceStart, $sourceLength
	) {
		if ( $element->userData ) {
			$node = $element->userData;
		} else {
			$node = $this->createNode( $element );
		}
		$this->insertNode( $preposition, $refElement, $node );
	}

	/**
	 * Called when an element is closed. This handler does nothing here.
	 *
	 * @param Element $element
	 * @param int $sourceStart
	 * @param int $sourceLength
	 */
	public function endTag( Element $element, $sourceStart, $sourceLength ) {
	}

	/**
	 * Record the doctype details and, if the document has no children
	 * yet, replace it with a new document created with this doctype.
	 *
	 * @param string|null $name
	 * @param string|null $public
	 * @param string|null $system
	 * @param int $quirks
	 * @param int $sourceStart Not used by this handler
	 * @param int $sourceLength Not used by this handler
	 */
	public function doctype( $name, $public, $system, $quirks, $sourceStart, $sourceLength ) {
		if ( !$this->doc->firstChild ) {
			$this->doc = $this->createDocument( $name, $public, $system );
		}
		$this->doctypeName = $name;
		$this->public = $public;
		$this->system = $system;
		$this->quirks = $quirks;
	}

	/**
	 * Insert a comment node at the given location.
	 *
	 * @param int $preposition
	 * @param Element|null $refElement
	 * @param string $text The comment contents
	 * @param int $sourceStart Not used by this handler
	 * @param int $sourceLength Not used by this handler
	 */
	public function comment( $preposition, $refElement, $text, $sourceStart, $sourceLength ) {
		$node = $this->doc->createComment( $text );
		$this->insertNode( $preposition, $refElement, $node );
	}

	/**
	 * Forward a parse error to the configured error callback, if any.
	 *
	 * @param string $text
	 * @param int $pos
	 */
	public function error( $text, $pos ) {
		if ( $this->errorCallback ) {
			( $this->errorCallback )( $text, $pos );
		}
	}

	/**
	 * Add attributes to an element's existing node. Attributes which are
	 * already present on the node keep their current value.
	 *
	 * If the DOM implementation rejects an attribute name with its DOM
	 * exception class, the operation is retried with a coerced name.
	 *
	 * @param Element $element
	 * @param Attributes $attrs
	 * @param int $sourceStart Not used by this handler
	 */
	public function mergeAttributes( Element $element, Attributes $attrs, $sourceStart ) {
		$node = $element->userData;
		'@phan-var \DOMElement $node'; /** @var \DOMElement $node */
		foreach ( $attrs->getObjects() as $attr ) {
			if ( $attr->namespaceURI === null
				&& strpos( $attr->localName, ':' ) !== false
			) {
				// As noted in createNode(), we can't use hasAttribute() here.
				// However, we can use the return value of setAttributeNodeNS()
				// instead.
				$attrNode = $this->doc->createAttribute( $attr->localName );
				$attrNode->value = $attr->value;
				try {
					$replaced = $node->setAttributeNodeNS( $attrNode );
				} catch ( \Throwable $e ) {
					$this->rethrowIfNotDomException( $e );
					'@phan-var \DOMException $e'; /** @var \DOMException $e */
					$attrNode = $this->doc->createAttribute(
						$this->coerceName( $attr->localName ) );
					$attrNode->value = $attr->value;
					$replaced = $node->setAttributeNodeNS( $attrNode );
				}
				if ( $replaced ) {
					// Put it back how it was
					$node->setAttributeNodeNS( $replaced );
				}
			} elseif ( $attr->namespaceURI === null ) {
				try {
					if ( !$node->hasAttribute( $attr->localName ) ) {
						$node->setAttribute( $attr->localName, $attr->value );
					}
				} catch ( \Throwable $e ) {
					$this->rethrowIfNotDomException( $e );
					'@phan-var \DOMException $e'; /** @var \DOMException $e */
					$name = $this->coerceName( $attr->localName );
					if ( !$node->hasAttribute( $name ) ) {
						$node->setAttribute( $name, $attr->value );
					}
				}
			} else {
				try {
					if ( !$node->hasAttributeNS( $attr->namespaceURI, $attr->localName ) ) {
						$node->setAttributeNS( $attr->namespaceURI,
							$attr->localName, $attr->value );
					}
				} catch ( \Throwable $e ) {
					$this->rethrowIfNotDomException( $e );
					'@phan-var \DOMException $e'; /** @var \DOMException $e */
					$name = $this->coerceName( $attr->localName );
					if ( !$node->hasAttributeNS( $attr->namespaceURI, $name ) ) {
						$node->setAttributeNS( $attr->namespaceURI, $name, $attr->value );
					}
				}
			}
		}
	}

	/**
	 * Detach an element's node from its parent node.
	 *
	 * @param Element $element An element whose node currently has a parent
	 * @param int $sourceStart Not used by this handler
	 */
	public function removeNode( Element $element, $sourceStart ) {
		$node = $element->userData;
		$node->parentNode->removeChild( $node );
	}

	/**
	 * Insert $newParent as the last child of $element, then move every
	 * child of $element which precedes it into $newParent, preserving
	 * order.
	 *
	 * @param Element $element
	 * @param Element $newParent
	 * @param int $sourceStart
	 */
	public function reparentChildren( Element $element, Element $newParent, $sourceStart ) {
		$this->insertElement( TreeBuilder::UNDER, $element, $newParent, false, $sourceStart, 0 );
		$node = $element->userData;
		/** @var \DOMElement $newParentNode */
		$newParentNode = $newParent->userData;
		// $newParentNode is now the last child, so the loop stops once it
		// has become the first (and only) child.
		while ( $node->firstChild !== $newParentNode ) {
			$newParentNode->appendChild( $node->firstChild );
		}
	}
}

// Retain the old namespace for backwards compatibility.
class_alias( DOMBuilder::class, 'RemexHtml\DOM\DOMBuilder' );