<?php
declare( strict_types = 1 );

namespace Wikimedia\Parsoid\Utils;

use Composer\Semver\Semver;
use stdClass;
use Wikimedia\Assert\Assert;
use Wikimedia\Parsoid\Config\Env;
use Wikimedia\Parsoid\Core\DataParsoid;
use Wikimedia\Parsoid\Core\DomSourceRange;
use Wikimedia\Parsoid\DOM\Document;
use Wikimedia\Parsoid\DOM\Element;
use Wikimedia\Parsoid\DOM\Node;
use Wikimedia\Parsoid\Tokens\SourceRange;

/**
 * These helpers pertain to HTML and data attributes of a node.
 *
 * Per-node data (data-parsoid, data-mw, data-parsoid-diff) is kept in a
 * stdClass object stashed in a per-document DataBag; the node only carries
 * the id of that object in its `data-object-id` attribute.
 */
class DOMDataUtils {
	public const DATA_OBJECT_ATTR_NAME = 'data-object-id';

	/**
	 * Return the dynamic "bag" property of a Document.
	 *
	 * The property is assigned by prepareDoc().
	 *
	 * @param Document $doc
	 * @return DataBag
	 */
	private static function getBag( Document $doc ): DataBag {
		// This is a dynamic property; it is not declared.
		// All references go through here so we can suppress phan's complaint.
		// @phan-suppress-next-line PhanUndeclaredProperty
		return $doc->bag;
	}

	/**
	 * Attach a fresh DataBag to the document and prime the head/body lookups.
	 *
	 * @param Document $doc
	 */
	public static function prepareDoc( Document $doc ): void {
		// `bag` is a deliberate dynamic property; see DOMDataUtils::getBag()
		// @phan-suppress-next-line PhanUndeclaredProperty dynamic property
		$doc->bag = new DataBag();

		// Cache the head and body.
		DOMCompat::getHead( $doc );
		DOMCompat::getBody( $doc );
	}

	/**
	 * Stash $obj in $doc and return an id for later retrieval
	 *
	 * @param Document $doc
	 * @param stdClass $obj
	 * @return int
	 */
	public static function stashObjectInDoc( Document $doc, stdClass $obj ): int {
		return self::getBag( $doc )->stashObject( $obj );
	}

	/**
	 * Does this node have any attributes?
	 *
	 * The `data-object-id` bookkeeping attribute is not counted.
	 *
	 * @param Element $node
	 * @return bool
	 */
	public static function noAttrs( Element $node ): bool {
		$numAttrs = count( DOMCompat::attributes( $node ) );
		return $numAttrs === 0 ||
			( $numAttrs === 1 && $node->hasAttribute( self::DATA_OBJECT_ATTR_NAME ) );
	}

	/**
	 * Get data object from a node.
	 *
	 * If the node has no `data-object-id` attribute yet, an empty object is
	 * created, stashed in the document and returned.
	 *
	 * @param Element $node node
	 * @return stdClass
	 */
	public static function getNodeData( Element $node ): stdClass {
		if ( !$node->hasAttribute( self::DATA_OBJECT_ATTR_NAME ) ) {
			// Initialized on first request
			$dataObject = new stdClass;
			self::setNodeData( $node, $dataObject );
			return $dataObject;
		}

		$docId = $node->getAttribute( self::DATA_OBJECT_ATTR_NAME );
		if ( $docId !== '' ) {
			$dataObject = self::getBag( $node->ownerDocument )->getObject( (int)$docId );
		} else {
			$dataObject = null; // Make phan happy
		}
		Assert::invariant( isset( $dataObject ), 'Bogus docId given!' );
		'@phan-var stdClass $dataObject'; // @var stdClass $dataObject
		if ( isset( $dataObject->storedId ) ) {
			PHPUtils::unreachable(
				'Trying to fetch node data without loading! ' .
				// If this node's data-object id is different from storedId,
				// it will indicate that the data-parsoid object was shared
				// between nodes without getting cloned. Useful for debugging.
				'Node id: ' . $node->getAttribute( self::DATA_OBJECT_ATTR_NAME ) .
				' Stored data: ' . PHPUtils::jsonEncode( $dataObject )
			);
		}
		return $dataObject;
	}

	/**
	 * Set node data.
	 *
	 * Stashes $data in the node's owner document and records the resulting
	 * id in the node's `data-object-id` attribute.
	 *
	 * @param Element $node node
	 * @param stdClass $data data
	 */
	public static function setNodeData( Element $node, stdClass $data ): void {
		$docId = self::stashObjectInDoc( $node->ownerDocument, $data );
		$node->setAttribute( self::DATA_OBJECT_ATTR_NAME, (string)$docId );
	}

	/**
	 * Get data parsoid info from a node.
	 *
	 * Both the data-parsoid object and its `tmp` sub-object are created on
	 * demand, so the result always has a `tmp` property.
	 *
	 * @param Element $node node
	 * @return DataParsoid
	 */
	public static function getDataParsoid( Element $node ): stdClass {
		$data = self::getNodeData( $node );
		if ( !isset( $data->parsoid ) ) {
			$data->parsoid = new stdClass;
		}
		if ( !isset( $data->parsoid->tmp ) ) {
			$data->parsoid->tmp = new stdClass;
		}
		return $data->parsoid;
	}

	/**
	 * Set data parsoid info on a node.
	 *
	 * @param Element $node node
	 * @param stdClass $dp data-parsoid
	 */
	public static function setDataParsoid( Element $node, stdClass $dp ): void {
		$data = self::getNodeData( $node );
		$data->parsoid = $dp;
	}

	/**
	 * Get data diff info from a node.
	 *
	 * @param Element $node node
	 * @return ?stdClass null when no diff info has been set
	 */
	public static function getDataParsoidDiff( Element $node ): ?stdClass {
		$data = self::getNodeData( $node );
		// We won't set a default value for this property
		return $data->parsoid_diff ?? null;
	}

	/**
	 * Set data diff info on a node.
	 *
	 * @param Element $node node
	 * @param ?stdClass $diffObj data-parsoid-diff object
	 */
	public static function setDataParsoidDiff( Element $node, ?stdClass $diffObj ): void {
		$data = self::getNodeData( $node );
		$data->parsoid_diff = $diffObj;
	}

	/**
	 * Get data meta wiki info from a node.
	 *
	 * An empty object is created (and stored on the node) when none is set.
	 *
	 * @param Element $node node
	 * @return stdClass
	 */
	public static function getDataMw( Element $node ): stdClass {
		$data = self::getNodeData( $node );
		if ( !isset( $data->mw ) ) {
			$data->mw = new stdClass;
		}
		return $data->mw;
	}

	/**
	 * Set data meta wiki info on a node.
	 *
	 * @param Element $node node
	 * @param ?stdClass $dmw data-mw
	 */
	public static function setDataMw( Element $node, ?stdClass $dmw ): void {
		$data = self::getNodeData( $node );
		$data->mw = $dmw;
	}

	/**
	 * Check if there is meta wiki info in a node.
	 *
	 * @param Element $node node
	 * @return bool true when the node's data-mw object has at least one property
	 */
	public static function validDataMw( Element $node ): bool {
		return (array)self::getDataMw( $node ) !== [];
	}

	/**
	 * Get an object from a JSON-encoded XML attribute on a node.
	 *
	 * @param Element $node node
	 * @param string $name name
	 * @param mixed $defaultVal returned when the attribute is absent or
	 *   does not decode to a non-null value
	 * @return mixed
	 */
	public static function getJSONAttribute( Element $node, string $name, $defaultVal ) {
		if ( !$node->hasAttribute( $name ) ) {
			return $defaultVal;
		}
		$attVal = $node->getAttribute( $name );
		$decoded = PHPUtils::jsonDecode( $attVal, false );
		if ( $decoded !== null ) {
			return $decoded;
		} else {
			error_log( 'ERROR: Could not decode attribute-val ' . $attVal .
				' for ' . $name . ' on node ' . DOMCompat::nodeName( $node ) );
			return $defaultVal;
		}
	}

	/**
	 * Set an attribute on a node with a JSON-encoded object.
	 *
	 * @param Element $node node
	 * @param string $name Name of the attribute.
	 * @param mixed $obj value of the attribute to encode
	 */
	public static function setJSONAttribute( Element $node, string $name, $obj ): void {
		// An empty PHP array is written out as an empty JSON object.
		$val = $obj === [] ? '{}' : PHPUtils::jsonEncode( $obj );
		$node->setAttribute( $name, $val );
	}

	/**
	 * Set shadow info on a node; similar to the method on tokens.
	 * Records a key = value pair in data-parsoid['a'] property.
	 *
	 * This is effectively a call of 'setShadowInfoIfModified' except
	 * there is no original value, so by definition, $val is modified.
	 *
	 * @param Element $node node
	 * @param string $name Name of the attribute.
	 * @param mixed $val val
	 */
	public static function setShadowInfo( Element $node, string $name, $val ): void {
		$dp = self::getDataParsoid( $node );
		if ( !isset( $dp->a ) ) {
			$dp->a = [];
		}
		if ( !isset( $dp->sa ) ) {
			$dp->sa = [];
		}
		$dp->a[$name] = $val;
	}

	/**
	 * Set shadow info on a node; similar to the method on tokens.
	 *
	 * If the new value ($val) for the key ($name) is different from the
	 * original value ($origVal):
	 * - the new value is recorded in data-parsoid->a and
	 * - the original value is recorded in data-parsoid->sa
	 *
	 * @param Element $node node
	 * @param string $name Name of the attribute.
	 * @param mixed $val val
	 * @param mixed $origVal original value (null is a valid value)
	 * @param bool $skipOrig when true, $val is always recorded in ->a and
	 *   $origVal is never recorded in ->sa
	 */
	public static function setShadowInfoIfModified(
		Element $node, string $name, $val, $origVal, bool $skipOrig = false
	): void {
		if ( !$skipOrig && ( $val === $origVal || $origVal === null ) ) {
			return;
		}
		$dp = self::getDataParsoid( $node );
		if ( !isset( $dp->a ) ) {
			$dp->a = [];
		}
		if ( !isset( $dp->sa ) ) {
			$dp->sa = [];
		}
		// FIXME: This is a hack to not overwrite already shadowed info.
		// We should either fix the call site that depends on this
		// behaviour to do an explicit check, or double down on this
		// by porting it to the token method as well.
		if ( !$skipOrig && !array_key_exists( $name, $dp->a ) ) {
			$dp->sa[$name] = $origVal;
		}
		$dp->a[$name] = $val;
	}

	/**
	 * Set an attribute and shadow info to a node.
	 * Similar to the method on tokens
	 *
	 * @param Element $node node
	 * @param string $name Name of the attribute.
	 * @param mixed $val value
	 * @param mixed $origVal original value
	 * @param bool $skipOrig
	 */
	public static function addNormalizedAttribute(
		Element $node, string $name, $val, $origVal, bool $skipOrig = false
	): void {
		if ( $name === 'id' ) {
			DOMCompat::setIdAttribute( $node, $val );
		} else {
			$node->setAttribute( $name, $val );
		}
		self::setShadowInfoIfModified( $node, $name, $val, $origVal, $skipOrig );
	}

	/**
	 * Get this document's pagebundle object
	 *
	 * @param Document $doc
	 * @return stdClass
	 */
	public static function getPageBundle( Document $doc ): stdClass {
		return self::getBag( $doc )->getPageBundle();
	}

	/**
	 * Records a node's data in the document's page bundle, keyed by the
	 * node's id attribute. When the node has no id, or its id is already
	 * present in the page bundle, a new id is generated with the following
	 * format:
	 * ```
	 * mw<base64-encoded counter>
	 * ```
	 * and set on the node; otherwise the user defined id is kept.
	 *
	 * @param Element $node node
	 * @param Env $env environment
	 * @param stdClass $data data; `parsoid` is stored under the node's id,
	 *   and `mw` is stored as well when it is set
	 * @param array $idIndex Index of used id attributes in the DOM
	 */
	public static function storeInPageBundle(
		Element $node, Env $env, stdClass $data, array $idIndex
	): void {
		$uid = $node->getAttribute( 'id' ) ?? '';
		$document = $node->ownerDocument;
		$pb = self::getPageBundle( $document );
		$docDp = $pb->parsoid;
		$origId = $uid ?: null;
		if ( array_key_exists( $uid, $docDp->ids ) ) {
			$uid = null;
			// FIXME: Protect mw ids while tokenizing to avoid false positives.
			$env->log( 'info', 'Wikitext for this page has duplicate ids: ' . $origId );
		}
		if ( !$uid ) {
			do {
				$docDp->counter += 1;
				// PORT-FIXME: NOTE that we aren't updating the idIndex here because
				// we are generating unique ids that will not conflict. In any case,
				// the idIndex is a workaround for the PHP DOM's issues and we might
				// switch out of this in the future anyway.
				$uid = 'mw' . PHPUtils::counterToBase64( $docDp->counter );
			} while ( isset( $idIndex[$uid] ) );
			self::addNormalizedAttribute( $node, 'id', $uid, $origId );
		}
		// storeDataAttribs() can hand us an object that only carries `mw`
		// (data-parsoid discarded, data-mw destined for the page bundle).
		// Read `parsoid` defensively so that case stores null without
		// raising an undefined-property warning.
		$docDp->ids[$uid] = $data->parsoid ?? null;
		if ( isset( $data->mw ) ) {
			$pb->mw->ids[$uid] = $data->mw;
		}
	}

	/**
	 * Serialize $obj as JSON into a
	 * `<script id="mw-pagebundle" type="application/x-mw-pagebundle">`
	 * element appended to the document head.
	 *
	 * @param Document $doc doc
	 * @param stdClass $obj object
	 */
	public static function injectPageBundle( Document $doc, stdClass $obj ): void {
		$pb = PHPUtils::jsonEncode( $obj );
		$script = $doc->createElement( 'script' );
		DOMCompat::setIdAttribute( $script, 'mw-pagebundle' );
		$script->setAttribute( 'type', 'application/x-mw-pagebundle' );
		$script->appendChild( $doc->createTextNode( $pb ) );
		DOMCompat::getHead( $doc )->appendChild( $script );
	}

	/**
	 * Remove the element with id `mw-pagebundle` from the document, if
	 * present, and return its JSON-decoded text content.
	 *
	 * @param Document $doc doc
	 * @return stdClass|null null when the document has no such element
	 */
	public static function extractPageBundle( Document $doc ): ?stdClass {
		$pb = null;
		$dpScriptElt = DOMCompat::getElementById( $doc, 'mw-pagebundle' );
		if ( $dpScriptElt ) {
			$dpScriptElt->parentNode->removeChild( $dpScriptElt );
			$pb = PHPUtils::jsonDecode( $dpScriptElt->textContent, false );
		}
		return $pb;
	}

	/**
	 * Walk DOM from node downward calling loadDataAttribs
	 *
	 * @param Node $node node
	 * @param array $options options
	 */
	public static function visitAndLoadDataAttribs( Node $node, array $options = [] ): void {
		DOMUtils::visitDOM( $node, [ self::class, 'loadDataAttribs' ], $options );
	}

	/**
	 * Massage the data parsoid object loaded from a node attribute
	 * into expected shape. When we create a first-class object for
	 * data-parsoid, this will move into the constructor.
	 *
	 * @param stdClass $dp modified in place
	 * @param array $options
	 * @param ?Element $node dereferenced when $options['markNew'] is set,
	 *   so it must be non-null in that case
	 */
	public static function massageLoadedDataParsoid(
		stdClass $dp, array $options = [], ?Element $node = null
	): void {
		if ( isset( $dp->sa ) ) {
			$dp->sa = (array)$dp->sa;
		}
		if ( isset( $dp->a ) ) {
			$dp->a = (array)$dp->a;
		}
		if ( isset( $dp->dsr ) ) {
			$dp->dsr = DomSourceRange::fromArray( $dp->dsr );
		}
		if ( isset( $dp->tsr ) ) {
			// tsr is generally for tokens, not DOM trees.
			$dp->tsr = SourceRange::fromArray( $dp->tsr );
		}
		if ( isset( $dp->extTagOffsets ) ) {
			$dp->extTagOffsets = DomSourceRange::fromArray( $dp->extTagOffsets );
		}
		if ( isset( $dp->extLinkContentOffsets ) ) {
			$dp->extLinkContentOffsets =
				SourceRange::fromArray( $dp->extLinkContentOffsets );
		}
		if ( !empty( $options['markNew'] ) ) {
			$dp->tmp = PHPUtils::arrayToObject( $dp->tmp ?? [] );
			$dp->tmp->isNew = !$node->hasAttribute( 'data-parsoid' );
		}
		if ( isset( $dp->optList ) ) {
			foreach ( $dp->optList as &$item ) {
				$item = (array)$item;
			}
			// Break the reference so the last element is not left aliased.
			unset( $item );
		}
	}

	/**
	 * These are intended to be used on a document after post-processing, so that
	 * the underlying .dataobject is transparently applied (in the store case)
	 * and reloaded (in the load case), rather than worrying about keeping
	 * the attributes up-to-date throughout that phase. For the most part,
	 * using this.ppTo* should be sufficient and using these directly should be
	 * avoided.
	 *
	 * Loads the `data-parsoid`, `data-mw` and `data-parsoid-diff` attributes
	 * of an element into a fresh node data object and removes those
	 * attributes from the element. Non-element nodes are ignored.
	 *
	 * @param Node $node node
	 * @param array $options options
	 */
	public static function loadDataAttribs( Node $node, array $options ): void {
		if ( !( $node instanceof Element ) ) {
			return;
		}
		// Reset the node data object's stored state, since we're reloading it
		self::setNodeData( $node, new stdClass );
		$dp = self::getJSONAttribute( $node, 'data-parsoid', new stdClass );
		self::massageLoadedDataParsoid( $dp, $options, $node );
		self::setDataParsoid( $node, $dp );
		$node->removeAttribute( 'data-parsoid' );
		$dmw = self::getJSONAttribute( $node, 'data-mw', null );
		self::setDataMw( $node, $dmw );
		$node->removeAttribute( 'data-mw' );
		$dpd = self::getJSONAttribute( $node, 'data-parsoid-diff', null );
		self::setDataParsoidDiff( $node, $dpd );
		$node->removeAttribute( 'data-parsoid-diff' );
	}

	/**
	 * Builds an index of id attributes seen in the DOM
	 *
	 * The walk starts at the body of $node's owner document.
	 *
	 * @param Node $node
	 * @return array map of id attribute value => true
	 */
	public static function usedIdIndex( Node $node ): array {
		$index = [];
		DOMUtils::visitDOM( DOMCompat::getBody( $node->ownerDocument ),
			static function ( Node $n, ?array $options = null ) use ( &$index ) {
				if ( $n instanceof Element && $n->hasAttribute( 'id' ) ) {
					$index[$n->getAttribute( 'id' )] = true;
				}
			},
			[]
		);
		return $index;
	}

	/**
	 * Walk DOM from node downward calling storeDataAttribs
	 *
	 * @param Node $node node
	 * @param array $options options
	 */
	public static function visitAndStoreDataAttribs( Node $node, array $options = [] ): void {
		// PORT-FIXME: storeDataAttribs calls storeInPageBundle which calls getElementById.
		// PHP's `getElementById` implementation is broken, and we work around that by
		// using Zest which uses XPath. So, getElementById call can be O(n) and calling it
		// on every element of the DOM via visitDOM here makes it O(n^2) instead of O(n).
		// So, we work around that by building an index and avoiding getElementById entirely
		// in storeInPageBundle.
		if ( !empty( $options['storeInPageBundle'] ) ) {
			$options['idIndex'] = self::usedIdIndex( $node );
		}
		DOMUtils::visitDOM( $node, [ self::class, 'storeDataAttribs' ], $options );
	}

	/**
	 * Write an element's node data back out and mark it as stored.
	 *
	 * Depending on $options, data-parsoid is discarded, serialized to the
	 * `data-parsoid` attribute, or recorded in the page bundle; data-mw (when
	 * non-empty) is serialized to the `data-mw` attribute or recorded in the
	 * page bundle; and data-parsoid-diff is serialized on request. Finally
	 * the node data object is tagged with `storedId` and the
	 * `data-object-id` attribute is removed. Non-element nodes are ignored.
	 *
	 * Options read here:
	 * - discardDataParsoid: do not serialize data-parsoid (conflicts with keepTmp)
	 * - keepTmp: keep data-parsoid's `tmp` (minus `tmp->tplRanges`) when serializing
	 * - storeInPageBundle: record data in the page bundle rather than attributes
	 * - storeDiffMark: also serialize data-parsoid-diff, if present
	 * - env: Env; read when storing in the page bundle
	 * - idIndex: id index passed through to storeInPageBundle()
	 *
	 * @param Node $node node
	 * @param ?array $options options
	 */
	public static function storeDataAttribs( Node $node, ?array $options = null ): void {
		$options = $options ?? [];
		if ( !( $node instanceof Element ) ) {
			return;
		}
		Assert::invariant( empty( $options['discardDataParsoid'] ) || empty( $options['keepTmp'] ),
			'Conflicting options: discardDataParsoid and keepTmp are both enabled.' );
		$dp = self::getDataParsoid( $node );
		// $dp will be a DataParsoid object once but currently it is an stdClass
		// with a fake type hint. Unfake it to prevent phan complaining about unset().
		'@phan-var stdClass $dp';
		// @phan-suppress-next-line PhanRedundantCondition
		$discardDataParsoid = !empty( $options['discardDataParsoid'] );
		if ( !empty( $dp->tmp->isNew ) ) {
			// Only necessary to support the cite extension's getById,
			// that's already been loaded once.
			//
			// This is basically a hack to ensure that DOMUtils.isNewElt
			// continues to work since we effectively rely on the absence
			// of data-parsoid to identify new elements. But, loadDataAttribs
			// creates an empty {} if one doesn't exist. So, this hack
			// ensures that a loadDataAttribs + storeDataAttribs pair don't
			// dirty the node by introducing an empty data-parsoid attribute
			// where one didn't exist before.
			//
			// Ideally, we'll find a better solution for this edge case later.
			$discardDataParsoid = true;
		}
		$data = null;
		if ( !$discardDataParsoid ) {
			// @phan-suppress-next-line PhanRedundantCondition
			if ( !empty( $options['keepTmp'] ) ) {
				if ( isset( $dp->tmp->tplRanges ) ) {
					unset( $dp->tmp->tplRanges );
				}
			} else {
				unset( $dp->tmp );
			}

			if ( !empty( $options['storeInPageBundle'] ) ) {
				$data = (object)[ 'parsoid' => $dp ];
			} else {
				self::setJSONAttribute( $node, 'data-parsoid', $dp );
			}
		}
		// We need to serialize diffs only under special circumstances.
		// So, do it on demand.
		if ( !empty( $options['storeDiffMark'] ) ) {
			$dpDiff = self::getDataParsoidDiff( $node );
			if ( $dpDiff ) {
				self::setJSONAttribute( $node, 'data-parsoid-diff', $dpDiff );
			}
		}
		// Strip invalid data-mw attributes
		if ( self::validDataMw( $node ) ) {
			if (
				!empty( $options['storeInPageBundle'] ) && isset( $options['env'] ) &&
				// The pagebundle didn't have data-mw before 999.x
				Semver::satisfies( $options['env']->getOutputContentVersion(), '^999.0.0' )
			) {
				$data = $data ?: new stdClass;
				$data->mw = self::getDataMw( $node );
			} else {
				self::setJSONAttribute( $node, 'data-mw', self::getDataMw( $node ) );
			}
		}
		// Store pagebundle
		if ( $data !== null ) {
			self::storeInPageBundle( $node, $options['env'], $data, $options['idIndex'] );
		}

		// Indicate that this node's data has been stored so that if we try
		// to access it after the fact we're aware and remove the attribute
		// since it's no longer needed.
		$nd = self::getNodeData( $node );
		$nd->storedId = $node->getAttribute( self::DATA_OBJECT_ATTR_NAME );
		$node->removeAttribute( self::DATA_OBJECT_ATTR_NAME );
	}
}