<?php
declare( strict_types = 1 );

namespace Wikimedia\Parsoid\Wt2Html;

use Closure;
use DateTime;
use Exception;
use Generator;
use Wikimedia\ObjectFactory;
use Wikimedia\Parsoid\Config\Env;
use Wikimedia\Parsoid\DOM\Document;
use Wikimedia\Parsoid\DOM\Element;
use Wikimedia\Parsoid\DOM\Node;
use Wikimedia\Parsoid\Ext\DOMProcessor as ExtDOMProcessor;
use Wikimedia\Parsoid\Ext\ParsoidExtensionAPI;
use Wikimedia\Parsoid\Tokens\SourceRange;
use Wikimedia\Parsoid\Utils\ContentUtils;
use Wikimedia\Parsoid\Utils\DOMCompat;
use Wikimedia\Parsoid\Utils\DOMDataUtils;
use Wikimedia\Parsoid\Utils\DOMTraverser;
use Wikimedia\Parsoid\Utils\DOMUtils;
use Wikimedia\Parsoid\Utils\PHPUtils;
use Wikimedia\Parsoid\Utils\Utils;
use Wikimedia\Parsoid\Wt2Html\PP\Handlers\CleanUp;
use Wikimedia\Parsoid\Wt2Html\PP\Handlers\DedupeStyles;
use Wikimedia\Parsoid\Wt2Html\PP\Handlers\DisplaySpace;
use Wikimedia\Parsoid\Wt2Html\PP\Handlers\HandleLinkNeighbours;
use Wikimedia\Parsoid\Wt2Html\PP\Handlers\Headings;
use Wikimedia\Parsoid\Wt2Html\PP\Handlers\LiFixups;
use Wikimedia\Parsoid\Wt2Html\PP\Handlers\TableFixups;
use Wikimedia\Parsoid\Wt2Html\PP\Handlers\UnpackDOMFragments;
use Wikimedia\Parsoid\Wt2Html\PP\Processors\AddLinkClasses;
use Wikimedia\Parsoid\Wt2Html\PP\Processors\AddMediaInfo;
use Wikimedia\Parsoid\Wt2Html\PP\Processors\AddRedLinks;
use Wikimedia\Parsoid\Wt2Html\PP\Processors\ComputeDSR;
use Wikimedia\Parsoid\Wt2Html\PP\Processors\ConvertOffsets;
use Wikimedia\Parsoid\Wt2Html\PP\Processors\I18n;
use Wikimedia\Parsoid\Wt2Html\PP\Processors\LangConverter;
use Wikimedia\Parsoid\Wt2Html\PP\Processors\Linter;
use Wikimedia\Parsoid\Wt2Html\PP\Processors\MarkFosteredContent;
use Wikimedia\Parsoid\Wt2Html\PP\Processors\MigrateTemplateMarkerMetas;
use Wikimedia\Parsoid\Wt2Html\PP\Processors\MigrateTrailingNLs;
use Wikimedia\Parsoid\Wt2Html\PP\Processors\Normalize;
use Wikimedia\Parsoid\Wt2Html\PP\Processors\ProcessTreeBuilderFixups;
use Wikimedia\Parsoid\Wt2Html\PP\Processors\PWrap;
use Wikimedia\Parsoid\Wt2Html\PP\Processors\WrapSections;
use Wikimedia\Parsoid\Wt2Html\PP\Processors\WrapTemplates;

/**
 * Perform post-processing steps on an already-built HTML DOM.
 *
 * The stage holds an ordered list of "processors" (see registerProcessors()),
 * runs each of them over the incoming node in doPostProcess(), and, for the
 * top-level document only, finishes by generating the <head> metadata in
 * addMetaData().
 */
class DOMPostProcessor extends PipelineStage {
	/** @var array Construction-time options; also receives 'sourceOffsets' */
	private $options;

	/**
	 * @var array State handed by reference to Headings::dedupeHeadingIds();
	 *   cleared in resetState()
	 */
	private $seenIds;

	/**
	 * @var array Ordered processor descriptors; each entry carries a callable
	 *   under 'proc' (see registerProcessors())
	 */
	private $processors;

	/** @var ParsoidExtensionAPI Provides post-processing support to extensions */
	private $extApi;

	/** @var array Map from mediawiki metadata names to RDFa attribute templates */
	private $metadataMap;

	/**
	 * @var string Textual per-pass timing report built while profiling the
	 *   top-level document; emitted as a comment at the end of <body>
	 */
	private $timeProfile = '';

	/**
	 * @param Env $env
	 * @param array $options
	 * @param string $stageId Not used by this constructor
	 * @param ?PipelineStage $prevStage
	 */
	public function __construct(
		Env $env, array $options = [], string $stageId = "",
		?PipelineStage $prevStage = null
	) {
		parent::__construct( $env, $prevStage );

		$this->options = $options;
		$this->seenIds = [];
		$this->processors = [];
		$this->extApi = new ParsoidExtensionAPI( $env );

		// map from mediawiki metadata names to RDFa property names
		$this->metadataMap = [
			'ns' => [
				'property' => 'mw:pageNamespace',
				'content' => '%d',
			],
			'id' => [
				'property' => 'mw:pageId',
				'content' => '%d',
			],

			// DO NOT ADD rev_user, rev_userid, and rev_comment (See T125266)

			// 'rev_revid' is used to set the overall subject of the document, we don't
			// need to add a specific <meta> or <link> element for it.

			'rev_parentid' => [
				'rel' => 'dc:replaces',
				'resource' => 'mwr:revision/%d',
			],
			'rev_timestamp' => [
				'property' => 'dc:modified',
				'content' => static function ( $m ) {
					# Convert from TS_MW ("mediawiki timestamp") format
					# NOTE(review): createFromFormat() returns false on a
					# malformed timestamp, which would make the next call fail.
					$dt = DateTime::createFromFormat( 'YmdHis', $m['rev_timestamp'] );
					# Note that DateTime::ISO8601 is not actually ISO8601, alas.
					return $dt->format( 'Y-m-d\TH:i:s.000\Z' );
				},
			],
			'rev_sha1' => [
				'property' => 'mw:revisionSHA1',
				'content' => '%s',
			]
		];
	}

	/**
	 * Turn processor descriptors into runnable entries and append them to
	 * the list of processors run by doPostProcess().
	 *
	 * Descriptor keys read here:
	 *  - 'omit': when non-empty, the descriptor is skipped entirely
	 *  - 'name': defaults to the namespace-stripped 'Processor' value
	 *  - 'shortcut': defaults to 'name'
	 *  - 'isTraverser' + 'handlers': build a DOMTraverser from the handlers
	 *  - 'Processor': class name, or (with 'isExtPP') an object factory spec
	 *  - 'isExtPP': the processor is an extension DOM post processor
	 *
	 * The resulting callable is stored under 'proc'.
	 *
	 * @param ?array $processors Descriptors to register; when empty, the
	 *   list from getDefaultProcessors() is used instead
	 */
	public function registerProcessors( ?array $processors ): void {
		if ( empty( $processors ) ) {
			$processors = $this->getDefaultProcessors();
		}

		foreach ( $processors as $p ) {
			if ( !empty( $p['omit'] ) ) {
				continue;
			}
			if ( empty( $p['name'] ) ) {
				$p['name'] = Utils::stripNamespace( $p['Processor'] );
			}
			if ( empty( $p['shortcut'] ) ) {
				$p['shortcut'] = $p['name'];
			}
			if ( !empty( $p['isTraverser'] ) ) {
				$t = new DOMTraverser();
				foreach ( $p['handlers'] as $h ) {
					$t->addHandler( $h['nodeName'], $h['action'] );
				}
				$p['proc'] = function ( ...$args ) use ( $t ) {
					// A trailing null is passed as the last traverse() argument
					// (presumably the initial template info -- confirm against
					// DOMTraverser::traverse()).
					$args[] = null;
					return $t->traverse( $this->env, ...$args );
				};
			} else {
				$classNameOrSpec = $p['Processor'];
				if ( empty( $p['isExtPP'] ) ) {
					// Internal processor w/ ::run() method, class name given
					// @phan-suppress-next-line PhanNonClassMethodCall
					$c = new $classNameOrSpec();
					$p['proc'] = function ( ...$args ) use ( $c ) {
						return $c->run( $this->env, ...$args );
					};
				} else {
					// Extension post processor, object factory spec given
					$c = ObjectFactory::getObjectFromSpec( $classNameOrSpec, [
						'allowClassName' => true,
						'assertClass' => ExtDOMProcessor::class,
					] );
					$p['proc'] = function ( ...$args ) use ( $c ) {
						return $c->wtPostprocess( $this->extApi, ...$args );
					};
				}
			}
			$this->processors[] = $p;
		}
	}

	/**
	 * Build the default, ordered list of processor descriptors: the common
	 * passes, then every extension DOM processor reported by the site
	 * config, then the remaining fixup / cleanup passes.
	 *
	 * @return array
	 */
	public function getDefaultProcessors(): array {
		$env = $this->env;
		$options = $this->options;
		$seenIds = &$this->seenIds;
		$usedIdIndex = [];

		$tableFixer = new TableFixups( $env );

		/* ---------------------------------------------------------------------------
		 * FIXME:
		 * 1. PipelineFactory caches pipelines per env
		 * 2. PipelineFactory.parse uses a default cache key
		 * 3. ParserTests uses a shared/global env object for all tests.
		 * 4. ParserTests also uses PipelineFactory.parse (via env.getContentHandler())
		 *    => the pipeline constructed for the first test that runs wt2html
		 *       is used for all subsequent wt2html tests
		 * 5. If we are selectively turning on/off options on a per-test basis
		 *    in parser tests, those options won't work if those options are
		 *    also used to configure pipeline construction (including which DOM passes
		 *    are enabled).
		 *
		 *    Ex: if (env.wrapSections) { addPP('wrapSections', wrapSections); }
		 *
		 *    This won't do what you expect it to do. This is primarily a
		 *    parser tests script issue -- but given the abstraction layers that
		 *    are on top of the parser pipeline construction, fixing that is
		 *    not straightforward right now. So, this note is a warning to future
		 *    developers to pay attention to how they construct pipelines.
		 * --------------------------------------------------------------------------- */

		$processors = [
			// Common post processing
			[
				'Processor' => MarkFosteredContent::class,
				'shortcut' => 'fostered'
			],
			[
				'Processor' => ProcessTreeBuilderFixups::class,
				'shortcut' => 'process-fixups'
			],
			[
				'Processor' => Normalize::class
			],
			[
				'Processor' => PWrap::class,
				'shortcut' => 'pwrap',
				'skipNested' => true
			],
			// This is run at all levels since, for now, we don't have a generic
			// solution to running top level passes on HTML stashed in data-mw.
			// See T214994 for that.
			//
			// Also, the gallery extension's "packed" mode would otherwise need a
			// post-processing pass to scale media after it has been fetched. That
			// introduces an ordering dependency that may or may not complicate things.
			[
				'Processor' => AddMediaInfo::class,
				'shortcut' => 'media'
			],
			// Run this after 'ProcessTreeBuilderFixups' because the mw:StartTag
			// and mw:EndTag metas would otherwise interfere with the
			// firstChild/lastChild check that this pass does.
			[
				'Processor' => MigrateTemplateMarkerMetas::class,
				'shortcut' => 'migrate-metas'
			],
			[
				'Processor' => MigrateTrailingNLs::class,
				'shortcut' => 'migrate-nls'
			],
			// dsr computation and tpl encap are only relevant for top-level content
			[
				'Processor' => ComputeDSR::class,
				'shortcut' => 'dsr',
				'omit' => !empty( $options['inTemplate'] )
			],
			[
				'Processor' => WrapTemplates::class,
				'shortcut' => 'tplwrap',
				'omit' => !empty( $options['inTemplate'] )
			],
			// 1. Link prefixes and suffixes
			// 2. Unpack DOM fragments
			[
				'name' => 'HandleLinkNeighbours,UnpackDOMFragments',
				'shortcut' => 'dom-unpack',
				'isTraverser' => true,
				'handlers' => [
					[
						'nodeName' => 'a',
						'action' => [ HandleLinkNeighbours::class, 'handler' ]
					],
					[
						'nodeName' => null,
						'action' => [ UnpackDOMFragments::class, 'handler' ]
					]
				]
			]
		];

		/**
		 * FIXME: There are two potential ordering problems here.
		 *
		 * 1. unpackDOMFragment should always run immediately
		 *    before these extensionPostProcessors, which we do currently.
		 *    This ensures packed content get processed correctly by extensions
		 *    before additional transformations are run on the DOM.
		 *
		 * This ordering issue is handled through documentation.
		 *
		 * 2. This has existed all along (in the PHP parser as well as Parsoid
		 *    which is probably how the ref-in-ref hack works - because of how
		 *    parser functions and extension tags are processed, #tag:ref doesn't
		 *    see a nested ref anymore) and this patch only exposes that problem
		 *    more clearly with the unpackOutput property.
		 *
		 * * Consider the set of extensions that
		 *   (a) process wikitext
		 *   (b) provide an extensionPostProcessor
		 *   (c) run the extensionPostProcessor only on the top-level
		 *   As of today, there is exactly one extension (Cite) that has all
		 *   these properties, so the problem below is a speculative problem
		 *   for today. But, this could potentially be a problem in the future.
		 *
		 * * Let us say there are at least two of them, E1 and E2 that
		 *   support extension tags <e1> and <e2> respectively.
		 *
		 * * Let us say in an instance of <e1> on the page, <e2> is present
		 *   and in another instance of <e2> on the page, <e1> is present.
		 *
		 * * In what order should E1's and E2's extensionPostProcessors be
		 *   run on the top-level? Depending on what these handlers do, you
		 *   could get potentially different results. You can see this quite
		 *   starkly with the unpackOutput flag.
		 *
		 * * The ideal solution to this problem is to require that every extension's
		 *   extensionPostProcessor be idempotent which lets us run these
		 *   post processors repeatedly till the DOM stabilizes. But, this
		 *   still doesn't necessarily guarantee that ordering doesn't matter.
		 *   It just guarantees that with the unpackOutput flag set to false
		 *   for multiple extensions, all sealed fragments get fully processed.
		 *   So, we still need to worry about that problem.
		 *
		 *   But, idempotence *could* potentially be a sufficient property in most cases.
		 *   To see this, consider that there is a Footnotes extension which is similar
		 *   to the Cite extension in that they both extract inline content in the
		 *   page source to a separate section of output and leave behind pointers to
		 *   the global section in the output DOM. Given this, the Cite and Footnote
		 *   extension post processors would essentially walk the dom and
		 *   move any existing inline content into that global section till it is
		 *   done. So, even if a <footnote> has a <ref> and a <ref> has a <footnote>,
		 *   we ultimately end up with all footnote content in the footnotes section
		 *   and all ref content in the references section and the DOM stabilizes.
		 *   Ordering is irrelevant here.
		 *
		 *   So, perhaps one way of catching these problems would be in code review
		 *   by analyzing what the DOM postprocessor does and see if it introduces
		 *   potential ordering issues.
		 */
		foreach ( $env->getSiteConfig()->getExtDOMProcessors() as $extName => $domProcs ) {
			foreach ( $domProcs as $i => $domProcSpec ) {
				$processors[] = [
					'isExtPP' => true, // This is an extension DOM post processor
					'name' => "pp:$extName:$i",
					'Processor' => $domProcSpec,
				];
			}
		}

		$processors = array_merge( $processors, [
			[
				'name' => 'MigrateTrailingCategories,TableFixups,DedupeStyles',
				'shortcut' => 'fixups',
				'isTraverser' => true,
				'skipNested' => true,
				'handlers' => [
					// 1. Move trailing categories in <li>s out of the list
					[
						'nodeName' => 'li',
						'action' => [ LiFixups::class, 'migrateTrailingCategories' ]
					],
					[
						'nodeName' => 'dt',
						'action' => [ LiFixups::class, 'migrateTrailingCategories' ]
					],
					[
						'nodeName' => 'dd',
						'action' => [ LiFixups::class, 'migrateTrailingCategories' ]
					],
					// 2. Fix up issues from templated table cells and table cell attributes
					[
						'nodeName' => 'td',
						'action' => function ( $node, $env, $options ) use ( &$tableFixer ) {
							return $tableFixer->stripDoubleTDs( $node, $this->frame );
						}
					],
					[
						'nodeName' => 'td',
						'action' => function ( $node, $env, $options ) use ( &$tableFixer ) {
							return $tableFixer->handleTableCellTemplates( $node, $this->frame );
						}
					],
					[
						'nodeName' => 'th',
						'action' => function ( $node, $env, $options ) use ( &$tableFixer ) {
							return $tableFixer->handleTableCellTemplates( $node, $this->frame );
						}
					],
					// 3. Deduplicate template styles
					// (should run after dom-fragment expansion + after extension post-processors)
					[
						'nodeName' => 'style',
						'action' => [ DedupeStyles::class, 'dedupe' ]
					]
				]
			],
			// Benefits from running after determining which media are redlinks
			[
				'name' => 'Headings-genAnchors',
				'shortcut' => 'heading-ids',
				'isTraverser' => true,
				'skipNested' => true,
				'handlers' => [
					[
						'nodeName' => null,
						'action' => [ Headings::class, 'genAnchors' ]
					],
					[
						'nodeName' => null,
						'action' => static function ( $node, $env ) use ( &$seenIds ) {
							return Headings::dedupeHeadingIds( $seenIds, $node );
						}
					]
				]
			],
			[
				'Processor' => Linter::class,
				'omit' => !$env->getSiteConfig()->linting(),
				'skipNested' => true
			],
			// Strip marker metas -- removes left over marker metas (ex: metas
			// nested in expanded tpl/extension output).
			[
				'name' => 'CleanUp-stripMarkerMetas',
				'shortcut' => 'strip-metas',
				'isTraverser' => true,
				'handlers' => [
					[
						'nodeName' => 'meta',
						'action' => [ CleanUp::class, 'stripMarkerMetas' ]
					]
				]
			],
			// Language conversion and Red link marking are done here
			// *before* we cleanup and save data-parsoid because they
			// are also used in pb2pb/html2html passes, and we want to
			// keep their input/output formats consistent.
			[
				'Processor' => LangConverter::class,
				'shortcut' => 'lang-converter',
				'skipNested' => true
			],
			[
				'Processor' => AddRedLinks::class,
				'shortcut' => 'redlinks',
				'skipNested' => true,
				'omit' => $env->noDataAccess(),
			],
			[
				'name' => 'DisplaySpace',
				'shortcut' => 'displayspace',
				'skipNested' => true,
				'isTraverser' => true,
				'handlers' => [
					[
						'nodeName' => '#text',
						'action' => [ DisplaySpace::class, 'leftHandler' ]
					],
					[
						'nodeName' => '#text',
						'action' => [ DisplaySpace::class, 'rightHandler' ]
					],
				]
			],
			[
				'Processor' => AddLinkClasses::class,
				'shortcut' => 'linkclasses',
				// Note that embedded content doesn't get these classes
				'skipNested' => true
			],
			// Add <section> wrappers around sections
			[
				'Processor' => WrapSections::class,
				'shortcut' => 'sections',
				'skipNested' => true
			],
			[
				'Processor' => ConvertOffsets::class,
				'shortcut' => 'convertoffsets',
				'skipNested' => true,
			],
			[
				'Processor' => I18n::class,
				'shortcut' => 'i18n',
				// FIXME(T214994): This should probably be `true`, since we
				// want this to be another html2html type pass, but then our
				// processor would need to handle nested content. Redlinks,
				// displayspace, and others are ignoring that for now though,
				// so let's wait until there's a more general mechanism.
				'skipNested' => false,
			],
			[
				'name' => 'CleanUp-handleEmptyElts,CleanUp-cleanupAndSaveDataParsoid',
				'shortcut' => 'cleanup',
				'isTraverser' => true,
				'handlers' => [
					// Strip empty elements from template content
					[
						'nodeName' => null,
						'action' => [ CleanUp::class, 'handleEmptyElements' ]
					],
					// Save data.parsoid into data-parsoid html attribute.
					// Make this its own thing so that any changes to the DOM
					// don't affect other handlers that run alongside it.
					[
						'nodeName' => null,
						'action' => static function (
							$node, $env, $options, $atTopLevel, $tplInfo
						) use ( &$usedIdIndex ) {
							if ( $atTopLevel && DOMUtils::isBody( $node ) ) {
								$usedIdIndex = DOMDataUtils::usedIdIndex( $node );
							}
							return CleanUp::cleanupAndSaveDataParsoid(
								$usedIdIndex, $node, $env, $atTopLevel,
								$tplInfo
							);
						}
					]
				]
			],
		] );

		return $processors;
	}

	/**
	 * @inheritDoc
	 */
	public function setSourceOffsets( SourceRange $so ): void {
		$this->options['sourceOffsets'] = $so;
	}

	/**
	 * @inheritDoc
	 */
	public function resetState( array $options ): void {
		parent::resetState( $options );

		// $this->env->getPageConfig()->meta->displayTitle = null;
		$this->seenIds = [];
	}

	/**
	 * Create an element in the document.head with the given attrs.
	 *
	 * @param Document $document
	 * @param string $tagName
	 * @param array $attrs
	 */
	private function appendToHead( Document $document, string $tagName, array $attrs = [] ): void {
		$elt = $document->createElement( $tagName );
		DOMUtils::addAttributes( $elt, $attrs );
		( DOMCompat::getHead( $document ) )->appendChild( $elt );
	}

	/**
	 * While unnecessary for Wikimedia clients, a stylesheet url in the <head>
	 * is useful for clients like Kiwix and others who might not want to process
	 * the meta tags to construct the resourceloader url.
	 *
	 * Given that these clients will be consuming Parsoid HTML outside a MediaWiki skin,
	 * the clients are effectively responsible for their own "skin". But, once again,
	 * as a courtesy, we are hardcoding the vector skin modules for them. But, note
	 * that this may cause page elements to render differently than how they render
	 * on Wikimedia sites with the vector skin since this is probably missing a number
	 * of other modules.
	 *
	 * All that said, note that JS-generated parts of the page will still require them
	 * to have more intimate knowledge of how to process the JS modules. Except for
	 * <graph>s, page content doesn't require JS modules at this point. So, where these
	 * clients want to invest in the necessary logic to construct a better resourceloader
	 * url, they could simply delete / ignore this stylesheet.
	 *
	 * @param Document $document
	 * @param Env $env
	 * @param string $lang
	 * @param array $styleModules
	 */
	private function addCourtesyBasicStyleSheet(
		Document $document, Env $env, string $lang, array $styleModules
	): void {
		$styleModules = array_unique( array_merge( $styleModules, [
			'mediawiki.skinning.content.parsoid',
			// Use the base styles that API output and fallback skin use.
			'mediawiki.skinning.interface',
			// Make sure to include contents of user generated styles
			// e.g. MediaWiki:Common.css / MediaWiki:Mobile.css
			'site.styles'
		] ) );

		$styleURI = $env->getSiteConfig()->getModulesLoadURI() .
			'?lang=' . $lang . '&modules=' .
			PHPUtils::encodeURIComponent( implode( '|', $styleModules ) ) .
			'&only=styles&skin=vector';
		$this->appendToHead( $document, 'link', [ 'rel' => 'stylesheet', 'href' => $styleURI ] );
	}

	/**
	 * Export used style modules via a meta tag (and via a stylesheet for now to aid some clients)
	 * @param Document $document
	 * @param Env $env
	 * @param string $lang
	 */
	private function exportStyleModules( Document $document, Env $env, string $lang ): void {
		// Styles from modules returned from preprocessor / parse requests
		$styleModules = $env->getOutputProperties()['modulestyles'] ?? [];
		if ( $styleModules ) {
			// FIXME: Maybe think about using an associative array or DS\Set
			$styleModules = array_unique( $styleModules );

			// mw:styleModules are CSS modules that are render-blocking.
			$this->appendToHead( $document, 'meta', [
				'property' => 'mw:styleModules',
				'content' => implode( '|', $styleModules )
			] );
		}

		$this->addCourtesyBasicStyleSheet( $document, $env, $lang, $styleModules );
	}

	/**
	 * Export general modules (usually JS scripts) via a meta tag
	 * @param Document $document
	 * @param Env $env
	 */
	private function exportGeneralModules( Document $document, Env $env ): void {
		// General modules returned from preprocessor / parse requests
		$generalModules = $env->getOutputProperties()['modules'] ?? [];
		if ( $generalModules ) {
			// mw:generalModules can be processed via JS (and async) and are usually (but
			// not always) JS scripts.
			$this->appendToHead( $document, 'meta', [
				'property' => 'mw:generalModules',
				'content' => implode( '|', array_unique( $generalModules ) )
			] );
		}
	}

	/**
	 * Export used JS config vars via a meta tag
	 * @param Document $document
	 * @param Env $env
	 */
	private function exportJSConfigVars( Document $document, Env $env ): void {
		$vars = $env->getOutputProperties()['jsconfigvars'] ?? [];
		if ( $vars ) {
			try {
				$content = PHPUtils::jsonEncode( $vars );
			} catch ( Exception $e ) {
				// Similar to ResourceLoader::makeConfigSetScript. See T289358
				$env->log(
					'warn', 'JSON serialization of config data failed. ' .
					'This usually means the config data is not valid UTF-8.'
				);
				return;
			}
			$this->appendToHead( $document, 'meta', [
				'property' => 'mw:jsConfigVars',
				'content' => $content,
			] );
		}
	}

	/**
	 * Set the direction attribute and the standard content classes on <body>.
	 *
	 * @param Element $body
	 * @param Env $env
	 */
	private function updateBodyClasslist( Element $body, Env $env ): void {
		$dir = $env->getPageConfig()->getPageLanguageDir();
		$bodyCL = DOMCompat::getClassList( $body );
		$bodyCL->add( 'mw-content-' . $dir );
		$bodyCL->add( 'sitedir-' . $dir );
		$bodyCL->add( $dir );
		$body->setAttribute( 'dir', $dir );

		// Set 'mw-body-content' directly on the body.
		// This is the designated successor for #bodyContent in core skins.
		$bodyCL->add( 'mw-body-content' );
		// Set 'parsoid-body' to add the desired layout styling from Vector.
		$bodyCL->add( 'parsoid-body' );
		// Also, add the 'mediawiki' class.
		// Some Mediawiki:Common.css seem to target this selector.
		$bodyCL->add( 'mediawiki' );
		// Set 'mw-parser-output' directly on the body.
		// Templates target this class as part of the TemplateStyles RFC
		// FIXME: This isn't expected to be found on the same element as the
		// body class above, since some css targets it as a descendant.
		// In visual diff'ing, we migrate the body contents to a wrapper div
		// with this class to reduce visual differences. Consider getting
		// rid of it.
		$bodyCL->add( 'mw-parser-output' );
	}

	/**
	 * Populate <head> (charset, page / revision metadata, version, title,
	 * base href, modules, content-language / vary) and decorate <body>.
	 *
	 * FIXME: consider moving to DOMUtils or Env.
	 *
	 * @param Env $env
	 * @param Document $document
	 */
	public function addMetaData( Env $env, Document $document ): void {
		// add <head> element if it was missing
		if ( !( DOMCompat::getHead( $document ) instanceof Element ) ) {
			$document->documentElement->insertBefore(
				$document->createElement( 'head' ),
				DOMCompat::getBody( $document )
			);
		}

		// add dc: and mw: RDFa prefixes on the root element
		// (the mwr: prefix is set on <head> below)
		$prefixes = [
			'dc: http://purl.org/dc/terms/',
			'mw: http://mediawiki.org/rdf/'
		];
		$document->documentElement->setAttribute( 'prefix', implode( ' ', $prefixes ) );

		// (From wfParseUrl in core:)
		// Protocol-relative URLs are handled really badly by parse_url().
		// It's so bad that the easiest way to handle them is to just prepend
		// 'https:' and strip the protocol out later.
		$baseURI = $env->getSiteConfig()->baseURI();
		$wasRelative = substr( $baseURI, 0, 2 ) === '//';
		if ( $wasRelative ) {
			$baseURI = "https:$baseURI";
		}
		// add 'https://' to baseURI if it was missing
		$pu = parse_url( $baseURI );
		$mwrPrefix = ( !empty( $pu['scheme'] ) ? '' : 'https://' ) .
			$baseURI . 'Special:Redirect/';

		( DOMCompat::getHead( $document ) )->setAttribute( 'prefix', 'mwr: ' . $mwrPrefix );

		// add <head> content based on page meta data:

		// Set the charset first.
		$this->appendToHead( $document, 'meta', [ 'charset' => 'utf-8' ] );

		// Add page / revision metadata to the <head>
		// PORT-FIXME: We will need to do some refactoring to eliminate
		// this hardcoding. Probably even merge this into metadataMap
		$pageConfig = $env->getPageConfig();
		$revProps = [
			'id' => $pageConfig->getPageId(),
			'ns' => $pageConfig->getNs(),
			'rev_parentid' => $pageConfig->getParentRevisionId(),
			'rev_revid' => $pageConfig->getRevisionId(),
			'rev_sha1' => $pageConfig->getRevisionSha1(),
			'rev_timestamp' => $pageConfig->getRevisionTimestamp()
		];
		foreach ( $revProps as $key => $value ) {
			// generate proper attributes for the <meta> or <link> tag
			if ( $value === null || $value === '' || !isset( $this->metadataMap[$key] ) ) {
				continue;
			}

			$attrs = [];
			$mdm = $this->metadataMap[$key];

			/** FIXME: The JS side has a bunch of other checks here */

			foreach ( $mdm as $k => $v ) {
				// evaluate a function, or perform sprintf-style formatting, or
				// use string directly, depending on value in metadataMap
				if ( $v instanceof Closure ) {
					$v = $v( $revProps );
				} elseif ( strpos( $v, '%' ) !== false ) {
					// @phan-suppress-next-line PhanPluginPrintfVariableFormatString
					$v = sprintf( $v, $value );
				}
				$attrs[$k] = $v;
			}

			// <link> is used if there's a resource or href attribute.
			$this->appendToHead( $document,
				isset( $attrs['resource'] ) || isset( $attrs['href'] ) ? 'link' : 'meta',
				$attrs
			);
		}

		if ( $revProps['rev_revid'] ) {
			$document->documentElement->setAttribute(
				'about', $mwrPrefix . 'revision/' . $revProps['rev_revid']
			);
		}

		// Normalize before comparison
		if (
			str_replace( '_', ' ', $env->getSiteConfig()->mainpage() ) ===
			str_replace( '_', ' ', $env->getPageConfig()->getTitle() )
		) {
			$this->appendToHead( $document, 'meta', [
				'property' => 'isMainPage',
				'content' => 'true' /* HTML attribute values should be strings */
			] );
		}

		// Set the parsoid content-type strings
		// FIXME: Should we be using http-equiv for this?
		$this->appendToHead( $document, 'meta', [
				'property' => 'mw:htmlVersion',
				'content' => $env->getOutputContentVersion()
			]
		);
		// Temporary backward compatibility for clients
		// This could be skipped if we support a version downgrade path
		// with a major version bump.
		$this->appendToHead( $document, 'meta', [
				'property' => 'mw:html:version',
				'content' => $env->getOutputContentVersion()
			]
		);

		// Percent-encode each path component of the title separately so
		// that the '/' separators survive in the resulting href.
		$expTitle = strtr( $env->getPageConfig()->getTitle(), ' ', '_' );
		$expTitle = explode( '/', $expTitle );
		$expTitle = array_map( static function ( $comp ) {
			return PHPUtils::encodeURIComponent( $comp );
		}, $expTitle );

		$this->appendToHead( $document, 'link', [
			'rel' => 'dc:isVersionOf',
			'href' => $env->getSiteConfig()->baseURI() . implode( '/', $expTitle )
		] );

		DOMCompat::setTitle(
			$document,
			// PORT-FIXME: There isn't a place anywhere yet for displayTitle
			/* $env->getPageConfig()->meta->displayTitle || */
			$env->getPageConfig()->getTitle()
		);

		// Add base href pointing to the wiki root
		$this->appendToHead( $document, 'base', [
			'href' => $env->getSiteConfig()->baseURI()
		] );

		// Stick data attributes in the head
		if ( $env->pageBundle ) {
			DOMDataUtils::injectPageBundle( $document, DOMDataUtils::getPageBundle( $document ) );
		}

		// PageConfig guarantees language will always be non-null.
		$lang = $env->getPageConfig()->getPageLanguage();
		$body = DOMCompat::getBody( $document );
		$body->setAttribute( 'lang', Utils::bcp47n( $lang ) );
		$this->updateBodyClasslist( $body, $env );
		$this->exportJSConfigVars( $document, $env );
		$this->exportGeneralModules( $document, $env );
		$this->exportStyleModules( $document, $env, $lang );

		// Indicate whether LanguageConverter is enabled, so that downstream
		// caches can split on variant (if necessary)
		$this->appendToHead( $document, 'meta', [
				'http-equiv' => 'content-language',
				'content' => $env->htmlContentLanguage()
			]
		);
		$this->appendToHead( $document, 'meta', [
				'http-equiv' => 'vary',
				'content' => $env->htmlVary()
			]
		);

		if ( $env->profiling() ) {
			// Append the timing report collected by doPostProcess() as a
			// trailing comment in <body>.
			$body->appendChild( $body->ownerDocument->createTextNode( "\n" ) );
			$body->appendChild( $body->ownerDocument->createComment( $this->timeProfile ) );
			$body->appendChild( $body->ownerDocument->createTextNode( "\n" ) );
		}
	}

	/**
	 * Run every registered processor over $node in registration order
	 * (processors flagged 'skipNested' only run at the top level), with
	 * optional DOM dumps and timing around each pass. For the top-level
	 * document, the <head> metadata is added afterwards.
	 *
	 * @param Node $node
	 */
	public function doPostProcess( Node $node ): void {
		$env = $this->env;

		$hasDumpFlags = $env->hasDumpFlags();

		if ( $hasDumpFlags && $env->hasDumpFlag( 'dom:post-builder' ) ) {
			$opts = [];
			ContentUtils::dumpDOM( $node, 'DOM: after tree builder', $opts );
		}

		$startTime = null;
		$prefix = null;
		$resourceCategory = null;

		$profile = null;
		if ( $env->profiling() ) {
			$profile = $env->getCurrentProfile();
			if ( $this->atTopLevel ) {
				$this->timeProfile = str_repeat( "-", 85 ) . "\n";
				$prefix = 'TOP';
				// Turn off DOM pass timing tracing on non-top-level documents
				$resourceCategory = 'DOMPasses:TOP';
			} else {
				$prefix = '---';
				$resourceCategory = 'DOMPasses:NESTED';
			}
			$startTime = PHPUtils::getStartHRTime();
			$env->log( 'debug/time/dompp', $prefix . '; start=' . $startTime );
		}

		foreach ( $this->processors as $pp ) {
			if ( !empty( $pp['skipNested'] ) && !$this->atTopLevel ) {
				continue;
			}

			$ppName = null;
			$ppStart = null;

			// Trace
			if ( $profile ) {
				// Right-pad the pass name to 30 characters for aligned logs
				$ppName = $pp['name'] . str_repeat(
					" ",
					( strlen( $pp['name'] ) < 30 ) ? 30 - strlen( $pp['name'] ) : 0
				);
				$ppStart = PHPUtils::getStartHRTime();
				$env->log( 'debug/time/dompp', $prefix . '; ' . $ppName . ' start' );
			}

			$opts = null;
			if ( $hasDumpFlags ) {
				$opts = [
					'env' => $env,
					'dumpFragmentMap' => $this->atTopLevel,
					'keepTmp' => true
				];

				if ( $env->hasDumpFlag( 'dom:pre-' . $pp['shortcut'] ) ) {
					ContentUtils::dumpDOM( $node, 'DOM: pre-' . $pp['shortcut'], $opts );
				}
			}

			// Excessive to do it here always, but protects against future changes
			// to how $this->frame may be updated.
			$pp['proc']( $node, [ 'frame' => $this->frame ] + $this->options, $this->atTopLevel );

			if ( $hasDumpFlags && $env->hasDumpFlag( 'dom:post-' . $pp['shortcut'] ) ) {
				ContentUtils::dumpDOM( $node, 'DOM: post-' . $pp['shortcut'], $opts );
			}

			if ( $profile ) {
				$ppElapsed = PHPUtils::getHRTimeDifferential( $ppStart );
				$env->log(
					'debug/time/dompp',
					$prefix . '; ' . $ppName . ' end; time = ' . $ppElapsed
				);
				if ( $this->atTopLevel ) {
					$this->timeProfile .= str_pad( $prefix . '; ' . $ppName, 65 ) .
						' time = ' .
						str_pad( number_format( $ppElapsed, 2 ), 10, ' ', STR_PAD_LEFT ) . "\n";
				}
				$profile->bumpTimeUse( $resourceCategory, $ppElapsed, 'DOM' );
			}
		}

		if ( $profile ) {
			$endTime = PHPUtils::getStartHRTime();
			$env->log(
				'debug/time/dompp',
				$prefix . '; end=' . number_format( $endTime, 2 ) . '; time = ' .
				number_format( PHPUtils::getHRTimeDifferential( $startTime ), 2 )
			);
		}

		// For sub-pipeline documents, we are done.
		// For the top-level document, we generate <head> and add it.
		if ( $this->atTopLevel ) {
			$this->addMetaData( $env, $node->ownerDocument );
			if ( $env->hasDumpFlag( 'wt2html:limits' ) ) {
				/*
				 * PORT-FIXME: Not yet implemented
				$env->printWt2HtmlResourceUsage( [
					'HTML Size' => strlen( DOMCompat::getOuterHTML( $document->documentElement ) )
				] );
				*/
			}
		}
	}

	/**
	 * @inheritDoc
	 */
	public function process( $node, ?array $opts = null ) {
		'@phan-var Node $node'; // @var Node $node
		$this->doPostProcess( $node );
		return $node;
	}

	/**
	 * @inheritDoc
	 */
	public function processChunkily( $input, ?array $options ): Generator {
		if ( $this->prevStage ) {
			// The previous stage will yield a DOM.
			// FIXME: Should we change the signature of that to return a DOM
			// If we do so, a pipeline stage returns either a generator or
			// concrete output (in this case, a DOM).
			$node = $this->prevStage->processChunkily( $input, $options )->current();
		} else {
			$node = $input;
		}
		$this->process( $node );
		yield $node;
	}
}