<?php
declare( strict_types = 1 );

namespace Wikimedia\Parsoid\Wt2Html;

use Closure;
use DateTime;
use DOMDocument;
use DOMElement;
use DOMNode;
use Generator;
use Wikimedia\ObjectFactory;
use Wikimedia\Parsoid\Config\Env;
use Wikimedia\Parsoid\Ext\DOMProcessor as ExtDOMProcessor;
use Wikimedia\Parsoid\Ext\ParsoidExtensionAPI;
use Wikimedia\Parsoid\Tokens\SourceRange;
use Wikimedia\Parsoid\Utils\ContentUtils;
use Wikimedia\Parsoid\Utils\DOMCompat;
use Wikimedia\Parsoid\Utils\DOMDataUtils;
use Wikimedia\Parsoid\Utils\DOMTraverser;
use Wikimedia\Parsoid\Utils\DOMUtils;
use Wikimedia\Parsoid\Utils\PHPUtils;
use Wikimedia\Parsoid\Utils\Utils;
use Wikimedia\Parsoid\Wt2Html\PP\Handlers\CleanUp;
use Wikimedia\Parsoid\Wt2Html\PP\Handlers\DedupeStyles;
use Wikimedia\Parsoid\Wt2Html\PP\Handlers\DisplaySpace;
use Wikimedia\Parsoid\Wt2Html\PP\Handlers\HandleLinkNeighbours;
use Wikimedia\Parsoid\Wt2Html\PP\Handlers\Headings;
use Wikimedia\Parsoid\Wt2Html\PP\Handlers\LiFixups;
use Wikimedia\Parsoid\Wt2Html\PP\Handlers\TableFixups;
use Wikimedia\Parsoid\Wt2Html\PP\Handlers\UnpackDOMFragments;
use Wikimedia\Parsoid\Wt2Html\PP\Processors\AddExtLinkClasses;
use Wikimedia\Parsoid\Wt2Html\PP\Processors\AddMediaInfo;
use Wikimedia\Parsoid\Wt2Html\PP\Processors\AddRedLinks;
use Wikimedia\Parsoid\Wt2Html\PP\Processors\ComputeDSR;
use Wikimedia\Parsoid\Wt2Html\PP\Processors\ConvertOffsets;
use Wikimedia\Parsoid\Wt2Html\PP\Processors\I18n;
use Wikimedia\Parsoid\Wt2Html\PP\Processors\LangConverter;
use Wikimedia\Parsoid\Wt2Html\PP\Processors\Linter;
use Wikimedia\Parsoid\Wt2Html\PP\Processors\MarkFosteredContent;
use Wikimedia\Parsoid\Wt2Html\PP\Processors\MigrateTemplateMarkerMetas;
use Wikimedia\Parsoid\Wt2Html\PP\Processors\MigrateTrailingNLs;
use Wikimedia\Parsoid\Wt2Html\PP\Processors\Normalize;
use Wikimedia\Parsoid\Wt2Html\PP\Processors\ProcessTreeBuilderFixups;
use Wikimedia\Parsoid\Wt2Html\PP\Processors\PWrap;
use Wikimedia\Parsoid\Wt2Html\PP\Processors\WrapSections;
use Wikimedia\Parsoid\Wt2Html\PP\Processors\WrapTemplates;

/**
 * Perform post-processing steps on an already-built HTML DOM.
 *
 * The stage holds an ordered list of registered processors (see
 * registerProcessors()) and runs each of them over the DOM handed to it
 * (see doPostProcess()). For the top-level document it additionally
 * populates the <head> with page / revision metadata (see addMetaData()).
 */
class DOMPostProcessor extends PipelineStage {
	/** @var array Options passed to the constructor; forwarded to every processor */
	private $options;

	/** @var array Heading ids seen so far; shared by reference with the heading-id dedupe handler */
	private $seenIds;

	/** @var array Normalized processor specs, in execution order */
	private $processors;

	/** @var ParsoidExtensionAPI Provides post-processing support to extensions */
	private $extApi;

	/** @var array Map from mediawiki metadata names to RDFa attribute templates */
	private $metadataMap;

	/** @var string Per-pass timing report, emitted as an HTML comment when profiling */
	private $timeProfile = '';

	/**
	 * @param Env $env
	 * @param array $options
	 * @param string $stageId Not referenced by this constructor
	 * @param ?PipelineStage $prevStage
	 */
	public function __construct(
		Env $env, array $options = [], string $stageId = "",
		?PipelineStage $prevStage = null
	) {
		parent::__construct( $env, $prevStage );

		$this->options = $options;
		$this->seenIds = [];
		$this->processors = [];
		$this->extApi = new ParsoidExtensionAPI( $env );

		// map from mediawiki metadata names to RDFa property names
		$this->metadataMap = [
			'ns' => [
				'property' => 'mw:pageNamespace',
				'content' => '%d',
			],
			'id' => [
				'property' => 'mw:pageId',
				'content' => '%d',
			],

			// DO NOT ADD rev_user, rev_userid, and rev_comment (See T125266)

			// 'rev_revid' is used to set the overall subject of the document, we don't
			// need to add a specific <meta> or <link> element for it.

			'rev_parentid' => [
				'rel' => 'dc:replaces',
				'resource' => 'mwr:revision/%d',
			],
			'rev_timestamp' => [
				'property' => 'dc:modified',
				// The closure does not touch $this, so it is declared static.
				'content' => static function ( $m ) {
					# Convert from TS_MW ("mediawiki timestamp") format
					# NOTE(review): createFromFormat() returns false for input that
					# does not match 'YmdHis'; this assumes the page config always
					# supplies a well-formed timestamp -- confirm.
					$dt = DateTime::createFromFormat( 'YmdHis', $m['rev_timestamp'] );
					# Note that DateTime::ISO8601 is not actually ISO8601, alas.
					return $dt->format( 'Y-m-d\TH:i:s.000\Z' );
				},
			],
			'rev_sha1' => [
				'property' => 'mw:revisionSHA1',
				'content' => '%s',
			]
		];
	}

	/**
	 * Normalize a list of processor specs and append them to this stage.
	 *
	 * Each spec is an array. Keys read here:
	 *  - 'omit': if non-empty, the spec is skipped entirely
	 *  - 'name': defaults to the namespace-stripped 'Processor' value
	 *  - 'shortcut': defaults to 'name'
	 *  - 'isTraverser' + 'handlers': build a DOMTraverser from the
	 *    [ 'nodeName' => ..., 'action' => ... ] handler entries
	 *  - 'Processor': class name of an internal processor with a run() method
	 *  - 'isExtPP': 'Processor' is instead an ObjectFactory spec for an
	 *    extension DOM processor, invoked through wtPostprocess()
	 *
	 * Every registered spec gains a 'proc' callable that doPostProcess() invokes.
	 *
	 * @param ?array $processors Specs to register; the default set is used
	 *   when this is null or empty
	 */
	public function registerProcessors( ?array $processors ): void {
		if ( empty( $processors ) ) {
			$processors = $this->getDefaultProcessors();
		}

		foreach ( $processors as $p ) {
			if ( !empty( $p['omit'] ) ) {
				continue;
			}
			if ( empty( $p['name'] ) ) {
				$p['name'] = Utils::stripNamespace( $p['Processor'] );
			}
			if ( empty( $p['shortcut'] ) ) {
				$p['shortcut'] = $p['name'];
			}
			if ( !empty( $p['isTraverser'] ) ) {
				$t = new DOMTraverser();
				foreach ( $p['handlers'] as $h ) {
					$t->addHandler( $h['nodeName'], $h['action'] );
				}
				$p['proc'] = function ( ...$args ) use ( $t ) {
					// A trailing null is passed after the caller's arguments.
					// NOTE(review): presumably the traverser's optional last
					// parameter -- confirm against DOMTraverser::traverse().
					$args[] = null;
					return $t->traverse( $this->env, ...$args );
				};
			} else {
				$classNameOrSpec = $p['Processor'];
				if ( empty( $p['isExtPP'] ) ) {
					// Internal processor w/ ::run() method, class name given
					// @phan-suppress-next-line PhanNonClassMethodCall
					$c = new $classNameOrSpec();
					$p['proc'] = function ( ...$args ) use ( $c ) {
						return $c->run( $this->env, ...$args );
					};
				} else {
					// Extension post processor, object factory spec given
					$c = ObjectFactory::getObjectFromSpec( $classNameOrSpec, [
						'allowClassName' => true,
						'assertClass' => ExtDOMProcessor::class,
					] );
					$p['proc'] = function ( ...$args ) use ( $c ) {
						return $c->wtPostprocess( $this->extApi, ...$args );
					};
				}
			}
			$this->processors[] = $p;
		}
	}

	/**
	 * Build the default, ordered list of processor specs.
	 *
	 * The list is: the common passes, then one spec per extension DOM
	 * processor reported by the site config, then the remaining fixup,
	 * heading, linting, cleanup and serialization-related passes.
	 *
	 * @return array
	 */
	public function getDefaultProcessors(): array {
		$env = $this->env;
		$options = $this->options;
		// Shared by reference so that resetState() clearing $this->seenIds is
		// visible to the heading-id dedupe handler below.
		$seenIds = &$this->seenIds;
		$usedIdIndex = [];

		$tableFixer = new TableFixups( $env );

		/* ---------------------------------------------------------------------------
		 * FIXME:
		 * 1. PipelineFactory caches pipelines per env
		 * 2. PipelineFactory.parse uses a default cache key
		 * 3. ParserTests uses a shared/global env object for all tests.
		 * 4. ParserTests also uses PipelineFactory.parse (via env.getContentHandler())
		 *    => the pipeline constructed for the first test that runs wt2html
		 *       is used for all subsequent wt2html tests
		 * 5. If we are selectively turning on/off options on a per-test basis
		 *    in parser tests, those options won't work if those options are
		 *    also used to configure pipeline construction (including which DOM passes
		 *    are enabled).
		 *
		 *    Ex: if (env.wrapSections) { addPP('wrapSections', wrapSections); }
		 *
		 *    This won't do what you expect it to do. This is primarily a
		 *    parser tests script issue -- but given the abstraction layers that
		 *    are on top of the parser pipeline construction, fixing that is
		 *    not straightforward right now. So, this note is a warning to future
		 *    developers to pay attention to how they construct pipelines.
		 * --------------------------------------------------------------------------- */

		$processors = [
			// Common post processing
			[
				'Processor' => MarkFosteredContent::class,
				'shortcut' => 'fostered'
			],
			[
				'Processor' => ProcessTreeBuilderFixups::class,
				'shortcut' => 'process-fixups'
			],
			[
				'Processor' => Normalize::class
			],
			[
				'Processor' => PWrap::class,
				'shortcut' => 'pwrap',
				'skipNested' => true
			],
			// This is run at all levels since, for now, we don't have a generic
			// solution to running top level passes on HTML stashed in data-mw.
			// See T214994 for that.
			//
			// Also, the gallery extension's "packed" mode would otherwise need a
			// post-processing pass to scale media after it has been fetched. That
			// introduces an ordering dependency that may or may not complicate things.
			[
				'Processor' => AddMediaInfo::class,
				'shortcut' => 'media'
			],
			// Run this after 'ProcessTreeBuilderFixups' because the mw:StartTag
			// and mw:EndTag metas would otherwise interfere with the
			// firstChild/lastChild check that this pass does.
			[
				'Processor' => MigrateTemplateMarkerMetas::class,
				'shortcut' => 'migrate-metas'
			],
			[
				'Processor' => MigrateTrailingNLs::class,
				'shortcut' => 'migrate-nls'
			],
			// dsr computation and tpl encap are only relevant for top-level content
			[
				'Processor' => ComputeDSR::class,
				'shortcut' => 'dsr',
				'omit' => !empty( $options['inTemplate'] )
			],
			[
				'Processor' => WrapTemplates::class,
				'shortcut' => 'tplwrap',
				'omit' => !empty( $options['inTemplate'] )
			],
			// 1. Link prefixes and suffixes
			// 2. Unpack DOM fragments
			[
				'name' => 'HandleLinkNeighbours,UnpackDOMFragments',
				'shortcut' => 'dom-unpack',
				'isTraverser' => true,
				'handlers' => [
					[
						'nodeName' => 'a',
						'action' => [ HandleLinkNeighbours::class, 'handler' ]
					],
					[
						'nodeName' => null,
						'action' => [ UnpackDOMFragments::class, 'handler' ]
					]
				]
			]
		];

		/**
		 * FIXME: There are two potential ordering problems here.
		 *
		 * 1. unpackDOMFragment should always run immediately
		 *    before these extensionPostProcessors, which we do currently.
		 *    This ensures packed content get processed correctly by extensions
		 *    before additional transformations are run on the DOM.
		 *
		 * This ordering issue is handled through documentation.
		 *
		 * 2. This has existed all along (in the PHP parser as well as Parsoid
		 *    which is probably how the ref-in-ref hack works - because of how
		 *    parser functions and extension tags are processed, #tag:ref doesn't
		 *    see a nested ref anymore) and this patch only exposes that problem
		 *    more clearly with the unpackOutput property.
		 *
		 * * Consider the set of extensions that
		 *   (a) process wikitext
		 *   (b) provide an extensionPostProcessor
		 *   (c) run the extensionPostProcessor only on the top-level
		 *   As of today, there is exactly one extension (Cite) that has all
		 *   these properties, so the problem below is a speculative problem
		 *   for today. But, this could potentially be a problem in the future.
		 *
		 * * Let us say there are at least two of them, E1 and E2 that
		 *   support extension tags <e1> and <e2> respectively.
		 *
		 * * Let us say in an instance of <e1> on the page, <e2> is present
		 *   and in another instance of <e2> on the page, <e1> is present.
		 *
		 * * In what order should E1's and E2's extensionPostProcessors be
		 *   run on the top-level? Depending on what these handlers do, you
		 *   could get potentially different results. You can see this quite
		 *   starkly with the unpackOutput flag.
		 *
		 * * The ideal solution to this problem is to require that every extension's
		 *   extensionPostProcessor be idempotent which lets us run these
		 *   post processors repeatedly till the DOM stabilizes. But, this
		 *   still doesn't necessarily guarantee that ordering doesn't matter.
		 *   It just guarantees that with the unpackOutput flag set to false
		 *   multiple extensions, all sealed fragments get fully processed.
		 *   So, we still need to worry about that problem.
		 *
		 *   But, idempotence *could* potentially be a sufficient property in most cases.
		 *   To see this, consider that there is a Footnotes extension which is similar
		 *   to the Cite extension in that they both extract inline content in the
		 *   page source to a separate section of output and leave behind pointers to
		 *   the global section in the output DOM. Given this, the Cite and Footnote
		 *   extension post processors would essentially walk the dom and
		 *   move any existing inline content into that global section till it is
		 *   done. So, even if a <footnote> has a <ref> and a <ref> has a <footnote>,
		 *   we ultimately end up with all footnote content in the footnotes section
		 *   and all ref content in the references section and the DOM stabilizes.
		 *   Ordering is irrelevant here.
		 *
		 *   So, perhaps one way of catching these problems would be in code review
		 *   by analyzing what the DOM postprocessor does and see if it introduces
		 *   potential ordering issues.
		 */
		foreach ( $env->getSiteConfig()->getExtDOMProcessors() as $extName => $domProcs ) {
			foreach ( $domProcs as $i => $domProcSpec ) {
				$processors[] = [
					'isExtPP' => true, // This is an extension DOM post processor
					'name' => "pp:$extName:$i",
					'Processor' => $domProcSpec,
				];
			}
		}

		$processors = array_merge( $processors, [
			[
				'name' => 'LiFixups,TableFixups,DedupeStyles',
				'shortcut' => 'fixups',
				'isTraverser' => true,
				'skipNested' => true,
				'handlers' => [
					// 1. Deal with <li>-hack and move trailing categories in <li>s out of the list
					[
						'nodeName' => 'li',
						'action' => [ LiFixups::class, 'handleLIHack' ],
					],
					[
						'nodeName' => 'li',
						'action' => [ LiFixups::class, 'migrateTrailingCategories' ]
					],
					[
						'nodeName' => 'dt',
						'action' => [ LiFixups::class, 'migrateTrailingCategories' ]
					],
					[
						'nodeName' => 'dd',
						'action' => [ LiFixups::class, 'migrateTrailingCategories' ]
					],
					// 2. Fix up issues from templated table cells and table cell attributes
					// ($tableFixer is never reassigned, so it is captured by value.)
					[
						'nodeName' => 'td',
						'action' => function ( $node, $env, $options ) use ( $tableFixer ) {
							return $tableFixer->stripDoubleTDs( $node, $this->frame );
						}
					],
					[
						'nodeName' => 'td',
						'action' => function ( $node, $env, $options ) use ( $tableFixer ) {
							return $tableFixer->handleTableCellTemplates( $node, $this->frame );
						}
					],
					[
						'nodeName' => 'th',
						'action' => function ( $node, $env, $options ) use ( $tableFixer ) {
							return $tableFixer->handleTableCellTemplates( $node, $this->frame );
						}
					],
					// 3. Deduplicate template styles
					// (should run after dom-fragment expansion + after extension post-processors)
					[
						'nodeName' => 'style',
						'action' => [ DedupeStyles::class, 'dedupe' ]
					]
				]
			],
			// Benefits from running after determining which media are redlinks
			[
				'name' => 'Headings-genAnchors',
				'shortcut' => 'heading-ids',
				'isTraverser' => true,
				'skipNested' => true,
				'handlers' => [
					[
						'nodeName' => null,
						'action' => [ Headings::class, 'genAnchors' ]
					],
					[
						'nodeName' => null,
						'action' => function ( $node, $env ) use ( &$seenIds ) {
							return Headings::dedupeHeadingIds( $seenIds, $node );
						}
					]
				]
			],
			[
				'Processor' => Linter::class,
				'omit' => !$env->getSiteConfig()->linting(),
				'skipNested' => true
			],
			// Strip marker metas -- removes left over marker metas (ex: metas
			// nested in expanded tpl/extension output).
			[
				'name' => 'CleanUp-stripMarkerMetas',
				'shortcut' => 'strip-metas',
				'isTraverser' => true,
				'handlers' => [
					[
						'nodeName' => 'meta',
						'action' => [ CleanUp::class, 'stripMarkerMetas' ]
					]
				]
			],
			// Language conversion and Red link marking are done here
			// *before* we cleanup and save data-parsoid because they
			// are also used in pb2pb/html2html passes, and we want to
			// keep their input/output formats consistent.
			[
				'Processor' => LangConverter::class,
				'shortcut' => 'lang-converter',
				'skipNested' => true
			],
			[
				'Processor' => AddRedLinks::class,
				'shortcut' => 'redlinks',
				'skipNested' => true,
				'omit' => $env->noDataAccess(),
			],
			[
				'name' => 'DisplaySpace',
				'shortcut' => 'displayspace',
				'skipNested' => true,
				'isTraverser' => true,
				'handlers' => [
					[
						'nodeName' => '#text',
						'action' => [ DisplaySpace::class, 'leftHandler' ]
					],
					[
						'nodeName' => '#text',
						'action' => [ DisplaySpace::class, 'rightHandler' ]
					],
				]
			],
			[
				'Processor' => AddExtLinkClasses::class,
				'shortcut' => 'linkclasses',
				'skipNested' => true
			],
			// Add <section> wrappers around sections
			[
				'Processor' => WrapSections::class,
				'shortcut' => 'sections',
				'skipNested' => true
			],
			[
				'Processor' => ConvertOffsets::class,
				'shortcut' => 'convertoffsets',
				'skipNested' => true,
			],
			[
				'Processor' => I18n::class,
				'shortcut' => 'i18n',
				// FIXME(T214994): This should probably be `true`, since we
				// want this to be another html2html type pass, but then our
				// processor would need to handle nested content. Redlinks,
				// displayspace, and others are ignoring that for now though,
				// so let's wait until there's a more general mechanism.
				'skipNested' => false,
			],
			[
				'name' => 'CleanUp-handleEmptyElts,CleanUp-cleanupAndSaveDataParsoid',
				'shortcut' => 'cleanup',
				'isTraverser' => true,
				'handlers' => [
					// Strip empty elements from template content
					[
						'nodeName' => null,
						'action' => [ CleanUp::class, 'handleEmptyElements' ]
					],
					// Save data.parsoid into data-parsoid html attribute.
					// Make this its own thing so that any changes to the DOM
					// don't affect other handlers that run alongside it.
					[
						'nodeName' => null,
						'action' => function (
							$node, $env, $options, $atTopLevel, $tplInfo
						) use ( &$usedIdIndex ) {
							// Recompute the id index each time the top-level
							// <body> is visited; it is reused for all other nodes.
							if ( $atTopLevel && DOMUtils::isBody( $node ) ) {
								$usedIdIndex = DOMDataUtils::usedIdIndex( $node );
							}
							return CleanUp::cleanupAndSaveDataParsoid(
								$usedIdIndex, $node, $env, $atTopLevel,
								$tplInfo
							);
						}
					]
				]
			],
		] );

		return $processors;
	}

	/**
	 * @inheritDoc
	 */
	public function setSourceOffsets( SourceRange $so ): void {
		$this->options['sourceOffsets'] = $so;
	}

	/**
	 * @inheritDoc
	 */
	public function resetState( array $options ): void {
		parent::resetState( $options );

		// $this->env->getPageConfig()->meta->displayTitle = null;
		$this->seenIds = [];
	}

	/**
	 * Create an element in the document.head with the given attrs.
	 *
	 * @param DOMDocument $document
	 * @param string $tagName
	 * @param array $attrs
	 */
	private function appendToHead( DOMDocument $document, string $tagName, array $attrs = [] ): void {
		$elt = $document->createElement( $tagName );
		DOMUtils::addAttributes( $elt, $attrs );
		( DOMCompat::getHead( $document ) )->appendChild( $elt );
	}

	/**
	 * Append a stylesheet <link> to <head> that loads the baseline style
	 * modules plus any 'modulestyles' found in the env's output properties.
	 *
	 * @param DOMDocument $document
	 * @param Env $env
	 * @param string $lang Value used for the 'lang' query parameter of the style URI
	 */
	private function exportStyleModules( DOMDocument $document, Env $env, string $lang ): void {
		// Hack: link styles
		$styleModules = [
			'mediawiki.skinning.content.parsoid',
			// Use the base styles that apioutput and fallback skin use.
			'mediawiki.skinning.interface',
			// Make sure to include contents of user generated styles
			// e.g. MediaWiki:Common.css / MediaWiki:Mobile.css
			'site.styles'
		];

		// Styles from modules returned from preprocessor / parse requests
		$outputProps = $env->getOutputProperties();
		if ( isset( $outputProps['modulestyles'] ) ) {
			$styleModules = array_merge( $styleModules, $outputProps['modulestyles'] );
		}

		// FIXME: Maybe think about using an associative array or DS\Set
		$styleModules = array_unique( $styleModules );
		$styleURI = $env->getSiteConfig()->getModulesLoadURI() .
			'?lang=' . $lang . '&modules=' .
			PHPUtils::encodeURIComponent( implode( '|', $styleModules ) ) .
			// FIXME: Hardcodes vector skin
			'&only=styles&skin=vector';

		// FIXME: We should add the list of style modules in a meta tag and
		// have clients massage that into a style URI based on skin and
		// other baseline style modules they need for rendering.
		$this->appendToHead( $document, 'link', [ 'rel' => 'stylesheet', 'href' => $styleURI ] );
	}

	/**
	 * Set the 'dir' attribute and the direction / skin related classes on <body>.
	 *
	 * @param DOMElement $body
	 * @param Env $env
	 */
	private function updateBodyClasslist( DOMElement $body, Env $env ): void {
		$dir = $env->getPageConfig()->getPageLanguageDir();
		$bodyCL = DOMCompat::getClassList( $body );
		$bodyCL->add( 'mw-content-' . $dir );
		$bodyCL->add( 'sitedir-' . $dir );
		$bodyCL->add( $dir );
		$body->setAttribute( 'dir', $dir );

		// Set 'mw-body-content' directly on the body.
		// This is the designated successor for #bodyContent in core skins.
		$bodyCL->add( 'mw-body-content' );
		// Set 'parsoid-body' to add the desired layout styling from Vector.
		$bodyCL->add( 'parsoid-body' );
		// Also, add the 'mediawiki' class.
		// Some Mediawiki:Common.css seem to target this selector.
		$bodyCL->add( 'mediawiki' );
		// Set 'mw-parser-output' directly on the body.
		// Templates target this class as part of the TemplateStyles RFC
		$bodyCL->add( 'mw-parser-output' );
	}

	/**
	 * Populate <head> (creating it if missing) with RDFa prefixes, page and
	 * revision metadata, title, base href, style modules and language
	 * information, and decorate <body> with language / direction attributes.
	 *
	 * FIXME: consider moving to DOMUtils or Env.
	 *
	 * @param Env $env
	 * @param DOMDocument $document
	 */
	public function addMetaData( Env $env, DOMDocument $document ): void {
		// add <head> element if it was missing
		if ( !( DOMCompat::getHead( $document ) instanceof DOMElement ) ) {
			$document->documentElement->insertBefore(
				$document->createElement( 'head' ),
				DOMCompat::getBody( $document )
			);
		}

		// add mw: and mwr: RDFa prefixes
		$prefixes = [
			'dc: http://purl.org/dc/terms/',
			'mw: http://mediawiki.org/rdf/'
		];
		$document->documentElement->setAttribute( 'prefix', implode( ' ', $prefixes ) );

		// (From wfParseUrl in core:)
		// Protocol-relative URLs are handled really badly by parse_url().
		// It's so bad that the easiest way to handle them is to just prepend
		// 'https:' and strip the protocol out later.
		$baseURI = $env->getSiteConfig()->baseURI();
		$wasRelative = substr( $baseURI, 0, 2 ) === '//';
		if ( $wasRelative ) {
			$baseURI = "https:$baseURI";
		}
		// add 'https://' to baseURI if it was missing
		$pu = parse_url( $baseURI );
		$mwrPrefix = ( !empty( $pu['scheme'] ) ? '' : 'https://' ) .
			$baseURI . 'Special:Redirect/';

		( DOMCompat::getHead( $document ) )->setAttribute( 'prefix', 'mwr: ' . $mwrPrefix );

		// add <head> content based on page meta data:

		// Set the charset first.
		$this->appendToHead( $document, 'meta', [ 'charset' => 'utf-8' ] );

		// Add page / revision metadata to the <head>
		// PORT-FIXME: We will need to do some refactoring to eliminate
		// this hardcoding. Probably even merge this into metadataMap
		$pageConfig = $env->getPageConfig();
		$revProps = [
			'id' => $pageConfig->getPageId(),
			'ns' => $pageConfig->getNs(),
			'rev_parentid' => $pageConfig->getParentRevisionId(),
			'rev_revid' => $pageConfig->getRevisionId(),
			'rev_sha1' => $pageConfig->getRevisionSha1(),
			'rev_timestamp' => $pageConfig->getRevisionTimestamp()
		];
		foreach ( $revProps as $key => $value ) {
			// generate proper attributes for the <meta> or <link> tag
			if ( $value === null || $value === '' || !isset( $this->metadataMap[$key] ) ) {
				continue;
			}

			$attrs = [];
			$mdm = $this->metadataMap[$key];

			/** FIXME: The JS side has a bunch of other checks here */

			foreach ( $mdm as $k => $v ) {
				// evaluate a function, or perform sprintf-style formatting, or
				// use string directly, depending on value in metadataMap
				if ( $v instanceof Closure ) {
					$v = $v( $revProps );
				} elseif ( strpos( $v, '%' ) !== false ) {
					// @phan-suppress-next-line PhanPluginPrintfVariableFormatString
					$v = sprintf( $v, $value );
				}
				$attrs[$k] = $v;
			}

			// <link> is used if there's a resource or href attribute.
			$this->appendToHead( $document,
				isset( $attrs['resource'] ) || isset( $attrs['href'] ) ? 'link' : 'meta',
				$attrs
			);
		}

		if ( $revProps['rev_revid'] ) {
			$document->documentElement->setAttribute(
				'about', $mwrPrefix . 'revision/' . $revProps['rev_revid']
			);
		}

		// Normalize before comparison
		if (
			strtr( $env->getSiteConfig()->mainpage(), '_', ' ' ) ===
			strtr( $env->getPageConfig()->getTitle(), '_', ' ' )
		) {
			$this->appendToHead( $document, 'meta', [
				'property' => 'isMainPage',
				'content' => 'true' /* HTML attribute values should be strings */
			] );
		}

		// Set the parsoid content-type strings
		// FIXME: Should we be using http-equiv for this?
		$this->appendToHead( $document, 'meta', [
				'property' => 'mw:html:version',
				'content' => $env->getOutputContentVersion()
			]
		);

		// URI-encode each '/'-separated component of the underscore form of the title
		$expTitle = strtr( $env->getPageConfig()->getTitle(), ' ', '_' );
		$expTitle = explode( '/', $expTitle );
		$expTitle = array_map( function ( $comp ) {
			return PHPUtils::encodeURIComponent( $comp );
		}, $expTitle );

		$this->appendToHead( $document, 'link', [
			'rel' => 'dc:isVersionOf',
			'href' => $env->getSiteConfig()->baseURI() . implode( '/', $expTitle )
		] );

		DOMCompat::setTitle(
			$document,
			// PORT-FIXME: There isn't a place anywhere yet for displayTitle
			/* $env->getPageConfig()->meta->displayTitle || */
			$env->getPageConfig()->getTitle()
		);

		// Add base href pointing to the wiki root
		$this->appendToHead( $document, 'base', [
			'href' => $env->getSiteConfig()->baseURI()
		] );

		// Stick data attributes in the head
		if ( $env->pageBundle ) {
			DOMDataUtils::injectPageBundle( $document, DOMDataUtils::getPageBundle( $document ) );
		}

		// PageConfig guarantees language will always be non-null.
		$lang = $env->getPageConfig()->getPageLanguage();
		$body = DOMCompat::getBody( $document );
		$body->setAttribute( 'lang', Utils::bcp47n( $lang ) );
		$this->updateBodyClasslist( $body, $env );
		$this->exportStyleModules( $document, $env, $lang );

		// Indicate whether LanguageConverter is enabled, so that downstream
		// caches can split on variant (if necessary)
		$this->appendToHead( $document, 'meta', [
				'http-equiv' => 'content-language',
				'content' => $env->htmlContentLanguage()
			]
		);
		$this->appendToHead( $document, 'meta', [
				'http-equiv' => 'vary',
				'content' => $env->htmlVary()
			]
		);

		if ( $env->profiling() ) {
			// Append the per-pass timing report collected by doPostProcess()
			// as a trailing HTML comment in <body>.
			$body->appendChild( $body->ownerDocument->createTextNode( "\n" ) );
			$body->appendChild( $body->ownerDocument->createComment( $this->timeProfile ) );
			$body->appendChild( $body->ownerDocument->createTextNode( "\n" ) );
		}
	}

	/**
	 * Run every registered processor over the given node, in registration
	 * order, then add <head> metadata if this is the top-level document.
	 *
	 * Processors flagged 'skipNested' are not run on nested (non-top-level)
	 * documents. DOM dumps and timing traces are emitted when the
	 * corresponding env dump flags / profiling are enabled.
	 *
	 * @param DOMNode $node
	 */
	public function doPostProcess( DOMNode $node ): void {
		$env = $this->env;

		$hasDumpFlags = $env->hasDumpFlags();

		if ( $hasDumpFlags && $env->hasDumpFlag( 'dom:post-builder' ) ) {
			$opts = [];
			ContentUtils::dumpDOM( $node, 'DOM: after tree builder', $opts );
		}

		$startTime = null;
		$endTime = null;
		$prefix = null;
		$resourceCategory = null;

		$profile = null;
		if ( $env->profiling() ) {
			$profile = $env->getCurrentProfile();
			if ( $this->atTopLevel ) {
				$this->timeProfile = str_repeat( "-", 85 ) . "\n";
				$prefix = 'TOP';
				// Turn off DOM pass timing tracing on non-top-level documents
				$resourceCategory = 'DOMPasses:TOP';
			} else {
				$prefix = '---';
				$resourceCategory = 'DOMPasses:NESTED';
			}
			$startTime = PHPUtils::getStartHRTime();
			$env->log( 'debug/time/dompp', $prefix . '; start=' . $startTime );
		}

		foreach ( $this->processors as $pp ) {
			if ( !empty( $pp['skipNested'] ) && !$this->atTopLevel ) {
				continue;
			}

			$ppName = null;
			$ppStart = null;

			// Trace
			if ( $profile ) {
				// Right-pad short names to 30 columns so the report lines up
				$ppName = str_pad( $pp['name'], 30 );
				$ppStart = PHPUtils::getStartHRTime();
				$env->log( 'debug/time/dompp', $prefix . '; ' . $ppName . ' start' );
			}

			$opts = null;
			if ( $hasDumpFlags ) {
				$opts = [
					'env' => $env,
					'dumpFragmentMap' => $this->atTopLevel,
					'keepTmp' => true
				];

				if ( $env->hasDumpFlag( 'dom:pre-' . $pp['shortcut'] ) ) {
					ContentUtils::dumpDOM( $node, 'DOM: pre-' . $pp['shortcut'], $opts );
				}
			}

			// Excessive to do it here always, but protects against future changes
			// to how $this->frame may be updated.
			$pp['proc']( $node, [ 'frame' => $this->frame ] + $this->options, $this->atTopLevel );

			if ( $hasDumpFlags && $env->hasDumpFlag( 'dom:post-' . $pp['shortcut'] ) ) {
				ContentUtils::dumpDOM( $node, 'DOM: post-' . $pp['shortcut'], $opts );
			}

			if ( $profile ) {
				$ppElapsed = PHPUtils::getHRTimeDifferential( $ppStart );
				$env->log(
					'debug/time/dompp',
					$prefix . '; ' . $ppName . ' end; time = ' . $ppElapsed
				);
				if ( $this->atTopLevel ) {
					$this->timeProfile .= str_pad( $prefix . '; ' . $ppName, 65 ) .
						' time = ' .
						str_pad( number_format( $ppElapsed, 2 ), 10, ' ', STR_PAD_LEFT ) . "\n";
				}
				$profile->bumpTimeUse( $resourceCategory, $ppElapsed, 'DOM' );
			}
		}

		if ( $profile ) {
			$endTime = PHPUtils::getStartHRTime();
			$env->log(
				'debug/time/dompp',
				$prefix . '; end=' . number_format( $endTime, 2 ) . '; time = ' .
				number_format( PHPUtils::getHRTimeDifferential( $startTime ), 2 )
			);
		}

		// For sub-pipeline documents, we are done.
		// For the top-level document, we generate <head> and add it.
		if ( $this->atTopLevel ) {
			// addMetaData() is an instance method (it reads $this->metadataMap
			// and $this->timeProfile), so call it on $this rather than via self::
			$this->addMetaData( $env, $node->ownerDocument );
			if ( $env->hasDumpFlag( 'wt2html:limits' ) ) {
				/*
				 * PORT-FIXME: Not yet implemented
				$env->printWt2HtmlResourceUsage( [
					'HTML Size' => strlen( DOMCompat::getOuterHTML( $document->documentElement ) )
				] );
				*/
			}
		}
	}

	/**
	 * @inheritDoc
	 */
	public function process( $node, ?array $opts = null ) {
		'@phan-var DOMNode $node'; // @var DOMNode $node
		$this->doPostProcess( $node );
		return $node;
	}

	/**
	 * @inheritDoc
	 */
	public function processChunkily( $input, ?array $options ): Generator {
		if ( $this->prevStage ) {
			// The previous stage will yield a DOM.
			// FIXME: Should we change the signature of that to return a DOM
			// If we do so, a pipeline stage returns either a generator or
			// concrete output (in this case, a DOM).
			$node = $this->prevStage->processChunkily( $input, $options )->current();
		} else {
			$node = $input;
		}
		$this->process( $node );
		yield $node;
	}
}