1<?php
2declare( strict_types = 1 );
3
4namespace Wikimedia\Parsoid\Wt2Html;
5
6use Closure;
7use DateTime;
8use DOMDocument;
9use DOMElement;
10use DOMNode;
11use Generator;
12use Wikimedia\ObjectFactory;
13use Wikimedia\Parsoid\Config\Env;
14use Wikimedia\Parsoid\Ext\DOMProcessor as ExtDOMProcessor;
15use Wikimedia\Parsoid\Ext\ParsoidExtensionAPI;
16use Wikimedia\Parsoid\Tokens\SourceRange;
17use Wikimedia\Parsoid\Utils\ContentUtils;
18use Wikimedia\Parsoid\Utils\DOMCompat;
19use Wikimedia\Parsoid\Utils\DOMDataUtils;
20use Wikimedia\Parsoid\Utils\DOMTraverser;
21use Wikimedia\Parsoid\Utils\DOMUtils;
22use Wikimedia\Parsoid\Utils\PHPUtils;
23use Wikimedia\Parsoid\Utils\Utils;
24use Wikimedia\Parsoid\Wt2Html\PP\Handlers\CleanUp;
25use Wikimedia\Parsoid\Wt2Html\PP\Handlers\DedupeStyles;
26use Wikimedia\Parsoid\Wt2Html\PP\Handlers\DisplaySpace;
27use Wikimedia\Parsoid\Wt2Html\PP\Handlers\HandleLinkNeighbours;
28use Wikimedia\Parsoid\Wt2Html\PP\Handlers\Headings;
29use Wikimedia\Parsoid\Wt2Html\PP\Handlers\LiFixups;
30use Wikimedia\Parsoid\Wt2Html\PP\Handlers\TableFixups;
31use Wikimedia\Parsoid\Wt2Html\PP\Handlers\UnpackDOMFragments;
32use Wikimedia\Parsoid\Wt2Html\PP\Processors\AddExtLinkClasses;
33use Wikimedia\Parsoid\Wt2Html\PP\Processors\AddMediaInfo;
34use Wikimedia\Parsoid\Wt2Html\PP\Processors\AddRedLinks;
35use Wikimedia\Parsoid\Wt2Html\PP\Processors\ComputeDSR;
36use Wikimedia\Parsoid\Wt2Html\PP\Processors\ConvertOffsets;
37use Wikimedia\Parsoid\Wt2Html\PP\Processors\I18n;
38use Wikimedia\Parsoid\Wt2Html\PP\Processors\LangConverter;
39use Wikimedia\Parsoid\Wt2Html\PP\Processors\Linter;
40use Wikimedia\Parsoid\Wt2Html\PP\Processors\MarkFosteredContent;
41use Wikimedia\Parsoid\Wt2Html\PP\Processors\MigrateTemplateMarkerMetas;
42use Wikimedia\Parsoid\Wt2Html\PP\Processors\MigrateTrailingNLs;
43use Wikimedia\Parsoid\Wt2Html\PP\Processors\Normalize;
44use Wikimedia\Parsoid\Wt2Html\PP\Processors\ProcessTreeBuilderFixups;
45use Wikimedia\Parsoid\Wt2Html\PP\Processors\PWrap;
46use Wikimedia\Parsoid\Wt2Html\PP\Processors\WrapSections;
47use Wikimedia\Parsoid\Wt2Html\PP\Processors\WrapTemplates;
48
49/**
50 * Perform post-processing steps on an already-built HTML DOM.
51 */
52class DOMPostProcessor extends PipelineStage {
53	/** @var array */
54	private $options;
55
56	/** @var array */
57	private $seenIds;
58
59	/** @var array */
60	private $processors;
61
62	/** @var ParsoidExtensionAPI Provides post-processing support to extensions */
63	private $extApi;
64
65	/** @var array */
66	private $metadataMap;
67
68	/** @var string */
69	private $timeProfile = '';
70
71	/**
72	 * @param Env $env
73	 * @param array $options
74	 * @param string $stageId
75	 * @param ?PipelineStage $prevStage
76	 */
77	public function __construct(
78		Env $env, array $options = [], string $stageId = "",
79		?PipelineStage $prevStage = null
80	) {
81		parent::__construct( $env, $prevStage );
82
83		$this->options = $options;
84		$this->seenIds = [];
85		$this->processors = [];
86		$this->extApi = new ParsoidExtensionAPI( $env );
87
88		// map from mediawiki metadata names to RDFa property names
89		$this->metadataMap = [
90			'ns' => [
91				'property' => 'mw:pageNamespace',
92				'content' => '%d',
93			],
94			'id' => [
95				'property' => 'mw:pageId',
96				'content' => '%d',
97			],
98
99			// DO NOT ADD rev_user, rev_userid, and rev_comment (See T125266)
100
101			// 'rev_revid' is used to set the overall subject of the document, we don't
102			// need to add a specific <meta> or <link> element for it.
103
104			'rev_parentid' => [
105				'rel' => 'dc:replaces',
106				'resource' => 'mwr:revision/%d',
107			],
108			'rev_timestamp' => [
109				'property' => 'dc:modified',
110				'content' => function ( $m ) {
111					# Convert from TS_MW ("mediawiki timestamp") format
112					$dt = DateTime::createFromFormat( 'YmdHis', $m['rev_timestamp'] );
113					# Note that DateTime::ISO8601 is not actually ISO8601, alas.
114					return $dt->format( 'Y-m-d\TH:i:s.000\Z' );
115				},
116			],
117			'rev_sha1' => [
118				'property' => 'mw:revisionSHA1',
119				'content' => '%s',
120			]
121		];
122	}
123
124	/**
125	 * @param ?array $processors
126	 */
127	public function registerProcessors( ?array $processors ): void {
128		if ( empty( $processors ) ) {
129			$processors = $this->getDefaultProcessors();
130		}
131
132		foreach ( $processors as $p ) {
133			if ( !empty( $p['omit'] ) ) {
134				continue;
135			}
136			if ( empty( $p['name'] ) ) {
137				$p['name'] = Utils::stripNamespace( $p['Processor'] );
138			}
139			if ( empty( $p['shortcut'] ) ) {
140				$p['shortcut'] = $p['name'];
141			}
142			if ( !empty( $p['isTraverser'] ) ) {
143				$t = new DOMTraverser();
144				foreach ( $p['handlers'] as $h ) {
145					$t->addHandler( $h['nodeName'], $h['action'] );
146				}
147				$p['proc'] = function ( ...$args ) use ( $t ) {
148					$args[] = null;
149					return $t->traverse( $this->env, ...$args );
150				};
151			} else {
152				$classNameOrSpec = $p['Processor'];
153				if ( empty( $p['isExtPP'] ) ) {
154					// Internal processor w/ ::run() method, class name given
155					// @phan-suppress-next-line PhanNonClassMethodCall
156					$c = new $classNameOrSpec();
157					$p['proc'] = function ( ...$args ) use ( $c ) {
158						return $c->run( $this->env, ...$args );
159					};
160				} else {
161					// Extension post processor, object factory spec given
162					$c = ObjectFactory::getObjectFromSpec( $classNameOrSpec, [
163						'allowClassName' => true,
164						'assertClass' => ExtDOMProcessor::class,
165					] );
166					$p['proc'] = function ( ...$args ) use ( $c ) {
167						return $c->wtPostprocess( $this->extApi, ...$args );
168					};
169				}
170			}
171			$this->processors[] = $p;
172		}
173	}
174
175	/**
176	 * @return array
177	 */
178	public function getDefaultProcessors(): array {
179		$env = $this->env;
180		$options = $this->options;
181		$seenIds = &$this->seenIds;
182		$usedIdIndex = [];
183
184		$tableFixer = new TableFixups( $env );
185
186		/* ---------------------------------------------------------------------------
187		 * FIXME:
188		 * 1. PipelineFactory caches pipelines per env
189		 * 2. PipelineFactory.parse uses a default cache key
190		 * 3. ParserTests uses a shared/global env object for all tests.
191		 * 4. ParserTests also uses PipelineFactory.parse (via env.getContentHandler())
192		 *    => the pipeline constructed for the first test that runs wt2html
193		 *       is used for all subsequent wt2html tests
194		 * 5. If we are selectively turning on/off options on a per-test basis
195		 *    in parser tests, those options won't work if those options are
196		 *    also used to configure pipeline construction (including which DOM passes
197		 *    are enabled).
198		 *
199		 *    Ex: if (env.wrapSections) { addPP('wrapSections', wrapSections); }
200		 *
201		 *    This won't do what you expect it to do. This is primarily a
202		 *    parser tests script issue -- but given the abstraction layers that
203		 *    are on top of the parser pipeline construction, fixing that is
204		 *    not straightforward right now. So, this note is a warning to future
205		 *    developers to pay attention to how they construct pipelines.
206		 * --------------------------------------------------------------------------- */
207
208		$processors = [
209			// Common post processing
210			[
211				'Processor' => MarkFosteredContent::class,
212				'shortcut' => 'fostered'
213			],
214			[
215				'Processor' => ProcessTreeBuilderFixups::class,
216				'shortcut' => 'process-fixups'
217			],
218			[
219				'Processor' => Normalize::class
220			],
221			[
222				'Processor' => PWrap::class,
223				'shortcut' => 'pwrap',
224				'skipNested' => true
225			],
226			// This is run at all levels since, for now, we don't have a generic
227			// solution to running top level passes on HTML stashed in data-mw.
228			// See T214994 for that.
229			//
230			// Also, the gallery extension's "packed" mode would otherwise need a
231			// post-processing pass to scale media after it has been fetched.  That
232			// introduces an ordering dependency that may or may not complicate things.
233			[
234				'Processor' => AddMediaInfo::class,
235				'shortcut' => 'media'
236			],
237			// Run this after 'ProcessTreeBuilderFixups' because the mw:StartTag
238			// and mw:EndTag metas would otherwise interfere with the
239			// firstChild/lastChild check that this pass does.
240			[
241				'Processor' => MigrateTemplateMarkerMetas::class,
242				'shortcut' => 'migrate-metas'
243			],
244			[
245				'Processor' => MigrateTrailingNLs::class,
246				'shortcut' => 'migrate-nls'
247			],
248			// dsr computation and tpl encap are only relevant for top-level content
249			[
250				'Processor' => ComputeDSR::class,
251				'shortcut' => 'dsr',
252				'omit' => !empty( $options['inTemplate'] )
253			],
254			[
255				'Processor' => WrapTemplates::class,
256				'shortcut' => 'tplwrap',
257				'omit' => !empty( $options['inTemplate'] )
258			],
259			// 1. Link prefixes and suffixes
260			// 2. Unpack DOM fragments
261			[
262				'name' => 'HandleLinkNeighbours,UnpackDOMFragments',
263				'shortcut' => 'dom-unpack',
264				'isTraverser' => true,
265				'handlers' => [
266					[
267						'nodeName' => 'a',
268						'action' => [ HandleLinkNeighbours::class, 'handler' ]
269					],
270					[
271						'nodeName' => null,
272						'action' => [ UnpackDOMFragments::class, 'handler' ]
273					]
274				]
275			]
276		];
277
278		/**
279		 * FIXME: There are two potential ordering problems here.
280		 *
281		 * 1. unpackDOMFragment should always run immediately
282		 *    before these extensionPostProcessors, which we do currently.
283		 *    This ensures packed content get processed correctly by extensions
284		 *    before additional transformations are run on the DOM.
285		 *
286		 * This ordering issue is handled through documentation.
287		 *
288		 * 2. This has existed all along (in the PHP parser as well as Parsoid
289		 *    which is probably how the ref-in-ref hack works - because of how
290		 *    parser functions and extension tags are procesed, #tag:ref doesn't
291		 *    see a nested ref anymore) and this patch only exposes that problem
292		 *    more clearly with the unpackOutput property.
293		 *
294		 * * Consider the set of extensions that
295		 *   (a) process wikitext
296		 *   (b) provide an extensionPostProcessor
297		 *   (c) run the extensionPostProcessor only on the top-level
298		 *   As of today, there is exactly one extension (Cite) that has all
299		 *   these properties, so the problem below is a speculative problem
300		 *   for today. But, this could potentially be a problem in the future.
301		 *
302		 * * Let us say there are at least two of them, E1 and E2 that
303		 *   support extension tags <e1> and <e2> respectively.
304		 *
305		 * * Let us say in an instance of <e1> on the page, <e2> is present
306		 *   and in another instance of <e2> on the page, <e1> is present.
307		 *
308		 * * In what order should E1's and E2's extensionPostProcessors be
309		 *   run on the top-level? Depending on what these handlers do, you
310		 *   could get potentially different results. You can see this quite
311		 *   starkly with the unpackOutput flag.
312		 *
313		 * * The ideal solution to this problem is to require that every extension's
314		 *   extensionPostProcessor be idempotent which lets us run these
315		 *   post processors repeatedly till the DOM stabilizes. But, this
316		 *   still doesn't necessarily guarantee that ordering doesn't matter.
317		 *   It just guarantees that with the unpackOutput flag set to false
318		 *   multiple extensions, all sealed fragments get fully processed.
319		 *   So, we still need to worry about that problem.
320		 *
321		 *   But, idempotence *could* potentially be a sufficient property in most cases.
322		 *   To see this, consider that there is a Footnotes extension which is similar
323		 *   to the Cite extension in that they both extract inline content in the
324		 *   page source to a separate section of output and leave behind pointers to
325		 *   the global section in the output DOM. Given this, the Cite and Footnote
326		 *   extension post processors would essentially walk the dom and
327		 *   move any existing inline content into that global section till it is
328		 *   done. So, even if a <footnote> has a <ref> and a <ref> has a <footnote>,
329		 *   we ultimately end up with all footnote content in the footnotes section
330		 *   and all ref content in the references section and the DOM stabilizes.
331		 *   Ordering is irrelevant here.
332		 *
333		 *   So, perhaps one way of catching these problems would be in code review
334		 *   by analyzing what the DOM postprocessor does and see if it introduces
335		 *   potential ordering issues.
336		 */
337		foreach ( $env->getSiteConfig()->getExtDOMProcessors() as $extName => $domProcs ) {
338			foreach ( $domProcs as $i => $domProcSpec ) {
339				$processors[] = [
340					'isExtPP' => true, // This is an extension DOM post processor
341					'name' => "pp:$extName:$i",
342					'Processor' => $domProcSpec,
343				];
344			}
345		}
346
347		$processors = array_merge( $processors, [
348			[
349				'name' => 'LiFixups,TableFixups,DedupeStyles',
350				'shortcut' => 'fixups',
351				'isTraverser' => true,
352				'skipNested' => true,
353				'handlers' => [
354					// 1. Deal with <li>-hack and move trailing categories in <li>s out of the list
355					[
356						'nodeName' => 'li',
357						'action' => [ LiFixups::class, 'handleLIHack' ],
358					],
359					[
360						'nodeName' => 'li',
361						'action' => [ LiFixups::class, 'migrateTrailingCategories' ]
362					],
363					[
364						'nodeName' => 'dt',
365						'action' => [ LiFixups::class, 'migrateTrailingCategories' ]
366					],
367					[
368						'nodeName' => 'dd',
369						'action' => [ LiFixups::class, 'migrateTrailingCategories' ]
370					],
371					// 2. Fix up issues from templated table cells and table cell attributes
372					[
373						'nodeName' => 'td',
374						'action' => function ( $node, $env, $options ) use ( &$tableFixer ) {
375							return $tableFixer->stripDoubleTDs( $node, $this->frame );
376						}
377					],
378					[
379						'nodeName' => 'td',
380						'action' => function ( $node, $env, $options ) use ( &$tableFixer ) {
381							return $tableFixer->handleTableCellTemplates( $node, $this->frame );
382						}
383					],
384					[
385						'nodeName' => 'th',
386						'action' => function ( $node, $env, $options ) use ( &$tableFixer ) {
387							return $tableFixer->handleTableCellTemplates( $node, $this->frame );
388						}
389					],
390					// 3. Deduplicate template styles
391					// (should run after dom-fragment expansion + after extension post-processors)
392					[
393						'nodeName' => 'style',
394						'action' => [ DedupeStyles::class, 'dedupe' ]
395					]
396				]
397			],
398			// Benefits from running after determining which media are redlinks
399			[
400				'name' => 'Headings-genAnchors',
401				'shortcut' => 'heading-ids',
402				'isTraverser' => true,
403				'skipNested' => true,
404				'handlers' => [
405					[
406						'nodeName' => null,
407						'action' => [ Headings::class, 'genAnchors' ]
408					],
409					[
410						'nodeName' => null,
411						'action' => function ( $node, $env ) use ( &$seenIds ) {
412							return Headings::dedupeHeadingIds( $seenIds, $node );
413						}
414					]
415				]
416			],
417			[
418				'Processor' => Linter::class,
419				'omit' => !$env->getSiteConfig()->linting(),
420				'skipNested' => true
421			],
422			// Strip marker metas -- removes left over marker metas (ex: metas
423			// nested in expanded tpl/extension output).
424			[
425				'name' => 'CleanUp-stripMarkerMetas',
426				'shortcut' => 'strip-metas',
427				'isTraverser' => true,
428				'handlers' => [
429					[
430						'nodeName' => 'meta',
431						'action' => [ CleanUp::class, 'stripMarkerMetas' ]
432					]
433				]
434			],
435			// Language conversion and Red link marking are done here
436			// *before* we cleanup and save data-parsoid because they
437			// are also used in pb2pb/html2html passes, and we want to
438			// keep their input/output formats consistent.
439			[
440				'Processor' => LangConverter::class,
441				'shortcut' => 'lang-converter',
442				'skipNested' => true
443			],
444			[
445				'Processor' => AddRedLinks::class,
446				'shortcut' => 'redlinks',
447				'skipNested' => true,
448				'omit' => $env->noDataAccess(),
449			],
450			[
451				'name' => 'DisplaySpace',
452				'shortcut' => 'displayspace',
453				'skipNested' => true,
454				'isTraverser' => true,
455				'handlers' => [
456					[
457						'nodeName' => '#text',
458						'action' => [ DisplaySpace::class, 'leftHandler' ]
459					],
460					[
461						'nodeName' => '#text',
462						'action' => [ DisplaySpace::class, 'rightHandler' ]
463					],
464				]
465			],
466			[
467				'Processor' => AddExtLinkClasses::class,
468				'shortcut' => 'linkclasses',
469				'skipNested' => true
470			],
471			// Add <section> wrappers around sections
472			[
473				'Processor' => WrapSections::class,
474				'shortcut' => 'sections',
475				'skipNested' => true
476			],
477			[
478				'Processor' => ConvertOffsets::class,
479				'shortcut' => 'convertoffsets',
480				'skipNested' => true,
481			],
482			[
483				'Processor' => I18n::class,
484				'shortcut' => 'i18n',
485				// FIXME(T214994): This should probably be `true`, since we
486				// want this to be another html2html type pass, but then our
487				// processor would need to handle nested content.  Redlinks,
488				// displayspace, and others are ignoring that for now though,
489				// so let's wait until there's a more general mechanism.
490				'skipNested' => false,
491			],
492			[
493				'name' => 'CleanUp-handleEmptyElts,CleanUp-cleanupAndSaveDataParsoid',
494				'shortcut' => 'cleanup',
495				'isTraverser' => true,
496				'handlers' => [
497					// Strip empty elements from template content
498					[
499						'nodeName' => null,
500						'action' => [ CleanUp::class, 'handleEmptyElements' ]
501					],
502					// Save data.parsoid into data-parsoid html attribute.
503					// Make this its own thing so that any changes to the DOM
504					// don't affect other handlers that run alongside it.
505					[
506						'nodeName' => null,
507						'action' => function (
508							$node, $env, $options, $atTopLevel, $tplInfo
509						) use ( &$usedIdIndex ) {
510							if ( $atTopLevel && DOMUtils::isBody( $node ) ) {
511								$usedIdIndex = DOMDataUtils::usedIdIndex( $node );
512							}
513							return CleanUp::cleanupAndSaveDataParsoid(
514								$usedIdIndex, $node, $env, $atTopLevel,
515								$tplInfo
516							);
517						}
518					]
519				]
520			],
521		] );
522
523		return $processors;
524	}
525
526	/**
527	 * @inheritDoc
528	 */
529	public function setSourceOffsets( SourceRange $so ): void {
530		$this->options['sourceOffsets'] = $so;
531	}
532
533	/**
534	 * @inheritDoc
535	 */
536	public function resetState( array $options ): void {
537		parent::resetState( $options );
538
539		// $this->env->getPageConfig()->meta->displayTitle = null;
540		$this->seenIds = [];
541	}
542
543	/**
544	 * Create an element in the document.head with the given attrs.
545	 *
546	 * @param DOMDocument $document
547	 * @param string $tagName
548	 * @param array $attrs
549	 */
550	private function appendToHead( DOMDocument $document, string $tagName, array $attrs = [] ): void {
551		$elt = $document->createElement( $tagName );
552		DOMUtils::addAttributes( $elt, $attrs );
553		( DOMCompat::getHead( $document ) )->appendChild( $elt );
554	}
555
556	/**
557	 * Get the array of style modules to add to <head>
558	 * @param DOMDocument $document
559	 * @param Env $env
560	 * @param string $lang
561	 */
562	private function exportStyleModules( DOMDocument $document, Env $env, string $lang ): void {
563		// Hack: link styles
564		$styleModules = [
565			'mediawiki.skinning.content.parsoid',
566			// Use the base styles that apioutput and fallback skin use.
567			'mediawiki.skinning.interface',
568			// Make sure to include contents of user generated styles
569			// e.g. MediaWiki:Common.css / MediaWiki:Mobile.css
570			'site.styles'
571		];
572
573		// Styles from modules returned from preprocessor / parse requests
574		$outputProps = $env->getOutputProperties();
575		if ( isset( $outputProps['modulestyles'] ) ) {
576			$styleModules = array_merge( $styleModules, $outputProps['modulestyles'] );
577		}
578
579		// FIXME: Maybe think about using an associative array or DS\Set
580		$styleModules = array_unique( $styleModules );
581		$styleURI = $env->getSiteConfig()->getModulesLoadURI() .
582			'?lang=' . $lang . '&modules=' .
583			PHPUtils::encodeURIComponent( implode( '|', $styleModules ) ) .
584			// FIXME: Hardcodes vector skin
585			'&only=styles&skin=vector';
586
587		// FIXME: We should add the list of style modules in a meta tag and
588		// have clients massage that into a a style URI based on skin and
589		// other baseline style modules they need for rendering.
590		$this->appendToHead( $document, 'link', [ 'rel' => 'stylesheet', 'href' => $styleURI ] );
591	}
592
593	/**
594	 * @param DOMElement $body
595	 * @param Env $env
596	 */
597	private function updateBodyClasslist( DOMElement $body, Env $env ): void {
598		$dir = $env->getPageConfig()->getPageLanguageDir();
599		$bodyCL = DOMCompat::getClassList( $body );
600		$bodyCL->add( 'mw-content-' . $dir );
601		$bodyCL->add( 'sitedir-' . $dir );
602		$bodyCL->add( $dir );
603		$body->setAttribute( 'dir', $dir );
604
605		// Set 'mw-body-content' directly on the body.
606		// This is the designated successor for #bodyContent in core skins.
607		$bodyCL->add( 'mw-body-content' );
608		// Set 'parsoid-body' to add the desired layout styling from Vector.
609		$bodyCL->add( 'parsoid-body' );
610		// Also, add the 'mediawiki' class.
611		// Some Mediawiki:Common.css seem to target this selector.
612		$bodyCL->add( 'mediawiki' );
613		// Set 'mw-parser-output' directly on the body.
614		// Templates target this class as part of the TemplateStyles RFC
615		$bodyCL->add( 'mw-parser-output' );
616	}
617
618	/**
619	 * FIXME: consider moving to DOMUtils or Env.
620	 *
621	 * @param Env $env
622	 * @param DOMDocument $document
623	 */
624	public function addMetaData( Env $env, DOMDocument $document ): void {
625		// add <head> element if it was missing
626		if ( !( DOMCompat::getHead( $document ) instanceof DOMElement ) ) {
627			$document->documentElement->insertBefore(
628				$document->createElement( 'head' ),
629				DOMCompat::getBody( $document )
630			);
631		}
632
633		// add mw: and mwr: RDFa prefixes
634		$prefixes = [
635			'dc: http://purl.org/dc/terms/',
636			'mw: http://mediawiki.org/rdf/'
637		];
638		$document->documentElement->setAttribute( 'prefix', implode( ' ', $prefixes ) );
639
640		// (From wfParseUrl in core:)
641		// Protocol-relative URLs are handled really badly by parse_url().
642		// It's so bad that the easiest way to handle them is to just prepend
643		// 'https:' and strip the protocol out later.
644		$baseURI = $env->getSiteConfig()->baseURI();
645		$wasRelative = substr( $baseURI, 0, 2 ) == '//';
646		if ( $wasRelative ) {
647			$baseURI = "https:$baseURI";
648		}
649		// add 'https://' to baseURI if it was missing
650		$pu = parse_url( $baseURI );
651		$mwrPrefix = ( !empty( $pu['scheme'] ) ? '' : 'https://' ) .
652			$baseURI . 'Special:Redirect/';
653
654		( DOMCompat::getHead( $document ) )->setAttribute( 'prefix', 'mwr: ' . $mwrPrefix );
655
656		// add <head> content based on page meta data:
657
658		// Set the charset first.
659		$this->appendToHead( $document, 'meta', [ 'charset' => 'utf-8' ] );
660
661		// Add page / revision metadata to the <head>
662		// PORT-FIXME: We will need to do some refactoring to eliminate
663		// this hardcoding. Probably even merge thi sinto metadataMap
664		$pageConfig = $env->getPageConfig();
665		$revProps = [
666			'id' => $pageConfig->getPageId(),
667			'ns' => $pageConfig->getNs(),
668			'rev_parentid' => $pageConfig->getParentRevisionId(),
669			'rev_revid' => $pageConfig->getRevisionId(),
670			'rev_sha1' => $pageConfig->getRevisionSha1(),
671			'rev_timestamp' => $pageConfig->getRevisionTimestamp()
672		];
673		foreach ( $revProps as $key => $value ) {
674			// generate proper attributes for the <meta> or <link> tag
675			if ( $value === null || $value === '' || !isset( $this->metadataMap[$key] ) ) {
676				continue;
677			}
678
679			$attrs = [];
680			$mdm = $this->metadataMap[$key];
681
682			/** FIXME: The JS side has a bunch of other checks here */
683
684			foreach ( $mdm as $k => $v ) {
685				// evaluate a function, or perform sprintf-style formatting, or
686				// use string directly, depending on value in metadataMap
687				if ( $v instanceof Closure ) {
688					$v = $v( $revProps );
689				} elseif ( strpos( $v, '%' ) !== false ) {
690					// @phan-suppress-next-line PhanPluginPrintfVariableFormatString
691					$v = sprintf( $v, $value );
692				}
693				$attrs[$k] = $v;
694			}
695
696			// <link> is used if there's a resource or href attribute.
697			$this->appendToHead( $document,
698				isset( $attrs['resource'] ) || isset( $attrs['href'] ) ? 'link' : 'meta',
699				$attrs
700			);
701		}
702
703		if ( $revProps['rev_revid'] ) {
704			$document->documentElement->setAttribute(
705				'about', $mwrPrefix . 'revision/' . $revProps['rev_revid']
706			);
707		}
708
709		// Normalize before comparison
710		if (
711			preg_replace( '/_/', ' ', $env->getSiteConfig()->mainpage() ) ===
712			preg_replace( '/_/', ' ', $env->getPageConfig()->getTitle() )
713		) {
714			$this->appendToHead( $document, 'meta', [
715				'property' => 'isMainPage',
716				'content' => 'true' /* HTML attribute values should be strings */
717			] );
718		}
719
720		// Set the parsoid content-type strings
721		// FIXME: Should we be using http-equiv for this?
722		$this->appendToHead( $document, 'meta', [
723				'property' => 'mw:html:version',
724				'content' => $env->getOutputContentVersion()
725			]
726		);
727
728		$expTitle = strtr( $env->getPageConfig()->getTitle(), ' ', '_' );
729		$expTitle = explode( '/', $expTitle );
730		$expTitle = array_map( function ( $comp ) {
731			return PHPUtils::encodeURIComponent( $comp );
732		}, $expTitle );
733
734		$this->appendToHead( $document, 'link', [
735			'rel' => 'dc:isVersionOf',
736			'href' => $env->getSiteConfig()->baseURI() . implode( '/', $expTitle )
737		] );
738
739		DOMCompat::setTitle(
740			$document,
741			// PORT-FIXME: There isn't a place anywhere yet for displayTitle
742			/* $env->getPageConfig()->meta->displayTitle || */
743			$env->getPageConfig()->getTitle()
744		);
745
746		// Add base href pointing to the wiki root
747		$this->appendToHead( $document, 'base', [
748			'href' => $env->getSiteConfig()->baseURI()
749		] );
750
751		// Stick data attributes in the head
752		if ( $env->pageBundle ) {
753			DOMDataUtils::injectPageBundle( $document, DOMDataUtils::getPageBundle( $document ) );
754		}
755
756		// PageConfig guarantees language will always be non-null.
757		$lang = $env->getPageConfig()->getPageLanguage();
758		$body = DOMCompat::getBody( $document );
759		$body->setAttribute( 'lang', Utils::bcp47n( $lang ) );
760		$this->updateBodyClasslist( $body, $env );
761		$this->exportStyleModules( $document, $env, $lang );
762
763		// Indicate whether LanguageConverter is enabled, so that downstream
764		// caches can split on variant (if necessary)
765		$this->appendToHead( $document, 'meta', [
766				'http-equiv' => 'content-language',
767				'content' => $env->htmlContentLanguage()
768			]
769		);
770		$this->appendToHead( $document, 'meta', [
771				'http-equiv' => 'vary',
772				'content' => $env->htmlVary()
773			]
774		);
775
776		if ( $env->profiling() ) {
777			$profile = $env->getCurrentProfile();
778			$body->appendChild( $body->ownerDocument->createTextNode( "\n" ) );
779			$body->appendChild( $body->ownerDocument->createComment( $this->timeProfile ) );
780			$body->appendChild( $body->ownerDocument->createTextNode( "\n" ) );
781		}
782	}
783
784	/**
785	 * @param DOMNode $node
786	 */
787	public function doPostProcess( DOMNode $node ): void {
788		$env = $this->env;
789
790		$hasDumpFlags = $env->hasDumpFlags();
791
792		if ( $hasDumpFlags && $env->hasDumpFlag( 'dom:post-builder' ) ) {
793			$opts = [];
794			ContentUtils::dumpDOM( $node, 'DOM: after tree builder', $opts );
795		}
796
797		$startTime = null;
798		$endTime = null;
799		$prefix = null;
800		$traceLevel = null;
801		$resourceCategory = null;
802
803		$profile = null;
804		if ( $env->profiling() ) {
805			$profile = $env->getCurrentProfile();
806			if ( $this->atTopLevel ) {
807				$this->timeProfile = str_repeat( "-", 85 ) . "\n";
808				$prefix = 'TOP';
809				// Turn off DOM pass timing tracing on non-top-level documents
810				$resourceCategory = 'DOMPasses:TOP';
811			} else {
812				$prefix = '---';
813				$resourceCategory = 'DOMPasses:NESTED';
814			}
815			$startTime = PHPUtils::getStartHRTime();
816			$env->log( 'debug/time/dompp', $prefix . '; start=' . $startTime );
817		}
818
819		for ( $i = 0;  $i < count( $this->processors );  $i++ ) {
820			$pp = $this->processors[$i];
821			if ( !empty( $pp['skipNested'] ) && !$this->atTopLevel ) {
822				continue;
823			}
824
825			$ppName = null;
826			$ppStart = null;
827
828			// Trace
829			if ( $profile ) {
830				$ppName = $pp['name'] . str_repeat(
831					" ",
832					( strlen( $pp['name'] ) < 30 ) ? 30 - strlen( $pp['name'] ) : 0
833				);
834				$ppStart = PHPUtils::getStartHRTime();
835				$env->log( 'debug/time/dompp', $prefix . '; ' . $ppName . ' start' );
836			}
837
838			$opts = null;
839			if ( $hasDumpFlags ) {
840				$opts = [
841					'env' => $env,
842					'dumpFragmentMap' => $this->atTopLevel,
843					'keepTmp' => true
844				];
845
846				if ( $env->hasDumpFlag( 'dom:pre-' . $pp['shortcut'] ) ) {
847					ContentUtils::dumpDOM( $node, 'DOM: pre-' . $pp['shortcut'], $opts );
848				}
849			}
850
851			// Excessive to do it here always, but protects against future changes
852			// to how $this->frame may be updated.
853			$pp['proc']( $node, [ 'frame' => $this->frame ] + $this->options, $this->atTopLevel );
854
855			if ( $hasDumpFlags && $env->hasDumpFlag( 'dom:post-' . $pp['shortcut'] ) ) {
856				ContentUtils::dumpDOM( $node, 'DOM: post-' . $pp['shortcut'], $opts );
857			}
858
859			if ( $profile ) {
860				$ppElapsed = PHPUtils::getHRTimeDifferential( $ppStart );
861				$env->log(
862					'debug/time/dompp',
863					$prefix . '; ' . $ppName . ' end; time = ' . $ppElapsed
864				);
865				if ( $this->atTopLevel ) {
866					$this->timeProfile .= str_pad( $prefix . '; ' . $ppName, 65 ) .
867						' time = ' .
868						str_pad( number_format( $ppElapsed, 2 ), 10, ' ', STR_PAD_LEFT ) . "\n";
869				}
870				$profile->bumpTimeUse( $resourceCategory, $ppElapsed, 'DOM' );
871			}
872		}
873
874		if ( $profile ) {
875			$endTime = PHPUtils::getStartHRTime();
876			$env->log(
877				'debug/time/dompp',
878				$prefix . '; end=' . number_format( $endTime, 2 ) . '; time = ' .
879				number_format( PHPUtils::getHRTimeDifferential( $startTime ), 2 )
880			);
881		}
882
883		// For sub-pipeline documents, we are done.
884		// For the top-level document, we generate <head> and add it.
885		if ( $this->atTopLevel ) {
886			self::addMetaData( $env, $node->ownerDocument );
887			if ( $env->hasDumpFlag( 'wt2html:limits' ) ) {
888				/*
889				 * PORT-FIXME: Not yet implemented
890				$env->printWt2HtmlResourceUsage( [
891					'HTML Size' => strlen( DOMCompat::getOuterHTML( $document->documentElement ) )
892				] );
893				*/
894			}
895		}
896	}
897
898	/**
899	 * @inheritDoc
900	 */
901	public function process( $node, array $opts = null ) {
902		'@phan-var DOMNode $node'; // @var DOMNode $node
903		$this->doPostProcess( $node );
904		return $node;
905	}
906
907	/**
908	 * @inheritDoc
909	 */
910	public function processChunkily( $input, ?array $options ): Generator {
911		if ( $this->prevStage ) {
912			// The previous stage will yield a DOM.
913			// FIXME: Should we change the signature of that to return a DOM
914			// If we do so, a pipeline stage returns either a generator or
915			// concrete output (in this case, a DOM).
916			$node = $this->prevStage->processChunkily( $input, $options )->current();
917		} else {
918			$node = $input;
919		}
920		$this->process( $node );
921		yield $node;
922	}
923}
924