1<?php
2declare( strict_types = 1 );
3
4namespace Wikimedia\Parsoid\Wt2Html;
5
6use Closure;
7use DateTime;
8use Exception;
9use Generator;
10use Wikimedia\ObjectFactory;
11use Wikimedia\Parsoid\Config\Env;
12use Wikimedia\Parsoid\DOM\Document;
13use Wikimedia\Parsoid\DOM\Element;
14use Wikimedia\Parsoid\DOM\Node;
15use Wikimedia\Parsoid\Ext\DOMProcessor as ExtDOMProcessor;
16use Wikimedia\Parsoid\Ext\ParsoidExtensionAPI;
17use Wikimedia\Parsoid\Tokens\SourceRange;
18use Wikimedia\Parsoid\Utils\ContentUtils;
19use Wikimedia\Parsoid\Utils\DOMCompat;
20use Wikimedia\Parsoid\Utils\DOMDataUtils;
21use Wikimedia\Parsoid\Utils\DOMTraverser;
22use Wikimedia\Parsoid\Utils\DOMUtils;
23use Wikimedia\Parsoid\Utils\PHPUtils;
24use Wikimedia\Parsoid\Utils\Utils;
25use Wikimedia\Parsoid\Wt2Html\PP\Handlers\CleanUp;
26use Wikimedia\Parsoid\Wt2Html\PP\Handlers\DedupeStyles;
27use Wikimedia\Parsoid\Wt2Html\PP\Handlers\DisplaySpace;
28use Wikimedia\Parsoid\Wt2Html\PP\Handlers\HandleLinkNeighbours;
29use Wikimedia\Parsoid\Wt2Html\PP\Handlers\Headings;
30use Wikimedia\Parsoid\Wt2Html\PP\Handlers\LiFixups;
31use Wikimedia\Parsoid\Wt2Html\PP\Handlers\TableFixups;
32use Wikimedia\Parsoid\Wt2Html\PP\Handlers\UnpackDOMFragments;
33use Wikimedia\Parsoid\Wt2Html\PP\Processors\AddLinkClasses;
34use Wikimedia\Parsoid\Wt2Html\PP\Processors\AddMediaInfo;
35use Wikimedia\Parsoid\Wt2Html\PP\Processors\AddRedLinks;
36use Wikimedia\Parsoid\Wt2Html\PP\Processors\ComputeDSR;
37use Wikimedia\Parsoid\Wt2Html\PP\Processors\ConvertOffsets;
38use Wikimedia\Parsoid\Wt2Html\PP\Processors\I18n;
39use Wikimedia\Parsoid\Wt2Html\PP\Processors\LangConverter;
40use Wikimedia\Parsoid\Wt2Html\PP\Processors\Linter;
41use Wikimedia\Parsoid\Wt2Html\PP\Processors\MarkFosteredContent;
42use Wikimedia\Parsoid\Wt2Html\PP\Processors\MigrateTemplateMarkerMetas;
43use Wikimedia\Parsoid\Wt2Html\PP\Processors\MigrateTrailingNLs;
44use Wikimedia\Parsoid\Wt2Html\PP\Processors\Normalize;
45use Wikimedia\Parsoid\Wt2Html\PP\Processors\ProcessTreeBuilderFixups;
46use Wikimedia\Parsoid\Wt2Html\PP\Processors\PWrap;
47use Wikimedia\Parsoid\Wt2Html\PP\Processors\WrapSections;
48use Wikimedia\Parsoid\Wt2Html\PP\Processors\WrapTemplates;
49
50/**
51 * Perform post-processing steps on an already-built HTML DOM.
52 */
53class DOMPostProcessor extends PipelineStage {
	/**
	 * Options passed to the constructor. 'inTemplate' is consulted when the
	 * default processor list is built, and setSourceOffsets() stores its
	 * argument here under 'sourceOffsets'.
	 * @var array
	 */
	private $options;

	/**
	 * State handed (by reference) to the heading-id deduplication handler.
	 * Emptied in the constructor and again in resetState().
	 * @var array
	 */
	private $seenIds;

	/**
	 * Processor descriptors appended by registerProcessors(), each of them
	 * carrying a callable 'proc' entry.
	 * @var array
	 */
	private $processors;

	/** @var ParsoidExtensionAPI Provides post-processing support to extensions */
	private $extApi;

	/**
	 * Map from mediawiki metadata names to the attributes of the <head>
	 * element that addMetaData() generates for them.
	 * @var array
	 */
	private $metadataMap;

	/**
	 * Text that addMetaData() appends to <body> as a comment when profiling
	 * is enabled.
	 * @var string
	 */
	private $timeProfile = '';
71
72	/**
73	 * @param Env $env
74	 * @param array $options
75	 * @param string $stageId
76	 * @param ?PipelineStage $prevStage
77	 */
78	public function __construct(
79		Env $env, array $options = [], string $stageId = "",
80		?PipelineStage $prevStage = null
81	) {
82		parent::__construct( $env, $prevStage );
83
84		$this->options = $options;
85		$this->seenIds = [];
86		$this->processors = [];
87		$this->extApi = new ParsoidExtensionAPI( $env );
88
89		// map from mediawiki metadata names to RDFa property names
90		$this->metadataMap = [
91			'ns' => [
92				'property' => 'mw:pageNamespace',
93				'content' => '%d',
94			],
95			'id' => [
96				'property' => 'mw:pageId',
97				'content' => '%d',
98			],
99
100			// DO NOT ADD rev_user, rev_userid, and rev_comment (See T125266)
101
102			// 'rev_revid' is used to set the overall subject of the document, we don't
103			// need to add a specific <meta> or <link> element for it.
104
105			'rev_parentid' => [
106				'rel' => 'dc:replaces',
107				'resource' => 'mwr:revision/%d',
108			],
109			'rev_timestamp' => [
110				'property' => 'dc:modified',
111				'content' => static function ( $m ) {
112					# Convert from TS_MW ("mediawiki timestamp") format
113					$dt = DateTime::createFromFormat( 'YmdHis', $m['rev_timestamp'] );
114					# Note that DateTime::ISO8601 is not actually ISO8601, alas.
115					return $dt->format( 'Y-m-d\TH:i:s.000\Z' );
116				},
117			],
118			'rev_sha1' => [
119				'property' => 'mw:revisionSHA1',
120				'content' => '%s',
121			]
122		];
123	}
124
125	/**
126	 * @param ?array $processors
127	 */
128	public function registerProcessors( ?array $processors ): void {
129		if ( empty( $processors ) ) {
130			$processors = $this->getDefaultProcessors();
131		}
132
133		foreach ( $processors as $p ) {
134			if ( !empty( $p['omit'] ) ) {
135				continue;
136			}
137			if ( empty( $p['name'] ) ) {
138				$p['name'] = Utils::stripNamespace( $p['Processor'] );
139			}
140			if ( empty( $p['shortcut'] ) ) {
141				$p['shortcut'] = $p['name'];
142			}
143			if ( !empty( $p['isTraverser'] ) ) {
144				$t = new DOMTraverser();
145				foreach ( $p['handlers'] as $h ) {
146					$t->addHandler( $h['nodeName'], $h['action'] );
147				}
148				$p['proc'] = function ( ...$args ) use ( $t ) {
149					$args[] = null;
150					return $t->traverse( $this->env, ...$args );
151				};
152			} else {
153				$classNameOrSpec = $p['Processor'];
154				if ( empty( $p['isExtPP'] ) ) {
155					// Internal processor w/ ::run() method, class name given
156					// @phan-suppress-next-line PhanNonClassMethodCall
157					$c = new $classNameOrSpec();
158					$p['proc'] = function ( ...$args ) use ( $c ) {
159						return $c->run( $this->env, ...$args );
160					};
161				} else {
162					// Extension post processor, object factory spec given
163					$c = ObjectFactory::getObjectFromSpec( $classNameOrSpec, [
164						'allowClassName' => true,
165						'assertClass' => ExtDOMProcessor::class,
166					] );
167					$p['proc'] = function ( ...$args ) use ( $c ) {
168						return $c->wtPostprocess( $this->extApi, ...$args );
169					};
170				}
171			}
172			$this->processors[] = $p;
173		}
174	}
175
	/**
	 * Build the default, ordered list of DOM post-processing passes.
	 *
	 * Every entry is a descriptor array understood by registerProcessors():
	 *  - 'Processor': class name (or, with 'isExtPP', an object factory spec)
	 *  - 'name' / 'shortcut': labels; derived from 'Processor' when absent
	 *  - 'omit': when truthy, the pass is not registered at all
	 *  - 'skipNested': when truthy, the pass only runs at the top level
	 *  - 'isTraverser' + 'handlers': a list of [ 'nodeName', 'action' ]
	 *    handlers that are run together in one DOM traversal
	 *  - 'isExtPP': marks an extension-provided DOM post processor
	 *
	 * The position of an entry in the list is the position at which it runs;
	 * see the ordering notes inline before moving anything around.
	 *
	 * @return array
	 */
	public function getDefaultProcessors(): array {
		$env = $this->env;
		$options = $this->options;
		// Bound by reference so that the heading-id handler below shares
		// this stage's list instead of working on a copy.
		$seenIds = &$this->seenIds;
		// Captured by reference by the cleanup handler below, which fills it
		// in once it reaches the top-level <body>.
		$usedIdIndex = [];

		$tableFixer = new TableFixups( $env );

		/* ---------------------------------------------------------------------------
		 * FIXME:
		 * 1. PipelineFactory caches pipelines per env
		 * 2. PipelineFactory.parse uses a default cache key
		 * 3. ParserTests uses a shared/global env object for all tests.
		 * 4. ParserTests also uses PipelineFactory.parse (via env.getContentHandler())
		 *    => the pipeline constructed for the first test that runs wt2html
		 *       is used for all subsequent wt2html tests
		 * 5. If we are selectively turning on/off options on a per-test basis
		 *    in parser tests, those options won't work if those options are
		 *    also used to configure pipeline construction (including which DOM passes
		 *    are enabled).
		 *
		 *    Ex: if (env.wrapSections) { addPP('wrapSections', wrapSections); }
		 *
		 *    This won't do what you expect it to do. This is primarily a
		 *    parser tests script issue -- but given the abstraction layers that
		 *    are on top of the parser pipeline construction, fixing that is
		 *    not straightforward right now. So, this note is a warning to future
		 *    developers to pay attention to how they construct pipelines.
		 * --------------------------------------------------------------------------- */

		$processors = [
			// Common post processing
			[
				'Processor' => MarkFosteredContent::class,
				'shortcut' => 'fostered'
			],
			[
				'Processor' => ProcessTreeBuilderFixups::class,
				'shortcut' => 'process-fixups'
			],
			[
				'Processor' => Normalize::class
			],
			[
				'Processor' => PWrap::class,
				'shortcut' => 'pwrap',
				'skipNested' => true
			],
			// This is run at all levels since, for now, we don't have a generic
			// solution to running top level passes on HTML stashed in data-mw.
			// See T214994 for that.
			//
			// Also, the gallery extension's "packed" mode would otherwise need a
			// post-processing pass to scale media after it has been fetched.  That
			// introduces an ordering dependency that may or may not complicate things.
			[
				'Processor' => AddMediaInfo::class,
				'shortcut' => 'media'
			],
			// Run this after 'ProcessTreeBuilderFixups' because the mw:StartTag
			// and mw:EndTag metas would otherwise interfere with the
			// firstChild/lastChild check that this pass does.
			[
				'Processor' => MigrateTemplateMarkerMetas::class,
				'shortcut' => 'migrate-metas'
			],
			[
				'Processor' => MigrateTrailingNLs::class,
				'shortcut' => 'migrate-nls'
			],
			// dsr computation and tpl encap are only relevant for top-level content
			[
				'Processor' => ComputeDSR::class,
				'shortcut' => 'dsr',
				'omit' => !empty( $options['inTemplate'] )
			],
			[
				'Processor' => WrapTemplates::class,
				'shortcut' => 'tplwrap',
				'omit' => !empty( $options['inTemplate'] )
			],
			// 1. Link prefixes and suffixes
			// 2. Unpack DOM fragments
			[
				'name' => 'HandleLinkNeighbours,UnpackDOMFragments',
				'shortcut' => 'dom-unpack',
				'isTraverser' => true,
				'handlers' => [
					[
						'nodeName' => 'a',
						'action' => [ HandleLinkNeighbours::class, 'handler' ]
					],
					[
						'nodeName' => null,
						'action' => [ UnpackDOMFragments::class, 'handler' ]
					]
				]
			]
		];

		/**
		 * FIXME: There are two potential ordering problems here.
		 *
		 * 1. unpackDOMFragment should always run immediately
		 *    before these extensionPostProcessors, which we do currently.
		 *    This ensures packed content get processed correctly by extensions
		 *    before additional transformations are run on the DOM.
		 *
		 * This ordering issue is handled through documentation.
		 *
		 * 2. This has existed all along (in the PHP parser as well as Parsoid
		 *    which is probably how the ref-in-ref hack works - because of how
		 *    parser functions and extension tags are processed, #tag:ref doesn't
		 *    see a nested ref anymore) and this patch only exposes that problem
		 *    more clearly with the unpackOutput property.
		 *
		 * * Consider the set of extensions that
		 *   (a) process wikitext
		 *   (b) provide an extensionPostProcessor
		 *   (c) run the extensionPostProcessor only on the top-level
		 *   As of today, there is exactly one extension (Cite) that has all
		 *   these properties, so the problem below is a speculative problem
		 *   for today. But, this could potentially be a problem in the future.
		 *
		 * * Let us say there are at least two of them, E1 and E2 that
		 *   support extension tags <e1> and <e2> respectively.
		 *
		 * * Let us say in an instance of <e1> on the page, <e2> is present
		 *   and in another instance of <e2> on the page, <e1> is present.
		 *
		 * * In what order should E1's and E2's extensionPostProcessors be
		 *   run on the top-level? Depending on what these handlers do, you
		 *   could get potentially different results. You can see this quite
		 *   starkly with the unpackOutput flag.
		 *
		 * * The ideal solution to this problem is to require that every extension's
		 *   extensionPostProcessor be idempotent which lets us run these
		 *   post processors repeatedly till the DOM stabilizes. But, this
		 *   still doesn't necessarily guarantee that ordering doesn't matter.
		 *   It just guarantees that with the unpackOutput flag set to false
		 *   multiple extensions, all sealed fragments get fully processed.
		 *   So, we still need to worry about that problem.
		 *
		 *   But, idempotence *could* potentially be a sufficient property in most cases.
		 *   To see this, consider that there is a Footnotes extension which is similar
		 *   to the Cite extension in that they both extract inline content in the
		 *   page source to a separate section of output and leave behind pointers to
		 *   the global section in the output DOM. Given this, the Cite and Footnote
		 *   extension post processors would essentially walk the dom and
		 *   move any existing inline content into that global section till it is
		 *   done. So, even if a <footnote> has a <ref> and a <ref> has a <footnote>,
		 *   we ultimately end up with all footnote content in the footnotes section
		 *   and all ref content in the references section and the DOM stabilizes.
		 *   Ordering is irrelevant here.
		 *
		 *   So, perhaps one way of catching these problems would be in code review
		 *   by analyzing what the DOM postprocessor does and see if it introduces
		 *   potential ordering issues.
		 */
		foreach ( $env->getSiteConfig()->getExtDOMProcessors() as $extName => $domProcs ) {
			foreach ( $domProcs as $i => $domProcSpec ) {
				$processors[] = [
					'isExtPP' => true, // This is an extension DOM post processor
					'name' => "pp:$extName:$i",
					'Processor' => $domProcSpec,
				];
			}
		}

		$processors = array_merge( $processors, [
			[
				'name' => 'MigrateTrailingCategories,TableFixups,DedupeStyles',
				'shortcut' => 'fixups',
				'isTraverser' => true,
				'skipNested' => true,
				'handlers' => [
					// 1. Move trailing categories in <li>s out of the list
					[
						'nodeName' => 'li',
						'action' => [ LiFixups::class, 'migrateTrailingCategories' ]
					],
					[
						'nodeName' => 'dt',
						'action' => [ LiFixups::class, 'migrateTrailingCategories' ]
					],
					[
						'nodeName' => 'dd',
						'action' => [ LiFixups::class, 'migrateTrailingCategories' ]
					],
					// 2. Fix up issues from templated table cells and table cell attributes
					// (these closures ignore the $env and $options they are
					// handed and use this stage's frame instead)
					[
						'nodeName' => 'td',
						'action' => function ( $node, $env, $options ) use ( &$tableFixer ) {
							return $tableFixer->stripDoubleTDs( $node, $this->frame );
						}
					],
					[
						'nodeName' => 'td',
						'action' => function ( $node, $env, $options ) use ( &$tableFixer ) {
							return $tableFixer->handleTableCellTemplates( $node, $this->frame );
						}
					],
					[
						'nodeName' => 'th',
						'action' => function ( $node, $env, $options ) use ( &$tableFixer ) {
							return $tableFixer->handleTableCellTemplates( $node, $this->frame );
						}
					],
					// 3. Deduplicate template styles
					// (should run after dom-fragment expansion + after extension post-processors)
					[
						'nodeName' => 'style',
						'action' => [ DedupeStyles::class, 'dedupe' ]
					]
				]
			],
			// Benefits from running after determining which media are redlinks
			[
				'name' => 'Headings-genAnchors',
				'shortcut' => 'heading-ids',
				'isTraverser' => true,
				'skipNested' => true,
				'handlers' => [
					[
						'nodeName' => null,
						'action' => [ Headings::class, 'genAnchors' ]
					],
					[
						'nodeName' => null,
						'action' => static function ( $node, $env ) use ( &$seenIds ) {
							return Headings::dedupeHeadingIds( $seenIds, $node );
						}
					]
				]
			],
			[
				'Processor' => Linter::class,
				'omit' => !$env->getSiteConfig()->linting(),
				'skipNested' => true
			],
			// Strip marker metas -- removes left over marker metas (ex: metas
			// nested in expanded tpl/extension output).
			[
				'name' => 'CleanUp-stripMarkerMetas',
				'shortcut' => 'strip-metas',
				'isTraverser' => true,
				'handlers' => [
					[
						'nodeName' => 'meta',
						'action' => [ CleanUp::class, 'stripMarkerMetas' ]
					]
				]
			],
			// Language conversion and Red link marking are done here
			// *before* we cleanup and save data-parsoid because they
			// are also used in pb2pb/html2html passes, and we want to
			// keep their input/output formats consistent.
			[
				'Processor' => LangConverter::class,
				'shortcut' => 'lang-converter',
				'skipNested' => true
			],
			[
				'Processor' => AddRedLinks::class,
				'shortcut' => 'redlinks',
				'skipNested' => true,
				'omit' => $env->noDataAccess(),
			],
			[
				'name' => 'DisplaySpace',
				'shortcut' => 'displayspace',
				'skipNested' => true,
				'isTraverser' => true,
				'handlers' => [
					[
						'nodeName' => '#text',
						'action' => [ DisplaySpace::class, 'leftHandler' ]
					],
					[
						'nodeName' => '#text',
						'action' => [ DisplaySpace::class, 'rightHandler' ]
					],
				]
			],
			[
				'Processor' => AddLinkClasses::class,
				'shortcut' => 'linkclasses',
				// Note that embedded content doesn't get these classes
				'skipNested' => true
			],
			// Add <section> wrappers around sections
			[
				'Processor' => WrapSections::class,
				'shortcut' => 'sections',
				'skipNested' => true
			],
			[
				'Processor' => ConvertOffsets::class,
				'shortcut' => 'convertoffsets',
				'skipNested' => true,
			],
			[
				'Processor' => I18n::class,
				'shortcut' => 'i18n',
				// FIXME(T214994): This should probably be `true`, since we
				// want this to be another html2html type pass, but then our
				// processor would need to handle nested content.  Redlinks,
				// displayspace, and others are ignoring that for now though,
				// so let's wait until there's a more general mechanism.
				'skipNested' => false,
			],
			[
				'name' => 'CleanUp-handleEmptyElts,CleanUp-cleanupAndSaveDataParsoid',
				'shortcut' => 'cleanup',
				'isTraverser' => true,
				'handlers' => [
					// Strip empty elements from template content
					[
						'nodeName' => null,
						'action' => [ CleanUp::class, 'handleEmptyElements' ]
					],
					// Save data.parsoid into data-parsoid html attribute.
					// Make this its own thing so that any changes to the DOM
					// don't affect other handlers that run alongside it.
					[
						'nodeName' => null,
						'action' => static function (
							$node, $env, $options, $atTopLevel, $tplInfo
						) use ( &$usedIdIndex ) {
							// (Re)compute the index of used ids whenever the
							// top-level <body> is visited.
							if ( $atTopLevel && DOMUtils::isBody( $node ) ) {
								$usedIdIndex = DOMDataUtils::usedIdIndex( $node );
							}
							return CleanUp::cleanupAndSaveDataParsoid(
								$usedIdIndex, $node, $env, $atTopLevel,
								$tplInfo
							);
						}
					]
				]
			],
		] );

		return $processors;
	}
523
	/**
	 * Store the given source range in this stage's options under the
	 * 'sourceOffsets' key.
	 *
	 * @inheritDoc
	 */
	public function setSourceOffsets( SourceRange $so ): void {
		$this->options['sourceOffsets'] = $so;
	}
530
	/**
	 * Reset state between runs: defers to the parent stage, then forgets the
	 * ids collected for heading-id deduplication.
	 *
	 * @inheritDoc
	 */
	public function resetState( array $options ): void {
		parent::resetState( $options );

		// $this->env->getPageConfig()->meta->displayTitle = null;
		// Start the next document with an empty set of seen heading ids.
		$this->seenIds = [];
	}
540
541	/**
542	 * Create an element in the document.head with the given attrs.
543	 *
544	 * @param Document $document
545	 * @param string $tagName
546	 * @param array $attrs
547	 */
548	private function appendToHead( Document $document, string $tagName, array $attrs = [] ): void {
549		$elt = $document->createElement( $tagName );
550		DOMUtils::addAttributes( $elt, $attrs );
551		( DOMCompat::getHead( $document ) )->appendChild( $elt );
552	}
553
554	/**
555	 * While unnecessary for Wikimedia clients, a stylesheet url in the <head>
556	 * is useful for clients like Kiwix and others who might not want to process
557	 * the meta tags to construct the resourceloader url.
558	 *
559	 * Given that these clients will be consuming Parsoid HTML outside a MediaWiki skin,
560	 * the clients are effectively responsible for their own "skin". But, once again,
561	 * as a courtesy, we are hardcoding the vector skin modules for them. But, note
562	 * that this may cause page elements to render differently than how they render
563	 * on Wikimedia sites with the vector skin since this is probably missing a number
564	 * of other modules.
565	 *
566	 * All that said, note that JS-generated parts of the page will still require them
567	 * to have more intimate knowledge of how  to process the JS modules. Except for
568	 * <graph>s, page content doesn't require JS modules at this point. So, where these
569	 * clients want to invest in the necessary logic to construct a better resourceloader
570	 * url, they could simply delete / ignore this stylesheet.
571	 *
572	 * @param Document $document
573	 * @param Env $env
574	 * @param string $lang
575	 * @param array $styleModules
576	 */
577	private function addCourtesyBasicStyleSheet(
578		Document $document, Env $env, string $lang, array $styleModules
579	): void {
580		$styleModules = array_unique( array_merge( $styleModules, [
581			'mediawiki.skinning.content.parsoid',
582			// Use the base styles that API output and fallback skin use.
583			'mediawiki.skinning.interface',
584			// Make sure to include contents of user generated styles
585			// e.g. MediaWiki:Common.css / MediaWiki:Mobile.css
586			'site.styles'
587		] ) );
588
589		$styleURI = $env->getSiteConfig()->getModulesLoadURI() .
590			'?lang=' . $lang . '&modules=' .
591			PHPUtils::encodeURIComponent( implode( '|', $styleModules ) ) .
592			'&only=styles&skin=vector';
593		$this->appendToHead( $document, 'link', [ 'rel' => 'stylesheet', 'href' => $styleURI ] );
594	}
595
596	/**
597	 * Export used style modules via a meta tag (and via a stylesheet for now to aid some clients)
598	 * @param Document $document
599	 * @param Env $env
600	 * @param string $lang
601	 */
602	private function exportStyleModules( Document $document, Env $env, string $lang ): void {
603		// Styles from modules returned from preprocessor / parse requests
604		$styleModules = $env->getOutputProperties()['modulestyles'] ?? [];
605		if ( $styleModules ) {
606			// FIXME: Maybe think about using an associative array or DS\Set
607			$styleModules = array_unique( $styleModules );
608
609			// mw:styleModules are CSS modules that are render-blocking.
610			$this->appendToHead( $document, 'meta', [
611				'property' => 'mw:styleModules',
612				'content' => implode( '|', $styleModules )
613			] );
614		}
615
616		$this->addCourtesyBasicStyleSheet( $document, $env, $lang, $styleModules );
617	}
618
619	/**
620	 * Export general modules (usually JS scripts) via a meta tag
621	 * @param Document $document
622	 * @param Env $env
623	 */
624	private function exportGeneralModules( Document $document, Env $env ): void {
625		// Styles from modules returned from preprocessor / parse requests
626		$generalModules = $env->getOutputProperties()['modules'] ?? [];
627		if ( $generalModules ) {
628			// mw:generalModules can be processed via JS (and async) and are usually (but
629			// not always) JS scripts.
630			$this->appendToHead( $document, 'meta', [
631				'property' => 'mw:generalModules',
632				'content' => implode( '|', array_unique( $generalModules ) )
633			] );
634		}
635	}
636
637	/**
638	 * Export used JS config vars via a meta tag
639	 * @param Document $document
640	 * @param Env $env
641	 */
642	private function exportJSConfigVars( Document $document, Env $env ): void {
643		$vars = $env->getOutputProperties()['jsconfigvars'] ?? [];
644		if ( $vars ) {
645			try {
646				$content = PHPUtils::jsonEncode( $vars );
647			} catch ( Exception $e ) {
648				// Similar to ResourceLoader::makeConfigSetScript.  See T289358
649				$env->log(
650					'warn', 'JSON serialization of config data failed. ' .
651						'This usually means the config data is not valid UTF-8.'
652				);
653				return;
654			}
655			$this->appendToHead( $document, 'meta', [
656				'property' => 'mw:jsConfigVars',
657				'content' => $content,
658			] );
659		}
660	}
661
662	/**
663	 * @param Element $body
664	 * @param Env $env
665	 */
666	private function updateBodyClasslist( Element $body, Env $env ): void {
667		$dir = $env->getPageConfig()->getPageLanguageDir();
668		$bodyCL = DOMCompat::getClassList( $body );
669		$bodyCL->add( 'mw-content-' . $dir );
670		$bodyCL->add( 'sitedir-' . $dir );
671		$bodyCL->add( $dir );
672		$body->setAttribute( 'dir', $dir );
673
674		// Set 'mw-body-content' directly on the body.
675		// This is the designated successor for #bodyContent in core skins.
676		$bodyCL->add( 'mw-body-content' );
677		// Set 'parsoid-body' to add the desired layout styling from Vector.
678		$bodyCL->add( 'parsoid-body' );
679		// Also, add the 'mediawiki' class.
680		// Some Mediawiki:Common.css seem to target this selector.
681		$bodyCL->add( 'mediawiki' );
682		// Set 'mw-parser-output' directly on the body.
683		// Templates target this class as part of the TemplateStyles RFC
684		// FIXME: This isn't expected to be found on the same element as the
685		// body class above, since some css targets it as a descendant.
686		// In visual diff'ing, we migrate the body contents to a wrapper div
687		// with this class to reduce visual differences.  Consider getting
688		// rid of it.
689		$bodyCL->add( 'mw-parser-output' );
690	}
691
	/**
	 * Populate the document with page-level metadata: RDFa prefixes, the
	 * <head> elements describing the page and revision, the document title,
	 * <body> language / direction / classes, module and config exports, and
	 * (when profiling) a trailing comment with the time profile.
	 *
	 * Elements are appended to <head> in the order of the statements below.
	 *
	 * FIXME: consider moving to DOMUtils or Env.
	 *
	 * @param Env $env
	 * @param Document $document
	 */
	public function addMetaData( Env $env, Document $document ): void {
		// add <head> element if it was missing
		if ( !( DOMCompat::getHead( $document ) instanceof Element ) ) {
			$document->documentElement->insertBefore(
				$document->createElement( 'head' ),
				DOMCompat::getBody( $document )
			);
		}

		// add dc: and mw: RDFa prefixes to the root element
		// (the mwr: prefix is set on the <head> further down)
		$prefixes = [
			'dc: http://purl.org/dc/terms/',
			'mw: http://mediawiki.org/rdf/'
		];
		$document->documentElement->setAttribute( 'prefix', implode( ' ', $prefixes ) );

		// (From wfParseUrl in core:)
		// Protocol-relative URLs are handled really badly by parse_url().
		// It's so bad that the easiest way to handle them is to just prepend
		// 'https:' and strip the protocol out later.
		//
		// NOTE: nothing in this method strips the protocol again; $wasRelative
		// is only used to decide whether 'https:' gets prepended.
		$baseURI = $env->getSiteConfig()->baseURI();
		$wasRelative = substr( $baseURI, 0, 2 ) == '//';
		if ( $wasRelative ) {
			$baseURI = "https:$baseURI";
		}
		// add 'https://' to baseURI if it was missing
		$pu = parse_url( $baseURI );
		$mwrPrefix = ( !empty( $pu['scheme'] ) ? '' : 'https://' ) .
			$baseURI . 'Special:Redirect/';

		( DOMCompat::getHead( $document ) )->setAttribute( 'prefix', 'mwr: ' . $mwrPrefix );

		// add <head> content based on page meta data:

		// Set the charset first.
		$this->appendToHead( $document, 'meta', [ 'charset' => 'utf-8' ] );

		// Add page / revision metadata to the <head>
		// PORT-FIXME: We will need to do some refactoring to eliminate
		// this hardcoding. Probably even merge this into metadataMap
		$pageConfig = $env->getPageConfig();
		$revProps = [
			'id' => $pageConfig->getPageId(),
			'ns' => $pageConfig->getNs(),
			'rev_parentid' => $pageConfig->getParentRevisionId(),
			'rev_revid' => $pageConfig->getRevisionId(),
			'rev_sha1' => $pageConfig->getRevisionSha1(),
			'rev_timestamp' => $pageConfig->getRevisionTimestamp()
		];
		foreach ( $revProps as $key => $value ) {
			// generate proper attributes for the <meta> or <link> tag
			// (properties that are unset, or that have no metadataMap entry
			// such as 'rev_revid', produce no element)
			if ( $value === null || $value === '' || !isset( $this->metadataMap[$key] ) ) {
				continue;
			}

			$attrs = [];
			$mdm = $this->metadataMap[$key];

			/** FIXME: The JS side has a bunch of other checks here */

			foreach ( $mdm as $k => $v ) {
				// evaluate a function, or perform sprintf-style formatting, or
				// use string directly, depending on value in metadataMap
				if ( $v instanceof Closure ) {
					$v = $v( $revProps );
				} elseif ( strpos( $v, '%' ) !== false ) {
					// @phan-suppress-next-line PhanPluginPrintfVariableFormatString
					$v = sprintf( $v, $value );
				}
				$attrs[$k] = $v;
			}

			// <link> is used if there's a resource or href attribute.
			$this->appendToHead( $document,
				isset( $attrs['resource'] ) || isset( $attrs['href'] ) ? 'link' : 'meta',
				$attrs
			);
		}

		// The revision id becomes the subject ('about') of the whole document
		if ( $revProps['rev_revid'] ) {
			$document->documentElement->setAttribute(
				'about', $mwrPrefix . 'revision/' . $revProps['rev_revid']
			);
		}

		// Normalize before comparison
		if (
			str_replace( '_', ' ', $env->getSiteConfig()->mainpage() ) ===
			str_replace( '_', ' ', $env->getPageConfig()->getTitle() )
		) {
			$this->appendToHead( $document, 'meta', [
				'property' => 'isMainPage',
				'content' => 'true' /* HTML attribute values should be strings */
			] );
		}

		// Set the parsoid content-type strings
		// FIXME: Should we be using http-equiv for this?
		$this->appendToHead( $document, 'meta', [
				'property' => 'mw:htmlVersion',
				'content' => $env->getOutputContentVersion()
			]
		);
		// Temporary backward compatibility for clients
		// This could be skipped if we support a version downgrade path
		// with a major version bump.
		$this->appendToHead( $document, 'meta', [
				'property' => 'mw:html:version',
				'content' => $env->getOutputContentVersion()
			]
		);

		// Build the page url: spaces become underscores and every
		// '/'-separated component of the title is encoded on its own, so the
		// slashes themselves stay unencoded.
		$expTitle = strtr( $env->getPageConfig()->getTitle(), ' ', '_' );
		$expTitle = explode( '/', $expTitle );
		$expTitle = array_map( static function ( $comp ) {
			return PHPUtils::encodeURIComponent( $comp );
		}, $expTitle );

		$this->appendToHead( $document, 'link', [
			'rel' => 'dc:isVersionOf',
			'href' => $env->getSiteConfig()->baseURI() . implode( '/', $expTitle )
		] );

		DOMCompat::setTitle(
			$document,
			// PORT-FIXME: There isn't a place anywhere yet for displayTitle
			/* $env->getPageConfig()->meta->displayTitle || */
			$env->getPageConfig()->getTitle()
		);

		// Add base href pointing to the wiki root
		$this->appendToHead( $document, 'base', [
			'href' => $env->getSiteConfig()->baseURI()
		] );

		// Stick data attributes in the head
		if ( $env->pageBundle ) {
			DOMDataUtils::injectPageBundle( $document, DOMDataUtils::getPageBundle( $document ) );
		}

		// PageConfig guarantees language will always be non-null.
		$lang = $env->getPageConfig()->getPageLanguage();
		$body = DOMCompat::getBody( $document );
		$body->setAttribute( 'lang', Utils::bcp47n( $lang ) );
		$this->updateBodyClasslist( $body, $env );
		$this->exportJSConfigVars( $document, $env );
		$this->exportGeneralModules( $document, $env );
		$this->exportStyleModules( $document, $env, $lang );

		// Indicate whether LanguageConverter is enabled, so that downstream
		// caches can split on variant (if necessary)
		$this->appendToHead( $document, 'meta', [
				'http-equiv' => 'content-language',
				'content' => $env->htmlContentLanguage()
			]
		);
		$this->appendToHead( $document, 'meta', [
				'http-equiv' => 'vary',
				'content' => $env->htmlVary()
			]
		);

		// When profiling, append the collected time profile to <body> as a
		// comment surrounded by newlines.
		if ( $env->profiling() ) {
			// NOTE(review): $profile is not used below - confirm whether the
			// call is still needed.
			$profile = $env->getCurrentProfile();
			$body->appendChild( $body->ownerDocument->createTextNode( "\n" ) );
			$body->appendChild( $body->ownerDocument->createComment( $this->timeProfile ) );
			$body->appendChild( $body->ownerDocument->createTextNode( "\n" ) );
		}
	}
867
868	/**
869	 * @param Node $node
870	 */
871	public function doPostProcess( Node $node ): void {
872		$env = $this->env;
873
874		$hasDumpFlags = $env->hasDumpFlags();
875
876		if ( $hasDumpFlags && $env->hasDumpFlag( 'dom:post-builder' ) ) {
877			$opts = [];
878			ContentUtils::dumpDOM( $node, 'DOM: after tree builder', $opts );
879		}
880
881		$startTime = null;
882		$endTime = null;
883		$prefix = null;
884		$traceLevel = null;
885		$resourceCategory = null;
886
887		$profile = null;
888		if ( $env->profiling() ) {
889			$profile = $env->getCurrentProfile();
890			if ( $this->atTopLevel ) {
891				$this->timeProfile = str_repeat( "-", 85 ) . "\n";
892				$prefix = 'TOP';
893				// Turn off DOM pass timing tracing on non-top-level documents
894				$resourceCategory = 'DOMPasses:TOP';
895			} else {
896				$prefix = '---';
897				$resourceCategory = 'DOMPasses:NESTED';
898			}
899			$startTime = PHPUtils::getStartHRTime();
900			$env->log( 'debug/time/dompp', $prefix . '; start=' . $startTime );
901		}
902
903		for ( $i = 0;  $i < count( $this->processors );  $i++ ) {
904			$pp = $this->processors[$i];
905			if ( !empty( $pp['skipNested'] ) && !$this->atTopLevel ) {
906				continue;
907			}
908
909			$ppName = null;
910			$ppStart = null;
911
912			// Trace
913			if ( $profile ) {
914				$ppName = $pp['name'] . str_repeat(
915					" ",
916					( strlen( $pp['name'] ) < 30 ) ? 30 - strlen( $pp['name'] ) : 0
917				);
918				$ppStart = PHPUtils::getStartHRTime();
919				$env->log( 'debug/time/dompp', $prefix . '; ' . $ppName . ' start' );
920			}
921
922			$opts = null;
923			if ( $hasDumpFlags ) {
924				$opts = [
925					'env' => $env,
926					'dumpFragmentMap' => $this->atTopLevel,
927					'keepTmp' => true
928				];
929
930				if ( $env->hasDumpFlag( 'dom:pre-' . $pp['shortcut'] ) ) {
931					ContentUtils::dumpDOM( $node, 'DOM: pre-' . $pp['shortcut'], $opts );
932				}
933			}
934
935			// Excessive to do it here always, but protects against future changes
936			// to how $this->frame may be updated.
937			$pp['proc']( $node, [ 'frame' => $this->frame ] + $this->options, $this->atTopLevel );
938
939			if ( $hasDumpFlags && $env->hasDumpFlag( 'dom:post-' . $pp['shortcut'] ) ) {
940				ContentUtils::dumpDOM( $node, 'DOM: post-' . $pp['shortcut'], $opts );
941			}
942
943			if ( $profile ) {
944				$ppElapsed = PHPUtils::getHRTimeDifferential( $ppStart );
945				$env->log(
946					'debug/time/dompp',
947					$prefix . '; ' . $ppName . ' end; time = ' . $ppElapsed
948				);
949				if ( $this->atTopLevel ) {
950					$this->timeProfile .= str_pad( $prefix . '; ' . $ppName, 65 ) .
951						' time = ' .
952						str_pad( number_format( $ppElapsed, 2 ), 10, ' ', STR_PAD_LEFT ) . "\n";
953				}
954				$profile->bumpTimeUse( $resourceCategory, $ppElapsed, 'DOM' );
955			}
956		}
957
958		if ( $profile ) {
959			$endTime = PHPUtils::getStartHRTime();
960			$env->log(
961				'debug/time/dompp',
962				$prefix . '; end=' . number_format( $endTime, 2 ) . '; time = ' .
963				number_format( PHPUtils::getHRTimeDifferential( $startTime ), 2 )
964			);
965		}
966
967		// For sub-pipeline documents, we are done.
968		// For the top-level document, we generate <head> and add it.
969		if ( $this->atTopLevel ) {
970			self::addMetaData( $env, $node->ownerDocument );
971			if ( $env->hasDumpFlag( 'wt2html:limits' ) ) {
972				/*
973				 * PORT-FIXME: Not yet implemented
974				$env->printWt2HtmlResourceUsage( [
975					'HTML Size' => strlen( DOMCompat::getOuterHTML( $document->documentElement ) )
976				] );
977				*/
978			}
979		}
980	}
981
982	/**
983	 * @inheritDoc
984	 */
985	public function process( $node, array $opts = null ) {
986		'@phan-var Node $node'; // @var Node $node
987		$this->doPostProcess( $node );
988		return $node;
989	}
990
991	/**
992	 * @inheritDoc
993	 */
994	public function processChunkily( $input, ?array $options ): Generator {
995		if ( $this->prevStage ) {
996			// The previous stage will yield a DOM.
997			// FIXME: Should we change the signature of that to return a DOM
998			// If we do so, a pipeline stage returns either a generator or
999			// concrete output (in this case, a DOM).
1000			$node = $this->prevStage->processChunkily( $input, $options )->current();
1001		} else {
1002			$node = $input;
1003		}
1004		$this->process( $node );
1005		yield $node;
1006	}
1007}
1008