1<?php
2declare( strict_types = 1 );
3
4namespace Wikimedia\Parsoid\Utils;
5
6use DOMComment;
7use DOMElement;
8use DOMNode;
9use stdClass;
10use Wikimedia\Parsoid\Config\Env;
11use Wikimedia\Parsoid\Config\WikitextConstants as Consts;
12use Wikimedia\Parsoid\Ext\ExtensionTagHandler;
13use Wikimedia\Parsoid\Tokens\CommentTk;
14use Wikimedia\Parsoid\Wt2Html\Frame;
15
16/**
 * These utilities pertain to extracting / modifying wikitext information from the DOM.
18 */
19class WTUtils {
	/**
	 * Regexp matching a typeof attribute value that contains, as a
	 * whitespace-delimited token, one of the encapsulation types:
	 * mw:Transclusion, mw:Param, mw:LanguageVariant or mw:Extension/<name>.
	 */
	private const FIRST_ENCAP_REGEXP =
		'#(?:^|\s)(mw:(?:Transclusion|Param|LanguageVariant|Extension(/[^\s]+)))(?=$|\s)#D';

	/**
	 * Regexp for checking marker metas typeofs representing
	 * transclusion markup or template param markup
	 * (with an optional "/End" suffix).
	 */
	private const TPL_META_TYPE_REGEXP = '#^mw:(?:Transclusion|Param)(?:/End)?$#D';
28
29	/**
30	 * Check whether a node's data-parsoid object includes
31	 * an indicator that the original wikitext was a literal
32	 * HTML element (like table or p)
33	 *
34	 * @param stdClass $dp
35	 * @return bool
36	 */
37	public static function hasLiteralHTMLMarker( stdClass $dp ): bool {
38		return isset( $dp->stx ) && $dp->stx === 'html';
39	}
40
41	/**
42	 * Run a node through {@link #hasLiteralHTMLMarker}.
43	 * @param DOMNode|null $node
44	 * @return bool
45	 */
46	public static function isLiteralHTMLNode( ?DOMNode $node ): bool {
47		return ( $node &&
48			$node instanceof DOMElement &&
49			self::hasLiteralHTMLMarker( DOMDataUtils::getDataParsoid( $node ) ) );
50	}
51
52	/**
53	 * @param DOMNode $node
54	 * @return bool
55	 */
56	public static function isZeroWidthWikitextElt( DOMNode $node ): bool {
57		return isset( Consts::$ZeroWidthWikitextTags[$node->nodeName] ) &&
58			!self::isLiteralHTMLNode( $node );
59	}
60
61	/**
62	 * Is `$node` a block node that is also visible in wikitext?
63	 * An example of an invisible block node is a `<p>`-tag that
64	 * Parsoid generated, or a `<ul>`, `<ol>` tag.
65	 *
66	 * @param DOMNode $node
67	 * @return bool
68	 */
69	public static function isBlockNodeWithVisibleWT( DOMNode $node ): bool {
70		return DOMUtils::isBlockNode( $node ) && !self::isZeroWidthWikitextElt( $node );
71	}
72
73	/**
74	 * Helper functions to detect when an A-$node uses [[..]]/[..]/... style
75	 * syntax (for wikilinks, ext links, url links). rel-type is not sufficient
76	 * anymore since mw:ExtLink is used for all the three link syntaxes.
77	 *
78	 * @param DOMElement $node
79	 * @param stdClass|null $dp
80	 * @return bool
81	 */
82	public static function usesWikiLinkSyntax( DOMElement $node, ?stdClass $dp ): bool {
83		// FIXME: Optimization from ComputeDSR to avoid refetching this property
84		// Is it worth the unnecessary code here?
85		if ( !$dp ) {
86			$dp = DOMDataUtils::getDataParsoid( $node );
87		}
88
89		// SSS FIXME: This requires to be made more robust
90		// for when dp->stx value is not present
91		return $node->getAttribute( "rel" ) === "mw:WikiLink" ||
92			( isset( $dp->stx ) && $dp->stx !== "url" && $dp->stx !== "magiclink" );
93	}
94
95	/**
96	 * Helper function to detect when an A-node uses ext-link syntax.
97	 * rel attribute is not sufficient anymore since mw:ExtLink is used for
98	 * multiple link types
99	 *
100	 * @param DOMElement $node
101	 * @param stdClass|null $dp
102	 * @return bool
103	 */
104	public static function usesExtLinkSyntax( DOMElement $node, ?stdClass $dp ): bool {
105		// FIXME: Optimization from ComputeDSR to avoid refetching this property
106		// Is it worth the unnecessary code here?
107		if ( !$dp ) {
108			$dp = DOMDataUtils::getDataParsoid( $node );
109		}
110
111		// SSS FIXME: This requires to be made more robust
112		// for when $dp->stx value is not present
113		return $node->getAttribute( "rel" ) === "mw:ExtLink" &&
114			( !isset( $dp->stx ) || ( $dp->stx !== "url" && $dp->stx !== "magiclink" ) );
115	}
116
117	/**
118	 * Helper function to detect when an A-node uses url-link syntax.
119	 * rel attribute is not sufficient anymore since mw:ExtLink is used for
120	 * multiple link types
121	 *
122	 * @param DOMElement $node
123	 * @param stdClass|null $dp
124	 * @return bool
125	 */
126	public static function usesURLLinkSyntax( DOMElement $node, stdClass $dp = null ): bool {
127		// FIXME: Optimization from ComputeDSR to avoid refetching this property
128		// Is it worth the unnecessary code here?
129		if ( !$dp ) {
130			$dp = DOMDataUtils::getDataParsoid( $node );
131		}
132
133		// SSS FIXME: This requires to be made more robust
134		// for when $dp->stx value is not present
135		return $node->getAttribute( "rel" ) === "mw:ExtLink" &&
136			isset( $dp->stx ) && $dp->stx === "url";
137	}
138
139	/**
140	 * Helper function to detect when an A-node uses magic-link syntax.
141	 * rel attribute is not sufficient anymore since mw:ExtLink is used for
142	 * multiple link types
143	 *
144	 * @param DOMElement $node
145	 * @param stdClass|null $dp
146	 * @return bool
147	 */
148	public static function usesMagicLinkSyntax( DOMElement $node, stdClass $dp = null ): bool {
149		if ( !$dp ) {
150			$dp = DOMDataUtils::getDataParsoid( $node );
151		}
152
153		// SSS FIXME: This requires to be made more robust
154		// for when $dp->stx value is not present
155		return $node->getAttribute( "rel" ) === "mw:ExtLink" &&
156			isset( $dp->stx ) && $dp->stx === "magiclink";
157	}
158
159	/**
160	 * Check whether a node's typeof indicates that it is a template expansion.
161	 *
162	 * @param DOMElement $node
163	 * @return ?string The matched type, or null if no match.
164	 */
165	public static function matchTplType( DOMElement $node ): ?string {
166		return DOMUtils::matchTypeOf( $node, self::TPL_META_TYPE_REGEXP );
167	}
168
169	/**
170	 * Check whether a typeof indicates that it signifies an
171	 * expanded attribute.
172	 *
173	 * @param DOMElement $node
174	 * @return bool
175	 */
176	public static function hasExpandedAttrsType( DOMElement $node ): bool {
177		return DOMUtils::matchTypeOf( $node, '/^mw:ExpandedAttrs(\/[^\s]+)*$/' ) !== null;
178	}
179
180	/**
181	 * Check whether a node is a meta tag that signifies a template expansion.
182	 *
183	 * @param DOMNode $node
184	 * @return bool
185	 */
186	public static function isTplMarkerMeta( DOMNode $node ): bool {
187		return DOMUtils::matchNameAndTypeOf( $node, 'meta', self::TPL_META_TYPE_REGEXP ) !== null;
188	}
189
190	/**
191	 * Check whether a node is a meta signifying the start of a template expansion.
192	 *
193	 * @param DOMNode $node
194	 * @return bool
195	 */
196	public static function isTplStartMarkerMeta( DOMNode $node ): bool {
197		$t = DOMUtils::matchNameAndTypeOf( $node, 'meta', self::TPL_META_TYPE_REGEXP );
198		return $t !== null && !preg_match( '#/End$#D', $t );
199	}
200
201	/**
202	 * Check whether a node is a meta signifying the end of a template
203	 * expansion.
204	 *
205	 * @param DOMNode $node
206	 * @return bool
207	 */
208	public static function isTplEndMarkerMeta( DOMNode $node ): bool {
209		$t = DOMUtils::matchNameAndTypeOf( $node, 'meta', self::TPL_META_TYPE_REGEXP );
210		return $t !== null && preg_match( '#/End$#D', $t );
211	}
212
213	/**
214	 * Find the first wrapper element of encapsulated content.
215	 * @param DOMNode $node
216	 * @return DOMElement|null
217	 */
218	public static function findFirstEncapsulationWrapperNode( DOMNode $node ): ?DOMElement {
219		if ( !self::hasParsoidAboutId( $node ) ) {
220			return null;
221		}
222		/** @var DOMElement $node */
223		DOMUtils::assertElt( $node );
224
225		$about = $node->getAttribute( 'about' );
226		$prev = $node;
227		do {
228			$node = $prev;
229			$prev = DOMUtils::previousNonDeletedSibling( $node );
230		} while (
231			$prev &&
232			$prev instanceof DOMElement &&
233			$prev->getAttribute( 'about' ) === $about
234		);
235		$elt = self::isFirstEncapsulationWrapperNode( $node ) ? $node : null;
236		'@phan-var ?DOMElement $elt'; // @var ?DOMElement $elt
237		return $elt;
238	}
239
240	/**
241	 * This tests whether a DOM $node is a new $node added during an edit session
242	 * or an existing $node from parsed wikitext.
243	 *
244	 * As written, this function can only be used on non-template/extension content
245	 * or on the top-level $nodes of template/extension content. This test will
246	 * return the wrong results on non-top-level $nodes of template/extension content.
247	 *
248	 * @param DOMNode $node
249	 * @return bool
250	 */
251	public static function isNewElt( DOMNode $node ): bool {
252		// We cannot determine newness on text/comment $nodes.
253		if ( !( $node instanceof DOMElement ) ) {
254			return false;
255		}
256
257		// For template/extension content, newness should be
258		// checked on the encapsulation wrapper $node.
259		$node = self::findFirstEncapsulationWrapperNode( $node ) ?? $node;
260		$dp = DOMDataUtils::getDataParsoid( $node );
261		return !empty( $dp->tmp->isNew );
262	}
263
264	/**
265	 * Check whether a pre is caused by indentation in the original wikitext.
266	 * @param DOMNode $node
267	 * @return bool
268	 */
269	public static function isIndentPre( DOMNode $node ): bool {
270		return $node->nodeName === "pre" && !self::isLiteralHTMLNode( $node );
271	}
272
273	/**
274	 * @param DOMNode $node
275	 * @return bool
276	 */
277	public static function isInlineMedia( DOMNode $node ): bool {
278		return DOMUtils::matchNameAndTypeOf(
279			$node, 'figure-inline', '#^mw:(Image|Video|Audio)($|/)#D'
280		) !== null;
281	}
282
283	/**
284	 * @param DOMNode $node
285	 * @return bool
286	 */
287	public static function isGeneratedFigure( DOMNode $node ): bool {
288		return DOMUtils::matchTypeOf( $node, '#^mw:(Image|Video|Audio)($|/)#' ) !== null;
289	}
290
291	/**
292	 * Find how much offset is necessary for the DSR of an
293	 * indent-originated pre tag.
294	 *
295	 * @param DOMNode $textNode
296	 * @return int
297	 */
298	public static function indentPreDSRCorrection( DOMNode $textNode ): int {
299		// NOTE: This assumes a text-node and doesn't check that it is one.
300		//
301		// FIXME: Doesn't handle text nodes that are not direct children of the pre
302		if ( self::isIndentPre( $textNode->parentNode ) ) {
303			if ( $textNode->parentNode->lastChild === $textNode ) {
304				// We dont want the trailing newline of the last child of the pre
305				// to contribute a pre-correction since it doesn't add new content
306				// in the pre-node after the text
307				$numNLs = preg_match_all( '/\n./', $textNode->nodeValue );
308			} else {
309				$numNLs = preg_match_all( '/\n/', $textNode->nodeValue );
310			}
311			return $numNLs;
312		} else {
313			return 0;
314		}
315	}
316
317	/**
318	 * Check if $node is an ELEMENT $node belongs to a template/extension.
319	 *
320	 * NOTE: Use with caution. This technique works reliably for the
321	 * root level elements of tpl-content DOM subtrees since only they
322	 * are guaranteed to be  marked and nested content might not
323	 * necessarily be marked.
324	 *
325	 * @param DOMNode $node
326	 * @return bool
327	 */
328	public static function hasParsoidAboutId( DOMNode $node ): bool {
329		if (
330			$node instanceof DOMElement &&
331			$node->hasAttribute( 'about' )
332		) {
333			$about = $node->getAttribute( 'about' );
334			// SSS FIXME: Verify that our DOM spec clarifies this
335			// expectation on about-ids and that our clients respect this.
336			return $about && Utils::isParsoidObjectId( $about );
337		} else {
338			return false;
339		}
340	}
341
342	/**
343	 * Does $node represent a redirect link?
344	 *
345	 * @param DOMNode $node
346	 * @return bool
347	 */
348	public static function isRedirectLink( DOMNode $node ): bool {
349		return $node->nodeName === 'link' &&
350			DOMUtils::assertElt( $node ) &&
351			preg_match( '#\bmw:PageProp/redirect\b#', $node->getAttribute( 'rel' ) );
352	}
353
354	/**
355	 * Does $node represent a category link?
356	 *
357	 * @param DOMNode|null $node
358	 * @return bool
359	 */
360	public static function isCategoryLink( ?DOMNode $node ): bool {
361		return $node instanceof DOMelement &&
362			$node->nodeName === 'link' &&
363			preg_match( '#\bmw:PageProp/Category\b#', $node->getAttribute( 'rel' ) );
364	}
365
366	/**
367	 * Does $node represent a link that is sol-transparent?
368	 *
369	 * @param DOMNode $node
370	 * @return bool
371	 */
372	public static function isSolTransparentLink( DOMNode $node ): bool {
373		return $node->nodeName === 'link' &&
374			DOMUtils::assertElt( $node ) &&
375			preg_match( TokenUtils::SOL_TRANSPARENT_LINK_REGEX, $node->getAttribute( 'rel' ) );
376	}
377
378	/**
379	 * Check if '$node' emits wikitext that is sol-transparent in wikitext form.
380	 * This is a test for wikitext that doesn't introduce line breaks.
381	 *
382	 * Comment, whitespace text $nodes, category links, redirect links, behavior
383	 * switches, and include directives currently satisfy this definition.
384	 *
385	 * This should come close to matching TokenUtils.isSolTransparent()
386	 *
387	 * @param DOMNode $node
388	 * @return bool
389	 */
390	public static function emitsSolTransparentSingleLineWT( DOMNode $node ): bool {
391		if ( DOMUtils::isText( $node ) ) {
392			// NB: We differ here to meet the nl condition.
393			return (bool)preg_match( '/^[ \t]*$/D', $node->nodeValue );
394		} elseif ( self::isRenderingTransparentNode( $node ) ) {
395			// NB: The only metas in a DOM should be for behavior switches and
396			// include directives, other than explicit HTML meta tags. This
397			// differs from our counterpart in Util where ref meta tokens
398			// haven't been expanded to spans yet.
399			return true;
400		} else {
401			return false;
402		}
403	}
404
405	/**
406	 * This is the span added to headings to add fallback ids for when legacy
407	 * and HTML5 ids don't match up. This prevents broken links to legacy ids.
408	 *
409	 * @param DOMNode $node
410	 * @return bool
411	 */
412	public static function isFallbackIdSpan( DOMNode $node ): bool {
413		return DOMUtils::hasNameAndTypeOf( $node, 'span', 'mw:FallbackId' );
414	}
415
416	/**
417	 * These are primarily 'metadata'-like $nodes that don't show up in output rendering.
418	 * - In Parsoid output, they are represented by link/meta tags.
419	 * - In the PHP parser, they are completely stripped from the input early on.
420	 *   Because of this property, these rendering-transparent $nodes are also
421	 *   SOL-transparent for the purposes of parsing behavior.
422	 *
423	 * @param DOMNode $node
424	 * @return bool
425	 */
426	public static function isRenderingTransparentNode( DOMNode $node ): bool {
427		// FIXME: Can we change this entire thing to
428		// DOMUtils::isComment($node) ||
429		// DOMUtils::getDataParsoid($node).stx !== 'html' &&
430		// ($node->nodeName === 'meta' || $node->nodeName === 'link')
431		//
432		return DOMUtils::isComment( $node ) ||
433			self::isSolTransparentLink( $node ) || (
434				// Catch-all for everything else.
435				$node->nodeName === 'meta' &&
436				DOMUtils::assertElt( $node ) &&
437				(
438					// (Start|End)Tag metas clone data-parsoid from the tokens
439					// they're shadowing, which trips up on the stx check.
440					// TODO: Maybe that data should be nested in a property?
441					DOMUtils::matchTypeOf( $node, '/^mw:(StartTag|EndTag)$/' ) !== null ||
442					!isset( DOMDataUtils::getDataParsoid( $node )->stx ) ||
443					DOMDataUtils::getDataParsoid( $node )->stx !== 'html'
444				)
445			) || self::isFallbackIdSpan( $node );
446	}
447
448	/**
449	 * Is $node nested inside a table tag that uses HTML instead of native
450	 * wikitext?
451	 *
452	 * @param DOMNode $node
453	 * @return bool
454	 */
455	public static function inHTMLTableTag( DOMNode $node ): bool {
456		$p = $node->parentNode;
457		while ( DOMUtils::isTableTag( $p ) ) {
458			if ( self::isLiteralHTMLNode( $p ) ) {
459				return true;
460			} elseif ( $p->nodeName === 'table' ) {
461				// Don't cross <table> boundaries
462				return false;
463			}
464			$p = $p->parentNode;
465		}
466
467		return false;
468	}
469
470	/**
471	 * Is $node the first wrapper element of encapsulated content?
472	 *
473	 * @param DOMNode $node
474	 * @return bool
475	 */
476	public static function isFirstEncapsulationWrapperNode( DOMNode $node ): bool {
477		return DOMUtils::matchTypeOf( $node, self::FIRST_ENCAP_REGEXP ) !== null;
478	}
479
480	/**
481	 * Is $node an encapsulation wrapper elt?
482	 *
483	 * All root-level $nodes of generated content are considered
484	 * encapsulation wrappers and share an about-id.
485	 *
486	 * @param DOMNode $node
487	 * @return bool
488	 */
489	public static function isEncapsulationWrapper( DOMNode $node ): bool {
490		// True if it has an encapsulation type or while walking backwards
491		// over elts with identical about ids, we run into a $node with an
492		// encapsulation type.
493		if ( !( $node instanceof DOMElement ) ) {
494			return false;
495		}
496		return self::findFirstEncapsulationWrapperNode( $node ) !== null;
497	}
498
499	/**
500	 * Is $node a DOMFragment wrapper?
501	 *
502	 * @param DOMNode $node
503	 * @return bool
504	 */
505	public static function isDOMFragmentWrapper( DOMNode $node ): bool {
506		// See TokenUtils::hasDOMFragmentType
507		return DOMUtils::matchTypeOf( $node, '#^mw:DOMFragment(/sealed/\w+)?$#D' ) !== null;
508	}
509
510	/**
511	 * Is $node a sealed DOMFragment of a specific type?
512	 *
513	 * @param DOMNode $node
514	 * @param string $type
515	 * @return bool
516	 */
517	public static function isSealedFragmentOfType( DOMNode $node, string $type ): bool {
518		return DOMUtils::hasTypeOf( $node, "mw:DOMFragment/sealed/$type" );
519	}
520
521	/**
522	 * Is $node a Parsoid-generated <section> tag?
523	 *
524	 * @param DOMNode $node
525	 * @return bool
526	 */
527	public static function isParsoidSectionTag( DOMNode $node ): bool {
528		return $node->nodeName === 'section' &&
529			DOMUtils::assertElt( $node ) &&
530			$node->hasAttribute( 'data-mw-section-id' );
531	}
532
533	/**
534	 * Is the $node from extension content?
535	 * @param DOMNode $node
536	 * @param string $extType
537	 * @return bool
538	 */
539	public static function fromExtensionContent( DOMNode $node, string $extType ): bool {
540		$parentNode = $node->parentNode;
541		while ( $parentNode && !DOMUtils::atTheTop( $parentNode ) ) {
542			if ( DOMUtils::hasTypeOf( $parentNode, "mw:Extension/$extType" ) ) {
543				return true;
544			}
545			$parentNode = $parentNode->parentNode;
546		}
547		return false;
548	}
549
550	/**
551	 * Compute, when possible, the wikitext source for a $node in
552	 * an environment env. Returns null if the source cannot be
553	 * extracted.
554	 * @param Frame $frame
555	 * @param DOMElement $node
556	 * @return string|null
557	 */
558	public static function getWTSource( Frame $frame, DOMElement $node ): ?string {
559		$dp = DOMDataUtils::getDataParsoid( $node );
560		$dsr = $dp->dsr ?? null;
561		// FIXME: We could probably change the null return to ''
562		// Just need to verify that code that uses this won't break
563		return Utils::isValidDSR( $dsr ) ?
564			$dsr->substr( $frame->getSrcText() ) : null;
565	}
566
567	/**
568	 * Gets all siblings that follow '$node' that have an 'about' as
569	 * their about id.
570	 *
571	 * This is used to fetch transclusion/extension content by using
572	 * the about-id as the key.  This works because
573	 * transclusion/extension content is a forest of dom-trees formed
574	 * by adjacent dom-nodes.  This is the contract that template
575	 * encapsulation, dom-reuse, and VE code all have to abide by.
576	 *
577	 * The only exception to this adjacency rule is IEW nodes in
578	 * fosterable positions (in tables) which are not span-wrapped to
579	 * prevent them from getting fostered out.
580	 *
581	 * @param DOMNode $node
582	 * @param string $about
583	 * @return DOMNode[]
584	 */
585	public static function getAboutSiblings( DOMNode $node, string $about ): array {
586		$nodes = [ $node ];
587
588		if ( !$about ) {
589			return $nodes;
590		}
591
592		$node = $node->nextSibling;
593		while ( $node && (
594			$node instanceof DOMElement &&
595			$node->getAttribute( 'about' ) === $about ||
596				DOMUtils::isFosterablePosition( $node ) && !DOMUtils::isElt( $node ) && DOMUtils::isIEW( $node )
597		) ) {
598			$nodes[] = $node;
599			$node = $node->nextSibling;
600		}
601
602		// Remove already consumed trailing IEW, if any
603		while ( count( $nodes ) > 0 && DOMUtils::isIEW( $nodes[count( $nodes ) - 1] ) ) {
604			array_pop( $nodes );
605		}
606
607		return $nodes;
608	}
609
610	/**
611	 * This function is only intended to be used on encapsulated $nodes
612	 * (Template/Extension/Param content).
613	 *
614	 * Given a '$node' that has an about-id, it is assumed that it is generated
615	 * by templates or extensions.  This function skips over all
616	 * following content nodes and returns the first non-template node
617	 * that follows it.
618	 *
619	 * @param DOMNode $node
620	 * @return DOMNode|null
621	 */
622	public static function skipOverEncapsulatedContent( DOMNode $node ): ?DOMNode {
623		if ( $node instanceof DOMElement && $node->hasAttribute( 'about' ) ) {
624			$about = $node->getAttribute( 'about' );
625			// Guaranteed not to be empty. It will at least include $node.
626			$aboutSiblings = self::getAboutSiblings( $node, $about );
627			return end( $aboutSiblings )->nextSibling;
628		} else {
629			return $node->nextSibling;
630		}
631	}
632
633	/**
634	 * Comment encoding/decoding.
635	 *
636	 * * Some relevant phab tickets: T94055, T70146, T60184, T95039
637	 *
638	 * The wikitext comment rule is very simple: <!-- starts a comment,
639	 * and --> ends a comment.  This means we can have almost anything as the
640	 * contents of a comment (except the string "-->", but see below), including
641	 * several things that are not valid in HTML5 comments:
642	 *
643	 * * For one, the html5 comment parsing algorithm [0] leniently accepts
644	 * --!> as a closing comment tag, which differs from the php+tidy combo.
645	 *
646	 * * If the comment's data matches /^-?>/, html5 will end the comment.
647	 *    For example, <!-->stuff<--> breaks up as
648	 *    <!--> (the comment) followed by, stuff<--> (as text).
649	 *
650	 *  * Finally, comment data shouldn't contain two consecutive hyphen-minus
651	 *    characters (--), nor end in a hyphen-minus character (/-$/) as defined
652	 *    in the spec [1].
653	 *
654	 * We work around all these problems by using HTML entity encoding inside
655	 * the comment body.  The characters -, >, and & must be encoded in order
656	 * to prevent premature termination of the comment by one of the cases
657	 * above.  Encoding other characters is optional; all entities will be
658	 * decoded during wikitext serialization.
659	 *
660	 * In order to allow *arbitrary* content inside a wikitext comment,
661	 * including the forbidden string "-->" we also do some minimal entity
662	 * decoding on the wikitext.  We are also limited by our inability
663	 * to encode DSR attributes on the comment $node, so our wikitext entity
664	 * decoding must be 1-to-1: that is, there must be a unique "decoded"
665	 * string for every wikitext sequence, and for every decoded string there
666	 * must be a unique wikitext which creates it.
667	 *
668	 * The basic idea here is to replace every string ab*c with the string with
669	 * one more b in it.  This creates a string with no instance of "ac",
670	 * so you can use 'ac' to encode one more code point.  In this case
671	 * a is "--&", "b" is "amp;", and "c" is "gt;" and we use ac to
672	 * encode "-->" (which is otherwise unspeakable in wikitext).
673	 *
674	 * Note that any user content which does not match the regular
675	 * expression /--(>|&(amp;)*gt;)/ is unchanged in its wikitext
676	 * representation, as shown in the first two examples below.
677	 *
678	 * User-authored comment text    Wikitext       HTML5 DOM
679	 * --------------------------    -------------  ----------------------
680	 * & - >                         & - >          &amp; &#43; &gt;
681	 * Use &gt; here                 Use &gt; here  Use &amp;gt; here
682	 * -->                           --&gt;         &#43;&#43;&gt;
683	 * --&gt;                        --&amp;gt;     &#43;&#43;&amp;gt;
684	 * --&amp;gt;                    --&amp;amp;gt; &#43;&#43;&amp;amp;gt;
685	 *
686	 * [0] http://www.w3.org/TR/html5/syntax.html#comment-start-state
687	 * [1] http://www.w3.org/TR/html5/syntax.html#comments
688	 *
689	 * Map a wikitext-escaped comment to an HTML DOM-escaped comment.
690	 *
691	 * @param string $comment Wikitext-escaped comment.
692	 * @return string DOM-escaped comment.
693	 */
694	public static function encodeComment( string $comment ): string {
695		// Undo wikitext escaping to obtain "true value" of comment.
696		$trueValue = preg_replace_callback( '/--&(amp;)*gt;/', function ( $m ) {
697				return Utils::decodeWtEntities( $m[0] );
698		}, $comment );
699
700		// Now encode '-', '>' and '&' in the "true value" as HTML entities,
701		// so that they can be safely embedded in an HTML comment.
702		// This part doesn't have to map strings 1-to-1.
703		// WARNING(T279451): This is actually the part which protects the
704		// "-type" key in self::fosterCommentData
705		return preg_replace_callback( '/[->&]/', function ( $m ) {
706			return Utils::entityEncodeAll( $m[0] );
707		}, $trueValue );
708	}
709
710	/**
711	 * Map an HTML DOM-escaped comment to a wikitext-escaped comment.
712	 * @param string $comment DOM-escaped comment.
713	 * @return string Wikitext-escaped comment.
714	 */
715	public static function decodeComment( string $comment ): string {
716		// Undo HTML entity escaping to obtain "true value" of comment.
717		$trueValue = Utils::decodeWtEntities( $comment );
718
719		// ok, now encode this "true value" of the comment in such a way
720		// that the string "-->" never shows up.  (See above.)
721		return preg_replace_callback( '/--(&(amp;)*gt;|>)/', function ( $m ) {
722			$s = $m[0];
723				return $s === '-->' ? '--&gt;' : '--&amp;' . substr( $s, 3 );
724		}, $trueValue );
725	}
726
727	/**
728	 * Utility function: we often need to know the wikitext DSR length for
729	 * an HTML DOM comment value.
730	 *
731	 * @param DOMComment|CommentTk|string $node A comment node containing a DOM-escaped comment.
732	 * @return int The wikitext length in UTF-8 bytes necessary to encode this
733	 *   comment, including 7 characters for the `<!--` and `-->` delimiters.
734	 */
735	public static function decodedCommentLength( $node ): int {
736		// Add 7 for the "<!--" and "-->" delimiters in wikitext.
737		if ( $node instanceof DOMComment ) {
738			$value = $node->nodeValue;
739		} elseif ( $node instanceof CommentTk ) {
740			$value = $node->value;
741		} else {
742			$value = $node;
743		}
744		return strlen( self::decodeComment( $value ) ) + 7;
745	}
746
747	/**
748	 * Escape `<nowiki>` tags.
749	 *
750	 * @param string $text
751	 * @return string
752	 */
753	public static function escapeNowikiTags( string $text ): string {
754		return preg_replace( '#<(/?nowiki\s*/?\s*)>#i', '&lt;$1&gt;', $text );
755	}
756
757	/**
758	 * Conditional encoding is because, while treebuilding, the value goes
759	 * directly from token to dom node without the comment itself being
760	 * stringified and parsed where the comment encoding would be necessary.
761	 *
762	 * @param string $typeOf
763	 * @param array $attrs
764	 * @param bool $encode
765	 * @return string
766	 */
767	public static function fosterCommentData( string $typeOf, array $attrs, bool $encode ): string {
768		$str = PHPUtils::jsonEncode( [
769			// WARNING(T279451): The choice of "-type" as the key is because
770			// "-" will be encoded with self::encodeComment when comments come
771			// from source wikitext (see the grammar), so we can be sure when
772			// reinserting that the comments are internal to Parsoid
773			'-type' => $typeOf,
774			'attrs' => $attrs
775		] );
776		if ( $encode ) {
777			$str = self::encodeComment( $str );
778		}
779		return $str;
780	}
781
782	/**
783	 * @param Env $env
784	 * @param DOMNode $node
785	 * @param bool $decode
786	 * @return DOMNode|null
787	 */
788	public static function reinsertFosterableContent( Env $env, DOMNode $node, bool $decode ):
789			?DOMNode {
790		if ( DOMUtils::isComment( $node ) && preg_match( '/^\{.+\}$/D', $node->nodeValue ) ) {
791			// XXX(T279451#6981267): Hardcode this for good measure, even
792			// though all production uses should already be passing in `false`
793			$decode = false;
794			// Convert serialized meta tags back from comments.
795			// We use this trick because comments won't be fostered,
796			// providing more accurate information about where tags are expected
797			// to be found.
798			// @phan-suppress-next-line PhanImpossibleCondition
799			$data = json_decode( $decode ? self::decodeComment( $node->nodeValue ) : $node->nodeValue );
800			if ( $data === null ) {
801				// not a valid json attribute, do nothing
802				return null;
803			}
804			$type = $data->{'-type'} ?? '';
805			if ( preg_match( '/^mw:/', $type ) ) {
806				$meta = $node->ownerDocument->createElement( 'meta' );
807				foreach ( $data->attrs as $attr ) {
808					try {
809						$meta->setAttribute( ...$attr );
810					} catch ( \Exception $e ) {
811						$env->log( 'warn', 'prepareDOM: Dropped invalid attribute',
812							PHPUtils::jsonEncode( $attr )
813						);
814					}
815				}
816				$node->parentNode->replaceChild( $meta, $node );
817				return $meta;
818			}
819		}
820		return null;
821	}
822
823	/**
824	 * @param Env $env
825	 * @param DOMNode $node
826	 * @return ?ExtensionTagHandler
827	 */
828	public static function getNativeExt( Env $env, DOMNode $node ): ?ExtensionTagHandler {
829		$match = DOMUtils::matchTypeOf( $node, '/^mw:Extension\/(.+?)$/' );
830		$matchingTag = $match ? substr( $match, strlen( 'mw:Extension/' ) ) : null;
831		return $matchingTag ?
832			$env->getSiteConfig()->getExtTagImpl( $matchingTag ) : null;
833	}
834}
835