1<?php
2
3namespace Wikimedia\Parsoid\Html2Wt;
4
5use stdClass;
6use UnexpectedValueException;
7use Wikimedia\Parsoid\Config\Env;
8use Wikimedia\Parsoid\Core\MediaStructure;
9use Wikimedia\Parsoid\DOM\Element;
10use Wikimedia\Parsoid\DOM\Node;
11use Wikimedia\Parsoid\Html2Wt\ConstrainedText\AutoURLLinkText;
12use Wikimedia\Parsoid\Html2Wt\ConstrainedText\ExtLinkText;
13use Wikimedia\Parsoid\Html2Wt\ConstrainedText\MagicLinkText;
14use Wikimedia\Parsoid\Html2Wt\ConstrainedText\WikiLinkText;
15use Wikimedia\Parsoid\Utils\ContentUtils;
16use Wikimedia\Parsoid\Utils\DOMCompat;
17use Wikimedia\Parsoid\Utils\DOMDataUtils;
18use Wikimedia\Parsoid\Utils\DOMUtils;
19use Wikimedia\Parsoid\Utils\PHPUtils;
20use Wikimedia\Parsoid\Utils\TokenUtils;
21use Wikimedia\Parsoid\Utils\UrlUtils;
22use Wikimedia\Parsoid\Utils\Utils;
23use Wikimedia\Parsoid\Utils\WTUtils;
24
25/**
26 * Serializes link markup.
27 */
28class LinkHandlerUtils {
	/**
	 * Matches a string made up solely of space, tab, newline, carriage return,
	 * NUL and vertical-tab characters (or an empty string). Used below to test
	 * whether only such characters have been emitted before a redirect.
	 * @var string
	 */
	private static $REDIRECT_TEST_RE = '/^([ \t\n\r\0\x0b])*$/D';
	/**
	 * Matches runs of spaces, underscores and a set of Unicode space/separator
	 * code points. Used below to collapse such runs to a single "_" before
	 * comparing a decoded link target against a normalized title key.
	 * @var string
	 */
	private static $MW_TITLE_WHITESPACE_RE
		= '/[ _\xA0\x{1680}\x{180E}\x{2000}-\x{200A}\x{2028}\x{2029}\x{202F}\x{205F}\x{3000}]+/u';
32
33	/**
34	 * Split a string based on a prefix and suffix
35	 *
36	 * @param string $contentString
37	 * @param stdClass $dp Containing ->prefix and ->tail
38	 * @return stdClass
39	 */
40	private static function splitLinkContentString( string $contentString, stdClass $dp ): stdClass {
41		$tail = $dp->tail ?? '';
42		$prefix = $dp->prefix ?? '';
43
44		$tailLen = strlen( $tail );
45		if ( $tailLen && substr( $contentString, -$tailLen ) === $tail ) {
46			// strip the tail off the content
47			$contentString = substr( $contentString, 0, -$tailLen );
48		} else {
49			$tail = '';
50		}
51
52		$prefixLen = strlen( $prefix );
53		if ( $prefixLen && substr( $contentString, 0, $prefixLen ) === $prefix ) {
54			$contentString = substr( $contentString, $prefixLen );
55		} else {
56			$prefix = '';
57		}
58
59		return (object)[
60			'contentString' => $contentString,
61			'tail' => $tail,
62			'prefix' => $prefix,
63		];
64	}
65
66	/**
67	 * Helper function for munging protocol-less absolute URLs:
68	 * If this URL is absolute, but doesn't contain a protocol,
69	 * try to find a localinterwiki protocol that would work.
70	 *
71	 * @param Env $env
72	 * @param Element $node
73	 * @return string
74	 */
75	private static function getHref( Env $env, Element $node ): string {
76		$href = $node->getAttribute( 'href' ) ?? '';
77		if ( ( $href[0] ?? '' ) === '/' && ( $href[1] ?? '' ) !== '/' ) {
78			// protocol-less but absolute.  let's find a base href
79			foreach ( $env->getSiteConfig()->interwikiMap() as $prefix => $interwikiInfo ) {
80				if ( isset( $interwikiInfo['localinterwiki'] ) && isset( $interwikiInfo['url'] ) ) {
81					$base = $interwikiInfo['url'];
82
83					// evaluate the url relative to this base
84					$nhref = UrlUtils::expandUrl( $href, $base );
85
86					// can this match the pattern?
87					$re = '/^' . strtr( preg_quote( $base, '/' ), [ '\\$1' => '.*' ] ) . '$/sD';
88					if ( preg_match( $re, $nhref ) ) {
89						return $nhref;
90					}
91				}
92			}
93		}
94		return $href;
95	}
96
97	/**
	 * Normalize an interwiki prefix: lower-case it, trim surrounding
	 * whitespace, and strip a leading ':' (via PHPUtils::stripPrefix).
99	 * @param string $str
100	 * @return string
101	 */
102	private static function normalizeIWP( string $str ): string {
103		return PHPUtils::stripPrefix( trim( strtolower( $str ) ), ':' );
104	}
105
106	/**
107	 * Escape a link target, and indicate if it's valid
108	 * @param string $linkTarget
109	 * @param SerializerState $state
110	 * @return stdClass
111	 */
112	private static function escapeLinkTarget( string $linkTarget, SerializerState $state ): stdClass {
113		// Entity-escape the content.
114		$linkTarget = Utils::escapeWtEntities( $linkTarget );
115		return (object)[
116			'linkTarget' => $linkTarget,
117			// Is this an invalid link?
118			'invalidLink' => !$state->getEnv()->isValidLinkTarget( $linkTarget ) ||
119				// `isValidLinkTarget` omits fragments (the part after #) so,
120				// even though "|" is an invalid character, we still need to ensure
121				// it doesn't appear in there.  The percent encoded version is fine
122				// in the fragment, since it won't break the parse.
123				strpos( $linkTarget, '|' ) !== false,
124		];
125	}
126
127	/**
128	 * Get the plain text content of the node, if it can be represented as such
129	 *
130	 * NOTE: This function seems a little inconsistent about what's considered
131	 * null and what's an empty string.  For example, no children is null
132	 * but a single diffMarker gets a string?  One of the current callers
133	 * seems to subtly depend on that though.
134	 *
135	 * FIXME(T254501): This function can return `$node->textContent` instead
136	 * of the string concatenation once mw:DisplaySpace is preprocessed away.
137	 *
138	 * @param Node $node
139	 * @return ?string
140	 */
141	private static function getContentString( Node $node ): ?string {
142		if ( !$node->hasChildNodes() ) {
143			return null;
144		}
145		$contentString = '';
146		$child = $node->firstChild;
147		while ( $child ) {
148			if ( DOMUtils::isText( $child ) ) {
149				$contentString .= $child->nodeValue;
150			} elseif ( DOMUtils::hasTypeOf( $child, 'mw:DisplaySpace' ) ) {
151				$contentString .= ' ';
152			} elseif ( DOMUtils::isDiffMarker( $child ) ) {
153			} else {
154				return null;
155			}
156			$child = $child->nextSibling;
157		}
158		return $contentString;
159	}
160
161	/**
162	 * Helper function for getting RT data from the tokens
163	 * @param Env $env
164	 * @param Element $node
165	 * @param SerializerState $state
166	 * @return stdClass
167	 */
	private static function getLinkRoundTripData(
		Env $env, Element $node, SerializerState $state
	): stdClass {
		// Builds the "link data" record for $node: link type, href/origHref,
		// the shadowed target, prefix/tail, the content (plain string or the
		// node itself) and, where applicable, redirect and interwiki flags.
		$dp = DOMDataUtils::getDataParsoid( $node );
		$siteConfig = $env->getSiteConfig();
		$rtData = (object)[
			'type' => null, // could be null
			'href' => null, // filled in below
			'origHref' => null, // filled in below
			'target' => null, // filled in below
			'tail' => $dp->tail ?? '',
			'prefix' => $dp->prefix ?? '',
			'linkType' => null
		];
		$rtData->content = new stdClass;

		// Figure out the type of the link
		if ( $node->hasAttribute( 'rel' ) ) {
			$rel = $node->getAttribute( 'rel' ) ?? '';
			// Parsoid only emits and recognizes ExtLink, WikiLink, and PageProp rel values.
			// Everything else defaults to ExtLink during serialization (unless it is
			// serializable to a wikilink)
			if ( preg_match( '/\b(mw:(WikiLink|ExtLink|MediaLink|PageProp)[^\s]*)\b/', $rel, $typeMatch ) ) {
				$rtData->type = $typeMatch[1];
				// Strip link subtype info
				if ( $typeMatch[2] === 'WikiLink' || $typeMatch[2] === 'ExtLink' ) {
					$rtData->type = 'mw:' . $typeMatch[2];
				}
			}
		}

		// Default link type if nothing else is set
		// (links wrapping a media element keep a null type)
		if ( $rtData->type === null && !DOMUtils::selectMediaElt( $node ) ) {
			$rtData->type = 'mw:ExtLink';
		}

		// Get href, and save the token's "real" href for comparison
		$href = self::getHref( $env, $node );
		$rtData->origHref = $href;
		// ->href has any leading run of "./" and "../" segments removed
		$rtData->href = preg_replace( '#^(\.\.?/)+#', '', $href, 1 );

		// WikiLinks should be relative (but see below); fixup the link type
		// if a WikiLink has an absolute URL.
		// (This may get converted back to a WikiLink below, in the interwiki
		// handling code.)
		if ( $rtData->type === 'mw:WikiLink' &&
			( preg_match( '#^(\w+:)?//#', $rtData->href ) ||
				substr( $rtData->origHref ?? '', 0, 1 ) === '/' )
		) {
			$rtData->type = 'mw:ExtLink';
		}

		// Now get the target from rt data
		$rtData->target = $state->serializer->serializedAttrVal( $node, 'href' );

		// Check if the link content has been modified or is newly inserted content.
		// FIXME: This will only work with selser of course. Hard to test without selser.
		if ( $state->inModifiedContent ||
			DiffUtils::hasDiffMark( $node, $env, 'subtree-changed' )
		) {
			$rtData->contentModified = true;
		}

		// Get the content string or tokens
		$contentString = self::getContentString( $node );
		if ( $contentString !== null ) {
			if ( !empty( $rtData->target['value'] ) && $rtData->target['value'] !== $contentString ) {
				// Try to identify a new potential tail
				$contentParts = self::splitLinkContentString( $contentString, $dp );
				$rtData->content->string = $contentParts->contentString;
				$rtData->tail = $contentParts->tail;
				$rtData->prefix = $contentParts->prefix;
			} else {
				// Content equals the target (or the target is empty):
				// no prefix/tail is split off.
				$rtData->tail = '';
				$rtData->prefix = '';
				$rtData->content->string = $contentString;
			}
		} elseif ( $node->hasChildNodes() ) {
			// Content is not plain text: keep the node for later serialization
			$rtData->contentNode = $node;
		} elseif ( $rtData->type === 'mw:PageProp/redirect' ) {
			// Childless redirect link: the prefix is the original source if
			// recorded, else the first 'redirect' alias (or '#REDIRECT') plus a space.
			$rtData->isRedirect = true;
			$rtData->prefix = $dp->src
				?? ( ( $siteConfig->mwAliases()['redirect'][0] ?? '#REDIRECT' ) . ' ' );
		}

		// Update link type based on additional analysis.
		// What might look like external links might be serializable as a wikilink.
		// NOTE: $target is a reference, so writes to $target['value'] below
		// update $rtData->target in place.
		$target = &$rtData->target;

		// mw:MediaLink annotations are considered authoritative
		// and interwiki link matches aren't made for these
		if ( $rtData->type === 'mw:MediaLink' ) {
			// Parse title from resource attribute (see analog in image handling)
			$resource = $state->serializer->serializedAttrVal( $node, 'resource' );
			if ( $resource['value'] === null ) {
				// from non-parsoid HTML: try to reconstruct resource from href?
				// (See similar code which tries to guess resource from <img src>)
				$mediaPrefix = $siteConfig->namespaceName( $siteConfig->namespaceId( 'media' ) );
				// File name is whatever follows the last "/" of the original href
				$slashPos = strrpos( $rtData->origHref, '/' );
				$fileName = $slashPos === false ? $rtData->origHref :
					substr( $rtData->origHref, $slashPos + 1 );
				$resource = [
					'value' => $mediaPrefix . ':' . $fileName,
					'fromsrc' => false,
					'modified' => false
				];
			}
			$rtData->target = $resource;
			$rtData->href = preg_replace( '#^(\.\.?/)+#', '', $rtData->target['value'], 1 );
			return $rtData;
		}

		// Check if the href matches any of our interwiki URL patterns
		// ($interWikiMatch[0] is used below as the prefix and [1] as the title part)
		$interWikiMatch = $siteConfig->interWikiMatcher( $href );
		if ( $interWikiMatch &&
			// Question mark is a valid title char, so it won't fail the test below,
			// but gets percent encoded on the way out since it has special
			// semantics in a url.  That will break the url we're serializing, so
			// protect it.
			// FIXME: If ever the default value for $wgExternalInterwikiFragmentMode
			// changes, we can reduce this by always stripping off the fragment
			// identifier, since in "html5" mode, that isn't encoded.  At present,
			// we can only do that if we know it's a local interwiki link.
			strpos( $interWikiMatch[1], '?' ) === false &&
			// Ensure we have a valid link target, otherwise falling back to extlink
			// is preferable, since it won't serialize as a link.
			(
				$interWikiMatch[1] === '' || !self::escapeLinkTarget(
					// Append the prefix since we want to validate the target
					// with respect to it being an interwiki.
					$interWikiMatch[0] . ':' . $interWikiMatch[1],
					$state
				)->invalidLink
			) &&
			// ExtLinks should have content to convert.
			(
				$rtData->type !== 'mw:ExtLink' ||
				!empty( $rtData->content->string ) ||
				!empty( $rtData->contentNode )
			) &&
			// Only convert links that were interwikis in the source, or that were edited
			( !empty( $dp->isIW ) || !empty( $target['modified'] ) || !empty( $rtData->contentModified ) )
		) {
			// External link that is really an interwiki link. Convert it.
			// TODO: Leaving this for backwards compatibility, remove when 1.5 is no longer bound
			if ( $rtData->type === 'mw:ExtLink' ) {
				$rtData->type = 'mw:WikiLink';
			}
			$rtData->isInterwiki = true;
			$iwMap = $siteConfig->interwikiMap();
			// could this be confused with a language link?
			$iwi = $iwMap[self::normalizeIWP( $interWikiMatch[0] )] ?? null;
			$rtData->isInterwikiLang = $iwi && isset( $iwi['language'] );
			// is this our own wiki?
			$rtData->isLocal = $iwi && isset( $iwi['localinterwiki'] );
			// strip off localinterwiki prefixes
			// Each pass looks at the text after the prefixes consumed so far.
			// The loop ends at the first "prefix:" that is not a local
			// interwiki (left in $oldPrefix) or when no prefix matches at all.
			$localPrefix = '';
			$oldPrefix = null;
			while ( true ) {
				$tmp = substr( $target['value'], strlen( $localPrefix ) );
				if ( !preg_match( '/^(:?([^:]+)):/', $tmp, $oldPrefix ) ) {
					break;
				}
				$iwi = $iwMap[Utils::normalizeNamespaceName( $oldPrefix[2] )] ?? null;
				if ( !$iwi || !isset( $iwi['localinterwiki'] ) ) {
					break;
				}
				$localPrefix .= $oldPrefix[1] . ':';
			}

			if ( !empty( $target['fromsrc'] ) && empty( $target['modified'] ) ) {
				// Leave the target alone!
			} else {
				if ( $rtData->type === 'mw:PageProp/Language' ) {
					$targetValue = implode( ':', $interWikiMatch );
					// Strip initial colon
					if ( $targetValue[0] === ':' ) {
						$targetValue = substr( $targetValue, 1 );
					}
					$target['value'] = $targetValue;
				} elseif (
					$oldPrefix && ( // Should we preserve the old prefix?
						strcasecmp( $oldPrefix[1], $interWikiMatch[0] ) === 0 ||
						// Check if the old prefix mapped to the same URL as
						// the new one. Use the old one if that's the case.
						// Example: [[w:Foo]] vs. [[:en:Foo]]
						( $iwMap[self::normalizeIWP( $oldPrefix[1] )]['url'] ?? null )
							=== ( $iwMap[self::normalizeIWP( $interWikiMatch[0] )]['url'] ?? null )
					)
				) {
					// Reuse old prefix capitalization
					if ( Utils::decodeWtEntities( substr( $target['value'], strlen( $oldPrefix[1] ) + 1 ) )
						!== $interWikiMatch[1]
					) {
						// Modified, update target.value.
						$target['value'] = $localPrefix . $oldPrefix[1] . ':' . $interWikiMatch[1];
					}
					// Ensure that we generate an interwiki link and not a language link!
					if ( $rtData->isInterwikiLang && $target['value'][0] !== ':' ) {
						$target['value'] = ':' . $target['value'];
					}
				} else { // Else: preserve old encoding
					if ( !empty( $rtData->isLocal ) ) {
						// - interwikiMatch will be ":en", ":de", etc.
						// - This tests whether the interwiki-like link is actually
						// a local wikilink.
						$target['value'] = $interWikiMatch[1];
						$rtData->isInterwiki = $rtData->isInterwikiLang = false;
					} else {
						$target['value'] = implode( ':', $interWikiMatch );
					}
				}
			}
		}

		return $rtData;
	}
384
385	/**
386	 * The provided URL is already percent-encoded -- but it may still
387	 * not be safe for wikitext.  Add additional escapes to make the URL
388	 * wikitext-safe. Don't touch percent escapes already in the url,
389	 * though!
390	 * @param string $urlStr
391	 * @return string
392	 */
393	private static function escapeExtLinkURL( string $urlStr ): string {
394		// this regexp is the negation of EXT_LINK_URL_CLASS in the PHP parser
395		return preg_replace(
396			// IPv6 host names are bracketed with [].  Entity-decode these.
397			'!^([a-z][^:/]*:)?//&#x5B;([0-9a-f:.]+)&#x5D;(:\d|/|$)!iD',
398			'$1//[$2]$3',
399			preg_replace_callback(
400				// phpcs:ignore Generic.Files.LineLength.TooLong
401				'/[\]\[<>"\x00-\x20\x7F\x{A0}\x{1680}\x{180E}\x{2000}-\x{200A}\x{202F}\x{205F}\x{3000}]|-(?=\{)/u',
402				static function ( $m ) {
403					return Utils::entityEncodeAll( $m[0] );
404				},
405				$urlStr
406			),
407			1
408		);
409	}
410
411	/**
412	 * Add a colon escape to a wikilink target string if needed.
413	 * @param Env $env
414	 * @param string $linkTarget
415	 * @param stdClass $linkData
416	 * @return string
417	 */
418	private static function addColonEscape(
419		Env $env, string $linkTarget, stdClass $linkData
420	): string {
421		$linkTitle = $env->makeTitleFromText( $linkTarget );
422		if ( ( $linkTitle->getNamespace()->isCategory() || $linkTitle->getNamespace()->isFile() ) &&
423			$linkData->type === 'mw:WikiLink' &&
424			$linkTarget[0] !== ':' ) {
425			// Escape category and file links
426			return ':' . $linkTarget;
427		} else {
428			return $linkTarget;
429		}
430	}
431
432	/**
433	 * Test if something is a URL link
434	 * @param Env $env
435	 * @param Element $node
436	 * @param stdClass $linkData
437	 * @return bool
438	 */
439	private static function isURLLink( Env $env, Element $node, stdClass $linkData ): bool {
440		$target = $linkData->target;
441
442		// Get plain text content, if any
443		$contentStr = self::getContentString( $node );
444
445		// First check if we can serialize as an URL link
446		return ( $contentStr !== null && $contentStr !== '' ) &&
447			// Can we minimize this?
448			( $target['value'] === $contentStr || self::getHref( $env, $node ) === $contentStr ) &&
449			// protocol-relative url links not allowed in text
450			// (see autourl rule in peg tokenizer, T32269)
451			!str_starts_with( $contentStr, '//' ) && Utils::isProtocolValid( $contentStr, $env );
452	}
453
454	/**
455	 * Figure out if we need a piped or simple link
456	 * @param Env $env
457	 * @param stdClass $dp
458	 * @param array $target
459	 * @param stdClass $linkData
460	 * @return bool
461	 */
462	private static function isSimpleWikiLink(
463		Env $env, stdClass $dp, array $target, stdClass $linkData
464	): bool {
465		$canUseSimple = false;
466		$contentString = $linkData->content->string ?? null;
467
468		// FIXME (SSS):
469		// 1. Revisit this logic to see if all these checks
470		// are still relevant or whether this can be simplified somehow.
471		// 2. There are also duplicate computations for env.normalizedTitleKey(..)
472		// and Util.decodeURIComponent(..) that could be removed.
473		// 3. This could potentially be refactored as if-then chains.
474
475		// Would need to pipe for any non-string content.
476		// Preserve unmodified or non-minimal piped links.
477		if ( $contentString !== null &&
478			( !empty( $target['modified'] ) || !empty( $linkData->contentModified ) ||
479				( $dp->stx ?? null ) !== 'piped'
480			) &&
481			// Relative links are not simple
482			!str_starts_with( $contentString, './' )
483		) {
484			// Strip colon escapes from the original target as that is
485			// stripped when deriving the content string.
486			// Strip ./ prefixes as well since they are relative link prefixes
487			// added to all titles.
488			$strippedTargetValue = preg_replace( '#^(:|\./)#', '', $target['value'], 1 );
489			$decodedTarget = Utils::decodeWtEntities( $strippedTargetValue );
490			// Deal with the protocol-relative link scenario as well
491			$hrefHasProto = preg_match( '#^(\w+:)?//#', $linkData->href );
492
493			// Normalize content string and decoded target before comparison.
494			// Piped links don't come down this path => it is safe to normalize both.
495			$contentString = str_replace( '_', ' ', $contentString );
496			$decodedTarget = str_replace( '_', ' ', $decodedTarget );
497
498			// See if the (normalized) content matches the
499			// target, either shadowed or actual.
500			$canUseSimple =
501				$contentString === $decodedTarget ||
502				// try wrapped in forward slashes in case they were stripped
503				( '/' . $contentString . '/' ) === $decodedTarget ||
504				// normalize as titles and compare
505				$env->normalizedTitleKey( $contentString, true )
506					=== preg_replace( self::$MW_TITLE_WHITESPACE_RE, '_', $decodedTarget ) ||
507				// Relative link
508				(
509					(
510						$env->getSiteConfig()->namespaceHasSubpages( $env->getPageConfig()->getNs() ) &&
511						preg_match( '#^\.\./.*[^/]$#D', $strippedTargetValue ) &&
512						$contentString === $env->resolveTitle( $strippedTargetValue )
513					) ||
514					(
515						preg_match( '#^\.\./.*?/$#D', $strippedTargetValue ) &&
516						$contentString === preg_replace( '#^(?:\.\./)+(.*?)/$#D', '$1', $strippedTargetValue, 1 )
517					)
518				) ||
519				// if content == href this could be a simple link... eg [[Foo]].
520				// but if href is an absolute url with protocol, this won't
521				// work: [[http://example.com]] is not a valid simple link!
522				(
523					!$hrefHasProto &&
524					// Always compare against decoded uri because
525					// <a rel="mw:WikiLink" href="7%25 Solution">7%25 Solution</a></p>
526					// should serialize as [[7% Solution|7%25 Solution]]
527					(
528						$contentString === Utils::decodeURIComponent( $linkData->href ) ||
529						// normalize with underscores for comparison with href
530						$env->normalizedTitleKey( $contentString, true )
531							=== Utils::decodeURIComponent( $linkData->href )
532					)
533				);
534		}
535
536		return $canUseSimple;
537	}
538
539	/**
540	 * Serialize as wiki link
541	 * @param Element $node
542	 * @param SerializerState $state
543	 * @param stdClass $linkData
544	 */
	private static function serializeAsWikiLink(
		Element $node, SerializerState $state, stdClass $linkData
	): void {
		// Emits $node as wikilink syntax via $state->emitChunk(). Depending on
		// the analysis below the output is a redirect, a simple link
		// ([[Target]]), a bare URL link, a piped link ([[Target|content]]) or,
		// for invalid targets, just the link text.
		$contentParts = null;
		$contentSrc = '';
		$isPiped = false;
		$requiresEscaping = true;
		$env = $state->getEnv();
		$siteConfig = $env->getSiteConfig();
		// Local copy of the target array: edits below do not touch $linkData->target
		$target = $linkData->target;
		$dp = DOMDataUtils::getDataParsoid( $node );

		// Decode any link that did not come from the source (data-mw/parsoid)
		// Links that come from data-mw/data-parsoid will be true titles,
		// but links that come from hrefs will need to be url-decoded.
		// Ex: <a href="/wiki/A%3Fb">Foobar</a>
		if ( empty( $target['fromsrc'] ) ) {
			// Omit fragments from decoding
			$hash = strpos( $target['value'], '#' );
			if ( $hash !== false ) {
				$target['value'] = Utils::decodeURIComponent( substr( $target['value'], 0, $hash ) )
					. substr( $target['value'], $hash );
			} else {
				$target['value'] = Utils::decodeURIComponent( $target['value'] );
			}
		}

		// Special-case handling for category links
		if ( $linkData->type === 'mw:PageProp/Category' ) {
			// Split target and sort key in $target['value'].
			// The sort key shows up as "#something" in there.
			// However, watch out for parser functions that start with "{{#"
			// The atomic group is essential to prevent "{{#" parser function prefix
			// from getting split at the "{{" and "#" where the "{{" matches the
			// [^#]* and the "#" matches after separately.
			if ( preg_match( '/^((?>{{#|[^#])*)#(.*)/', $target['value'], $targetParts ) ) {
				$target['value'] = strtr( preg_replace( '#^(\.\.?/)*#', '', $targetParts[1], 1 ), '_', ' ' );
				// FIXME: Reverse `Sanitizer.sanitizeTitleURI(strContent).replace(/#/g, '%23');`
				$strContent = Utils::decodeURIComponent( $targetParts[2] );
				$contentParts = self::splitLinkContentString( $strContent, $dp );
				$linkData->content->string = $contentParts->contentString;
				$dp->tail = $linkData->tail = $contentParts->tail;
				$dp->prefix = $linkData->prefix = $contentParts->prefix;
			} else { // No sort key, will serialize to simple link
				// Normalize the content string
				$linkData->content->string = strtr(
					PHPUtils::stripPrefix( $target['value'], './' ), '_', ' '
				);
			}

			// Special-case handling for template-affected sort keys
			// FIXME: sort keys cannot be modified yet, but if they are,
			// we need to fully shadow the sort key.
			// if ( !target.modified ) {
			// The target and source key was not modified
			$sortKeySrc = $state->serializer->serializedAttrVal( $node, 'mw:sortKey' );
			if ( isset( $sortKeySrc['value'] ) ) {
				$linkData->contentNode = null;
				$linkData->content->string = $sortKeySrc['value'];
				// TODO: generalize this flag. It is already used by
				// getAttributeShadowInfo. Maybe use the same
				// structure as its return value?
				$linkData->content->fromsrc = true;
			}
			// }
		} else {
			if ( $linkData->type === 'mw:PageProp/Language' ) {
				// Fix up the content string
				// TODO: see if linkData can be cleaner!
				if ( !isset( $linkData->content->string ) ) {
					$linkData->content->string = Utils::decodeWtEntities( $target['value'] );
				}
			}
		}

		// $linkTarget: the wikitext target that goes between the "[[" and "]]".
		// $escapedTgt: result of escapeLinkTarget() when the target had to be
		// (re)escaped; stays null when the source target is reused as-is.
		$linkTarget = null;
		$escapedTgt = null;
		if ( !empty( $linkData->isRedirect ) ) {
			$linkTarget = $target['value'];
			if ( !empty( $target['modified'] ) || empty( $target['fromsrc'] ) ) {
				$linkTarget = strtr( preg_replace( '#^(\.\.?/)*#', '', $linkTarget, 1 ), '_', ' ' );
				$escapedTgt = self::escapeLinkTarget( $linkTarget, $state );
				$linkTarget = $escapedTgt->linkTarget;
				// Determine if it's a redirect to a category, in which case
				// it needs a ':' on front to distinguish from a category link.
				if ( preg_match( '/^([^:]+)[:]/', $linkTarget, $categoryMatch ) ) {
					$ns = $siteConfig->namespaceId( Utils::normalizeNamespaceName( $categoryMatch[1] ) );
					if ( $ns === $siteConfig->canonicalNamespaceId( 'category' ) ) {
						// Check that the next node isn't a category link,
						// in which case we don't want the ':'.
						$nextNode = $node->nextSibling;
						if ( !(
							$nextNode && $nextNode instanceof Element && DOMCompat::nodeName( $nextNode ) === 'link' &&
							$nextNode->getAttribute( 'rel' ) === 'mw:PageProp/Category' &&
							$nextNode->getAttribute( 'href' ) === $node->getAttribute( 'href' )
						) ) {
							$linkTarget = ':' . $linkTarget;
						}
					}
				}
			}
		} elseif ( self::isSimpleWikiLink( $env, $dp, $target, $linkData ) ) {
			// Simple case
			if ( empty( $target['modified'] ) && empty( $linkData->contentModified ) ) {
				$linkTarget = PHPUtils::stripPrefix( $target['value'], './' );
			} else {
				// If token has templated attrs or is a subpage, use target.value
				// since content string will be drastically different.
				if ( WTUtils::hasExpandedAttrsType( $node ) ||
					preg_match( '#(^|/)\.\./#', $target['value'] )
				) {
					$linkTarget = PHPUtils::stripPrefix( $target['value'], './' );
				} else {
					// Otherwise the (escaped) content string becomes the target
					$escapedTgt = self::escapeLinkTarget( $linkData->content->string, $state );
					if ( !$escapedTgt->invalidLink ) {
						$linkTarget = self::addColonEscape( $env, $escapedTgt->linkTarget, $linkData );
					} else {
						$linkTarget = $escapedTgt->linkTarget;
					}
				}
				if ( !empty( $linkData->isInterwikiLang ) &&
					$linkTarget[0] !== ':' &&
					$linkData->type !== 'mw:PageProp/Language'
				) {
					// ensure interwiki links can't be confused with
					// interlanguage links.
					$linkTarget = ':' . $linkTarget;
				}
			}
		} elseif ( self::isURLLink( $state->getEnv(), $node, $linkData )
			/* && empty( $linkData->isInterwiki ) */
		) {
			// Uncomment the above check if we want [[wikipedia:Foo|http://en.wikipedia.org/wiki/Foo]]
			// for '<a href="http://en.wikipedia.org/wiki/Foo">http://en.wikipedia.org/wiki/Foo</a>'
			$linkData->linkType = 'mw:URLLink';
		} else {
			// Emit piped wikilink syntax
			$isPiped = true;

			// First get the content source
			if ( !empty( $linkData->contentNode ) ) {
				$cs = $state->serializeLinkChildrenToString(
					$linkData->contentNode,
					[ $state->serializer->wteHandlers, 'wikilinkHandler' ]
				);
				// strip off the tail and handle the pipe trick
				$contentParts = self::splitLinkContentString( $cs, $dp );
				$contentSrc = $contentParts->contentString;
				$dp->tail = $contentParts->tail;
				$linkData->tail = $contentParts->tail;
				$dp->prefix = $contentParts->prefix;
				$linkData->prefix = $contentParts->prefix;
				// Serialized children are not escaped a second time
				$requiresEscaping = false;
			} else {
				$contentSrc = $linkData->content->string ?? '';
				// Content taken verbatim from source needs no escaping
				$requiresEscaping = empty( $linkData->content->fromsrc );
			}

			if ( $contentSrc === '' && $linkData->type !== 'mw:PageProp/Category' ) {
				// Protect empty link content from PST pipe trick
				$contentSrc = '<nowiki/>';
				$requiresEscaping = false;
			}

			$linkTarget = $target['value'];
			if ( !empty( $target['modified'] ) || empty( $target['fromsrc'] ) ) {
				// Links starting with ./ shouldn't get _ replaced with ' '
				$linkContentIsRelative = str_starts_with( $linkData->content->string ?? '', './' );
				$linkTarget = preg_replace( '#^(\.\.?/)*#', '', $linkTarget, 1 );
				if ( empty( $linkData->isInterwiki ) && !$linkContentIsRelative ) {
					$linkTarget = strtr( $linkTarget, '_', ' ' );
				}
				$escapedTgt = self::escapeLinkTarget( $linkTarget, $state );
				$linkTarget = $escapedTgt->linkTarget;
			}

			// If we are reusing the target from source, we don't
			// need to worry about colon-escaping because it will
			// be in the right form already.
			//
			// Trying to eliminate this check and always check for
			// colon-escaping seems a bit tricky when the reused
			// target has encoded entities that won't resolve to
			// valid titles.
			if ( ( !$escapedTgt || !$escapedTgt->invalidLink ) && empty( $target['fromsrc'] ) ) {
				$linkTarget = self::addColonEscape( $env, $linkTarget, $linkData );
			}
		}
		if ( $linkData->linkType === 'mw:URLLink' ) {
			// Bare URL: emit the node's text content and nothing else
			$state->emitChunk( new AutoURLLinkText( $node->textContent, $node ), $node );
			return;
		}

		if ( !empty( $linkData->isRedirect ) ) {
			// Drop duplicates
			if ( $state->redirectText !== null ) {
				return;
			}

			// Buffer redirect text if it is not in start of file position
			if ( !preg_match( self::$REDIRECT_TEST_RE, $state->out . $state->currLine->text ) ) {
				$state->redirectText = $linkData->prefix . '[[' . $linkTarget . ']]';
				$state->emitChunk( '', $node ); // Flush separators for this node
				return;
			}

			// Set to some non-null string
			$state->redirectText = 'unbuffered';
		}

		$pipedText = null;
		if ( $escapedTgt && $escapedTgt->invalidLink ) {
			// If the link target was invalid, instead of emitting an invalid link,
			// omit the link and serialize just the content instead. But, log the
			// invalid html for Parsoid clients to investigate later.
			$state->getEnv()->log(
				'error/html2wt/link', 'Bad title text', DOMCompat::getOuterHTML( $node )
			);

			// For non-piped content, use the original invalid link text
			$pipedText = $isPiped ? $contentSrc : $linkTarget;
			$state->escapeText = $requiresEscaping;
			$state->emitChunk( $linkData->prefix . $pipedText . $linkData->tail, $node );
		} else {
			if ( $isPiped && $requiresEscaping ) {
				// We are definitely not in sol context since content
				// will be preceded by "[[" or "[" text in target wikitext.
				$pipedText = '|' . $state->serializer->wteHandlers
					->escapeLinkContent( $state, $contentSrc, false, $node, false );
			} elseif ( $isPiped ) {
				$pipedText = '|' . $contentSrc;
			} else {
				$pipedText = '';
			}
			// For piped links, singleLineContext is disabled around the emit
			// and restored (popped) right after it.
			if ( $isPiped ) {
				$state->singleLineContext->disable();
			}
			$state->emitChunk( new WikiLinkText(
				$linkData->prefix . '[[' . $linkTarget . $pipedText . ']]' . $linkData->tail,
				$node, $siteConfig, $linkData->type
			), $node );
			if ( $isPiped ) {
				$state->singleLineContext->pop();
			}
		}
	}
793
794	/**
795	 * Serialize as external link
796	 * @param Element $node
797	 * @param SerializerState $state
798	 * @param stdClass $linkData
799	 */
800	private static function serializeAsExtLink(
801		Element $node, SerializerState $state, stdClass $linkData
802	): void {
803		$target = $linkData->target;
804		$urlStr = $target['value'];
805		if ( !empty( $target['modified'] ) || empty( $target['fromsrc'] ) ) {
806			// We expect modified hrefs to be percent-encoded already, so
807			// don't need to encode them here any more. Unmodified hrefs are
808			// just using the original encoding anyway.
809			// BUT we do have to encode certain special wikitext
810			// characters (like []) which aren't necessarily
811			// percent-encoded because they are valid in URLs and HTML5
812			$urlStr = self::escapeExtLinkURL( $urlStr );
813		}
814
815		if ( self::isURLLink( $state->getEnv(), $node, $linkData ) ) {
816			// Serialize as URL link
817			$state->emitChunk( new AutoURLLinkText( $urlStr, $node ), $node );
818			return;
819		}
820
821		$siteConfig = $state->getEnv()->getSiteConfig();
822
823		// TODO: match vs. interwikis too
824		$magicLinkMatch = $siteConfig->getExtResourceURLPatternMatcher()(
825			Utils::decodeURI( $linkData->origHref )
826		);
827		$pureHashMatch = substr( $urlStr, 0, 1 ) === '#';
828		// Fully serialize the content
829		$contentStr = $state->serializeLinkChildrenToString(
830			$node,
831			[ $state->serializer->wteHandlers, $pureHashMatch ? 'wikilinkHandler' : 'aHandler' ]
832		);
833		// First check for ISBN/RFC/PMID links. We rely on selser to
834		// preserve non-minimal forms.
835		if ( $magicLinkMatch ) {
836			$serialized = $siteConfig->makeExtResourceURL(
837				$magicLinkMatch, $target['value'], $contentStr
838			);
839			if ( $serialized[0] === '[' ) {
840				// Serialization as a magic link failed (perhaps the
841				// content string wasn't appropriate).
842				$state->emitChunk(
843					( $magicLinkMatch[0] === 'ISBN' ) ?
844					new WikiLinkText( $serialized, $node, $siteConfig, 'mw:WikiLink' ) :
845					new ExtLinkText( $serialized, $node, $siteConfig, 'mw:ExtLink' ),
846					$node
847				);
848			} else {
849				$state->emitChunk( new MagicLinkText( $serialized, $node ), $node );
850			}
851			return;
852		} else { // There is an interwiki for RFCs, but strangely none for PMIDs.
853			// serialize as auto-numbered external link
854			// [http://example.com]
855			$linktext = null;
856			$class = null;
857			// If it's just anchor text, serialize as an internal link.
858			if ( $pureHashMatch ) {
859				$class = WikiLinkText::class;
860				$linktext = '[[' . $urlStr . ( ( $contentStr ) ? '|' . $contentStr : '' ) . ']]';
861			} else {
862				$class = ExtLinkText::class;
863				$linktext = '[' . $urlStr . ( ( $contentStr ) ? ' ' . $contentStr : '' ) . ']';
864			}
865			$state->emitChunk( new $class( $linktext, $node, $siteConfig, $linkData->type ), $node );
866			return;
867		}
868	}
869
870	/**
871	 * Main link handler.
872	 * @param SerializerState $state
873	 * @param Element $node
874	 */
875	public static function linkHandler( SerializerState $state, Element $node ): void {
876		// TODO: handle internal/external links etc using RDFa and dataAttribs
877		// Also convert unannotated html links without advanced attributes to
878		// external wiki links for html import. Might want to consider converting
879		// relative links without path component and file extension to wiki links.
880		$env = $state->getEnv();
881		$siteConfig = $env->getSiteConfig();
882
883		// Get the rt data from the token and tplAttrs
884		$linkData = self::getLinkRoundTripData( $env, $node, $state );
885		$linkType = $linkData->type;
886		if ( $siteConfig->getExtResourceURLPatternMatcher()( Utils::decodeURI( $linkData->origHref ) ) ) {
887			// Override the 'rel' type if this is a magic link
888			$linkType = 'mw:ExtLink';
889		}
890		if ( $linkType !== null && isset( $linkData->target['value'] ) ) {
891			// We have a type and target info
892			if ( $linkType === 'mw:WikiLink' || $linkType === 'mw:MediaLink' ||
893				preg_match( TokenUtils::SOL_TRANSPARENT_LINK_REGEX, $linkType )
894			) {
895				// [[..]] links: normal, category, redirect, or lang links
896				// (except images)
897				self::serializeAsWikiLink( $node, $state, $linkData );
898				return;
899			} elseif ( $linkType === 'mw:ExtLink' ) {
900				// [..] links, autolinks, ISBN, RFC, PMID
901				self::serializeAsExtLink( $node, $state, $linkData );
902				return;
903			} else {
904				throw new UnexpectedValueException(
905					'Unhandled link serialization scenario: ' . DOMCompat::getOuterHTML( $node )
906				);
907			}
908		} else {
909			$safeAttr = array_flip( [
910				'href', 'rel', 'class', 'title', DOMDataUtils::DATA_OBJECT_ATTR_NAME
911			] );
912
913			$isComplexLink = false;
914			foreach ( DOMCompat::attributes( $node ) as $attr ) {
915				// XXX: Don't drop rel and class in every case once a tags are
916				// actually supported in the MW default config?
917				if ( $attr->name && !isset( $safeAttr[$attr->name] ) ) {
918					$isComplexLink = true;
919					break;
920				}
921			}
922
923			if ( $isComplexLink ) {
924				$env->log( 'error/html2wt/link', 'Encountered', DOMCompat::getOuterHTML( $node ),
925					'-- serializing as extlink and dropping <a> attributes unsupported in wikitext.'
926				);
927			} else {
928				$media = DOMUtils::selectMediaElt( $node );  // TODO: Handle missing media too
929				$isFigure = ( $media instanceof Element && $media->parentNode === $node );
930				if ( $isFigure ) {
931					// this is a basic html figure: <a><img></a>
932					self::figureHandler( $state, $node, new MediaStructure( $media, $node ) );
933					return;
934				}
935			}
936
937			// href is already percent-encoded, etc., but it might contain
938			// spaces or other wikitext nasties.  escape the nasties.
939			$hrefStr = self::escapeExtLinkURL( self::getHref( $env, $node ) );
940			$handler = [ $state->serializer->wteHandlers, 'aHandler' ];
941			$str = $state->serializeLinkChildrenToString( $node, $handler );
942			$chunk = null;
943			if ( !$hrefStr ) {
944				// Without an href, we just emit the string as text.
945				// However, to preserve targets for anchor links,
946				// serialize as a span with a name.
947				if ( $node->hasAttribute( 'name' ) ) {
948					$name = $node->getAttribute( 'name' );
949					$doc = $node->ownerDocument;
950					$span = $doc->createElement( 'span' );
951					$span->setAttribute( 'name', $name );
952					$span->appendChild( $doc->createTextNode( $str ) );
953					$chunk = DOMCompat::getOuterHTML( $span );
954				} else {
955					$chunk = $str;
956				}
957			} else {
958				$chunk = new ExtLinkText( '[' . $hrefStr . ' ' . $str . ']',
959					$node, $siteConfig, 'mw:ExtLink'
960				);
961			}
962			$state->emitChunk( $chunk, $node );
963		}
964	}
965
966	/**
967	 * Main figure handler.
968	 *
969	 * @param SerializerState $state
970	 * @param Element $node
971	 * @param ?MediaStructure $ms
972	 */
973	public static function figureHandler(
974		SerializerState $state, Element $node, ?MediaStructure $ms
975	): void {
976		$env = $state->getEnv();
977
978		if ( !$ms ) {
979			$env->log(
980				'error/html2wt/figure',
981				"Couldn't parse media structure: ",
982				DOMCompat::getOuterHTML( $node )
983			);
984			$state->emitChunk( '', $node );
985			return;
986		}
987
988		$outerElt = $ms->containerElt ?? $ms->mediaElt;
989		$linkElt = $ms->linkElt;
990		$elt = $ms->mediaElt;
991		$captionElt = $ms->captionElt;
992
993		$format = WTSUtils::getMediaFormat( $outerElt );
994
995		// Try to identify the local title to use for this image.
996		$resource = $state->serializer->serializedImageAttrVal( $outerElt, $elt, 'resource' );
997		if ( !isset( $resource['value'] ) ) {
998			// from non-parsoid HTML: try to reconstruct resource from src?
999			// (this won't work for manual-thumb images)
1000			if ( !$elt->hasAttribute( 'src' ) ) {
1001				$env->log( 'error/html2wt/figure',
1002					'In WSP.figureHandler, img does not have resource or src:',
1003					DOMCompat::getOuterHTML( $node )
1004				);
1005				$state->emitChunk( '', $node );
1006				return;
1007			}
1008			$src = $elt->getAttribute( 'src' ) ?? '';
1009			if ( preg_match( '/^https?:/', $src ) ) {
1010				// external image link, presumably $wgAllowExternalImages=true
1011				$state->emitChunk( new AutoURLLinkText( $src, $node ), $node );
1012				return;
1013			}
1014			$resource = [
1015				'value' => $src,
1016				'fromsrc' => false,
1017				'modified' => false
1018			];
1019		}
1020		if ( empty( $resource['fromsrc'] ) ) {
1021			$resource['value'] = preg_replace( '#^(\.\.?/)+#', '', $resource['value'], 1 );
1022		}
1023
1024		$nopts = [];
1025		$outerDP = DOMDataUtils::getDataParsoid( $outerElt );
1026		$outerDMW = DOMDataUtils::getDataMw( $outerElt );
1027		$mwAliases = $state->getEnv()->getSiteConfig()->mwAliases();
1028
1029		// Return ref to the array element in case it is modified
1030		$getOpt = static function & ( $key ) use ( &$outerDP ): ?array {
1031			$null = null;
1032			if ( empty( $outerDP->optList ) ) {
1033				return $null;
1034			}
1035			foreach ( $outerDP->optList as $opt ) {
1036				if ( ( $opt['ck'] ?? null ) === $key ) {
1037					return $opt;
1038				}
1039			}
1040			return $null;
1041		};
1042		// Return ref to the array element in case it is modified
1043		$getLastOpt = static function & ( $key ) use ( &$outerDP ) : ?array {
1044			$null = null;
1045			$opts = $outerDP->optList ?? [];
1046			for ( $i = count( $opts ) - 1;  $i >= 0;  $i-- ) {
1047				if ( ( $opts[$i]['ck'] ?? null ) === $key ) {
1048					return $opts[$i];
1049				}
1050			}
1051			return $null;
1052		};
1053
1054		// Identify a page # to use.
1055		$page = null;
1056		$pageFromHref = preg_match(
1057			'#[?]page=(\d+)$#D',
1058			( $linkElt ? $linkElt->getAttribute( 'href' ) : null ) ?? '',
1059			$matches ) ? $matches[1] : null;
1060		$pageFromDataMw = WTSUtils::getAttrFromDataMw( $outerDMW, 'page', true );
1061		if ( $pageFromDataMw !== null ) {
1062			// FIXME: if $pageFromHref is null but $pageFromDataMw is
1063			// set, then we go ahead and serialize the page parameter
1064			// as unmodified.  This helps transition old RESTBase
1065			// content where the ?page suffix on the URL was missing,
1066			// but eventually $restBaseMigrationHack should be left
1067			// false always. (T259931)
1068			$restBaseMigrationHack =
1069				( $pageFromHref === null && $pageFromDataMw[1]->txt );
1070
1071			if (
1072				trim( $pageFromDataMw[1]->txt ) === $pageFromHref ||
1073				$restBaseMigrationHack
1074			) {
1075				$page = $state->serializer->getAttributeValueAsShadowInfo( $outerElt, 'page' );
1076				if ( !$page ) {
1077					$page = [
1078						'value' => $pageFromDataMw[1]->txt,
1079						'modified' => false,
1080						'fromsrc' => false,
1081						'fromDataMW' => true,
1082					];
1083				}
1084			}
1085		}
1086		if ( !$page && $pageFromHref !== null ) {
1087			$page = [
1088				'value' => $pageFromHref,
1089				'modified' => true,
1090				'fromsrc' => false,
1091				'fromDataMW' => false,
1092			];
1093		}
1094
1095		// Try to identify the local title to use for the link.
1096		$link = null;
1097
1098		$linkFromDataMw = WTSUtils::getAttrFromDataMw( $outerDMW, 'link', true );
1099		if ( $linkFromDataMw !== null ) {
1100			// "link" attribute on the `outerElt` takes precedence
1101			if ( isset( $linkFromDataMw[1]->html ) ) {
1102				$link = $state->serializer->getAttributeValueAsShadowInfo( $outerElt, 'link' );
1103			} else {
1104				$link = [
1105					'value' => "link={$linkFromDataMw[1]->txt}",
1106					'modified' => false,
1107					'fromsrc' => false,
1108					'fromDataMW' => true
1109				];
1110			}
1111		} elseif ( $linkElt && $linkElt->hasAttribute( 'href' ) ) {
1112			$link = $state->serializer->serializedImageAttrVal( $outerElt, $linkElt, 'href' );
1113			if ( empty( $link['fromsrc'] ) ) {
1114				// strip page parameter if present on href
1115				$strippedHref = preg_replace( '#[?]page=\d+$#D', '', $linkElt->getAttribute( 'href' ) ?? '' );
1116				if ( $strippedHref === $elt->getAttribute( 'resource' ) ) {
1117					// default link: same place as resource
1118					$link = $resource;
1119				}
1120				$link['value'] = preg_replace( '#^(\.\.?/)+#', '', $link['value'], 1 );
1121			}
1122		} else {
1123			// Otherwise, just try and get it from data-mw
1124			$link = $state->serializer->getAttributeValueAsShadowInfo( $outerElt, 'href' );
1125		}
1126
1127		if ( $link && empty( $link['modified'] ) && empty( $link['fromsrc'] ) ) {
1128			$linkOpt = $getOpt( 'link' );
1129			if ( $linkOpt ) {
1130				$link['fromsrc'] = true;
1131				$link['value'] = $linkOpt['ak'];
1132			}
1133		}
1134
1135		// Reconstruct the caption
1136		if ( !$captionElt && is_string( $outerDMW->caption ?? null ) ) {
1137			// IMPORTANT: Assign to a variable to prevent the fragment
1138			// from getting GCed before we are done with it.
1139			$fragment = ContentUtils::createAndLoadDocumentFragment(
1140				$outerElt->ownerDocument, $outerDMW->caption,
1141				[ 'markNew' => true ]
1142			);
1143			// FIXME: We should just be able to serialize the children of the
1144			// fragment, however, we need some way of marking this as being
1145			// inModifiedContent so that any bare text is assured to be escaped
1146			$captionElt = $outerElt->ownerDocument->createElement( 'div' );
1147			DOMDataUtils::getDataParsoid( $captionElt )->tmp->isNew = true;
1148			DOMUtils::migrateChildren( $fragment, $captionElt );
1149			// Needs a parent node in order for WTS to be happy
1150			$fragment->appendChild( $captionElt );
1151		}
1152
1153		$caption = null;
1154		if ( $captionElt ) {
1155			$caption = $state->serializeCaptionChildrenToString(
1156				$captionElt, [ $state->serializer->wteHandlers, 'mediaOptionHandler' ]
1157			);
1158		}
1159
1160		// Fetch the alt (if any)
1161		$alt = $state->serializer->serializedImageAttrVal( $outerElt, $elt, 'alt' );
1162		// Fetch the lang (if any)
1163		$lang = $state->serializer->serializedImageAttrVal( $outerElt, $elt, 'lang' );
1164
1165		// Ok, start assembling options, beginning with link & alt & lang
1166		// Other media don't have links in output.
1167		$linkCond = DOMCompat::nodeName( $elt ) === 'img';
1168		if ( $linkCond && $link ) {
1169			// Check whether the link goes to the default place, in which
1170			// case an explicit link tag isn't needed.
1171			// The link may be external, or may include wikitext template markup,
1172			// therefore check first that it parses to a title.
1173			$linkTitle = $env->normalizedTitleKey(
1174				Utils::decodeURIComponent( $link['value'] ), true
1175			);
1176			$resourceTitle = $env->normalizedTitleKey(
1177				Utils::decodeURIComponent( $resource['value'] ), true
1178			);
1179			if (
1180				$link['value'] === $resource['value'] ||
1181				( $linkTitle !== null && $linkTitle === $resourceTitle )
1182			) {
1183				$linkCond = false; // No explicit link attribute needed
1184			}
1185		}
1186
1187		// "alt" for non-image is handle below
1188		$altCond = $alt['value'] !== null && DOMCompat::nodeName( $elt ) === 'img';
1189
1190		// This loop handles media options which *mostly* correspond 1-1 with
1191		// HTML attributes.  `img_$name` is the name of the media option,
1192		// and $value is the Parsoid "shadow info" for the attribute.
1193		// $cond tells us whether we need to explicitly output this option;
1194		// if it is false we are using an implicit default.
1195		// `lang` and `alt` are fairly straightforward.  `link` and `page`
1196		// are a little trickier, since we need to massage/fake the shadow
1197		// info because they don't come *directly* from the attribute.
1198		// link comes from the combination of a[href], img[src], and
1199		// img[resource], etc; page comes from the query part of a[href] etc.
1200		foreach ( [
1201			[ 'name' => 'link', 'value' => $link, 'cond' => $linkCond ],
1202			[ 'name' => 'alt', 'value' => $alt, 'cond' => $altCond ],
1203			[ 'name' => 'page', 'value' => $page, 'cond' => isset( $page['value'] ) ],
1204			[ 'name' => 'lang', 'value' => $lang, 'cond' => isset( $lang['value'] ) ]
1205		] as $o ) {
1206			if ( !$o['cond'] ) {
1207				continue;
1208			}
1209			if ( $o['value'] && !empty( $o['value']['fromsrc'] ) ) {
1210				$nopts[] = [
1211					'ck' => $o['name'],
1212					'ak' => [ $o['value']['value'] ],
1213				];
1214			} else {
1215				$value = $o['value'] ? $o['value']['value'] : '';
1216				if ( $o['value'] && in_array( $o['name'], [ 'link', 'alt' ], true ) ) {
1217					// see WikiLinkHandler::isWikitextOpt(): link and alt are allowed
1218					// to contain arbitrary wikitext, even though it is stripped
1219					// to a string before emitting.
1220					$value = $state->serializer->wteHandlers->escapeLinkContent(
1221						$state, $value, false, $node, true
1222					);
1223				}
1224				$nopts[] = [
1225					'ck' => $o['name'],
1226					'v' => $value,
1227					'ak' => $mwAliases['img_' . $o['name']],
1228				];
1229			}
1230		}
1231
1232		// Now we handle media options which all come from space-separated
1233		// values in a single HTML attribute, `class`.  (But note that there
1234		// can also be "extra" classes added by `img_class` as well.)
1235		$classes = DOMCompat::getClassList( $outerElt );
1236		$extra = []; // 'extra' classes
1237		$val = null;
1238
1239		foreach ( $classes as $c ) {
1240			switch ( $c ) {
1241				case 'mw-halign-none':
1242				case 'mw-halign-right':
1243				case 'mw-halign-left':
1244				case 'mw-halign-center':
1245					$val = substr( $c, 10 ); // strip mw-halign- prefix
1246					$nopts[] = [
1247						'ck' => $val,
1248						'ak' => $mwAliases['img_' . $val],
1249					];
1250					break;
1251
1252				case 'mw-valign-top':
1253				case 'mw-valign-middle':
1254				case 'mw-valign-baseline':
1255				case 'mw-valign-sub':
1256				case 'mw-valign-super':
1257				case 'mw-valign-text-top':
1258				case 'mw-valign-bottom':
1259				case 'mw-valign-text-bottom':
1260					$val = strtr( substr( $c, 10 ), '-', '_' ); // strip mw-valign and '-' to '_'
1261					$nopts[] = [
1262						'ck' => $val,
1263						'ak' => $mwAliases['img_' . $val],
1264					];
1265					break;
1266
1267				case 'mw-image-border':
1268					$nopts[] = [
1269						'ck' => 'border',
1270						'ak' => $mwAliases['img_border'],
1271					];
1272					break;
1273
1274				case 'mw-default-size':
1275				case 'mw-default-audio-height':
1276					// handled below
1277					break;
1278
1279				default:
1280					$extra[] = $c;
1281					break;
1282			}
1283		}
1284
1285		if ( count( $extra ) ) {
1286			$nopts[] = [
1287				'ck' => 'class',
1288				'v' => implode( ' ', $extra ),
1289				'ak' => $mwAliases['img_class'],
1290			];
1291		}
1292
1293		// Now we handle parameters which don't have a representation
1294		// as HTML attributes; they are set only from the data-mw
1295		// values.  (In theory they could perhaps be reverse engineered
1296		// from the thumbnail URL, but that would be fragile and expose
1297		// thumbnail implementation to the editor so we don't do that.)
1298		$mwParams = [
1299			[ 'prop' => 'thumb', 'ck' => 'manualthumb', 'alias' => 'img_manualthumb' ],
1300			// mw:Video specific
1301			[ 'prop' => 'starttime', 'ck' => 'starttime', 'alias' => 'timedmedia_starttime' ],
1302			[ 'prop' => 'endtime', 'ck' => 'endtime', 'alias' => 'timedmedia_endtime' ],
1303			[ 'prop' => 'thumbtime', 'ck' => 'thumbtime', 'alias' => 'timedmedia_thumbtime' ]
1304		];
1305
1306		// `img_link` and `img_alt` are only surfaced as HTML attributes
1307		// for image media. For all other media we treat them as set only
1308		// from data-mw.
1309		if ( DOMCompat::nodeName( $elt ) !== 'img' ) {
1310			$mwParams[] = [ 'prop' => 'link', 'ck' => 'link', 'alias' => 'img_link' ];
1311			$mwParams[] = [ 'prop' => 'alt', 'ck' => 'alt', 'alias' => 'img_alt' ];
1312		}
1313
1314		foreach ( $mwParams as $o ) {
1315			$v = $outerDMW->{$o['prop']} ?? null;
1316			if ( $v === null ) {
1317				$a = WTSUtils::getAttrFromDataMw( $outerDMW, $o['ck'], true );
1318				if ( $a !== null && !isset( $a[1]->html ) ) {
1319					$v = $a[1]->txt;
1320				}
1321			}
1322			if ( $v !== null ) {
1323				$ak = $state->serializer->getAttributeValue(
1324					$outerElt, $o['ck']
1325				) ?? $mwAliases[$o['alias']];
1326				$nopts[] = [
1327					'ck' => $o['ck'],
1328					'ak' => $ak,
1329					'v' => $v
1330				];
1331				// Piggyback this here ...
1332				if ( $o['prop'] === 'thumb' ) {
1333					$format = '';
1334				}
1335			}
1336		}
1337
1338		// These media options come from the HTML `typeof` attribute.
1339		switch ( $format ) {
1340			case 'Thumb':
1341				$nopts[] = [
1342					'ck' => 'thumbnail',
1343					'ak' => $state->serializer->getAttributeValue(
1344						$outerElt, 'thumbnail'
1345					) ?? $mwAliases['img_thumbnail'],
1346				];
1347				break;
1348			case 'Frame':
1349				$nopts[] = [
1350					'ck' => 'framed',
1351					'ak' => $state->serializer->getAttributeValue(
1352						$outerElt, 'framed'
1353					) ?? $mwAliases['img_framed'],
1354				];
1355				break;
1356			case 'Frameless':
1357				$nopts[] = [
1358					'ck' => 'frameless',
1359					'ak' => $state->serializer->getAttributeValue(
1360						$outerElt, 'frameless'
1361					) ?? $mwAliases['img_frameless'],
1362				];
1363				break;
1364		}
1365
1366		// Now handle the size-related options.  This is complicated!
1367		// We consider the `height`, `data-height`, `width`, and
1368		// `data-width` attributes, as well as the `typeof` and the `class`.
1369
1370		// Get the user-specified height from wikitext
1371		$wh = $state->serializer->serializedImageAttrVal(
1372			$outerElt, $elt, $ms->isRedLink() ? 'data-height' : 'height'
1373		);
1374		// Get the user-specified width from wikitext
1375		$ww = $state->serializer->serializedImageAttrVal(
1376			$outerElt, $elt, $ms->isRedLink() ? 'data-width' : 'width'
1377		);
1378
1379		$sizeUnmodified = !empty( $ww['fromDataMW'] ) ||
1380			( empty( $ww['modified'] ) && empty( $wh['modified'] ) );
1381		$upright = $getOpt( 'upright' );
1382
1383		// XXX: Infer upright factor from default size for all thumbs by default?
1384		// Better for scaling with user prefs, but requires knowledge about
1385		// default used in VE.
1386		if ( $sizeUnmodified && $upright &&
1387			// Only serialize upright where it is actually respected
1388			// This causes some dirty diffs, but makes sure that we don't
1389			// produce nonsensical output after a type switch.
1390			// TODO: Only strip if type was actually modified.
1391			in_array( $format, [ 'Frameless', 'Thumb' ], true )
1392		) {
1393			// preserve upright option
1394			$nopts[] = [
1395				'ck' => $upright['ck'],
1396				'ak' => [ $upright['ak'] ],
1397			];
1398		}// FIXME: don't use ak here!
1399
1400		if ( !( DOMCompat::getClassList( $outerElt )->contains( 'mw-default-size' ) ) ) {
1401			$size = $getLastOpt( 'width' );
1402			$sizeString = (string)( $size['ak'] ?? '' );
1403			if ( $sizeString === '' && !empty( $ww['fromDataMW'] ) ) {
1404				$sizeString = (string)( $ww['value'] ?? '' );
1405			}
1406			if ( $sizeUnmodified && $sizeString !== '' ) {
1407				// preserve original width/height string if not touched
1408				$nopts[] = [
1409					'ck' => 'width',
1410					'v' => $sizeString, // original size string
1411					'ak' => [ '$1' ]
1412				];
1413			} else { // don't add px or the like
1414				$bbox = null;
1415				// Serialize to a square bounding box
1416				if ( isset( $ww['value'] ) && preg_match( '/^\d+/', $ww['value'] ) ) {
1417					$bbox = intval( $ww['value'] );
1418				}
1419				if ( isset( $wh['value'] ) && preg_match( '/^\d+/', $wh['value'] ) &&
1420					// As with "mw-default-size", editing clients should remove the
1421					// "mw-default-audio-height" if they want to factor a defined
1422					// height into the bounding box size.  However, note that, at
1423					// present, a defined height for audio is ignored while parsing,
1424					// so this only has the effect of modifying the width.
1425					(
1426						DOMCompat::nodeName( $elt ) !== 'audio' ||
1427						!DOMCompat::getClassList( $outerElt )->contains( 'mw-default-audio-height' )
1428					)
1429				) {
1430					$height = intval( $wh['value'] );
1431					if ( $bbox === null || $height > $bbox ) {
1432						$bbox = $height;
1433					}
1434				}
1435				if ( $bbox !== null ) {
1436					$nopts[] = [
1437						'ck' => 'width',
1438						// MediaWiki interprets 100px as a width
1439						// restriction only, so we need to make the bounding
1440						// box explicitly square (100x100px). The 'px' is
1441						// added by the alias though, and can be localized.
1442						'v' => $bbox . 'x' . $bbox,
1443						'ak' => $mwAliases['img_width'],
1444					];
1445				}
1446			}
1447		}// adds the 'px' suffix
1448
1449		$opts = $outerDP->optList ?? []; // original wikitext options
1450
1451		// Add bogus options from old optlist in order to round-trip cleanly (T64500)
1452		foreach ( $opts as $o ) {
1453			if ( ( $o['ck'] ?? null ) === 'bogus' ) {
1454				$nopts[] = [
1455					'ck' => 'bogus',
1456					'ak' => [ $o['ak'] ],
1457				];
1458			}
1459		}
1460
1461		// Put the caption last, by default.
1462		if ( is_string( $caption ) ) {
1463			$nopts[] = [
1464				'ck' => 'caption',
1465				'ak' => [ $caption ],
1466			];
1467		}
1468
1469		// ok, sort the new options to match the order given in the old optlist
1470		// and try to match up the aliases used
1471		$changed = false;
1472		foreach ( $nopts as &$no ) {
1473			// Make sure we have an array here. Default in data-parsoid is
1474			// actually a string.
1475			// FIXME: don't reuse ak for two different things!
1476			if ( !is_array( $no['ak'] ) ) {
1477				$no['ak'] = [ $no['ak'] ];
1478			}
1479
1480			$no['sortId'] = count( $opts );
1481			$idx = -1;
1482			foreach ( $opts as $i => $o ) {
1483				if ( ( $o['ck'] ?? null ) === $no['ck'] &&
1484					// for bogus options, make sure the source matches too.
1485					( $o['ck'] !== 'bogus' || $o['ak'] === $no['ak'][0] )
1486				) {
1487					$idx = $i;
1488					break;
1489				}
1490			}
1491			if ( $idx < 0 ) {
1492				// Preferred words are first in the alias list
1493				// (but not in old versions of mediawiki).
1494				$no['ak'] = $no['ak'][0];
1495				$changed = true;
1496				continue;
1497			}
1498
1499			$no['sortId'] = $idx;
1500			// use a matching alias, if there is one
1501			$a = null;
1502			foreach ( $no['ak'] as $b ) {
1503				// note the trim() here; that allows us to snarf eccentric
1504				// whitespace from the original option wikitext
1505				$b2 = $b;
1506				if ( isset( $no['v'] ) ) {
1507					$b2 = str_replace( '$1', $no['v'], $b );
1508				}
1509				if ( $b2 === trim( implode( ',', (array)$opts[$idx]['ak'] ) ) ) {
1510					$a = $b;
1511					break;
1512				}
1513			}
1514			// use the alias (incl whitespace) from the original option wikitext
1515			// if found; otherwise use the last alias given (English default by
1516			// convention that works everywhere).
1517			// TODO: use first alias (localized) instead for RTL languages (T53852)
1518			if ( $a !== null && $no['ck'] !== 'caption' ) {
1519				$no['ak'] = $opts[$idx]['ak'];
1520				unset( $no['v'] ); // prevent double substitution
1521			} else {
1522				$no['ak'] = PHPUtils::lastItem( $no['ak'] );
1523				if ( !( $no['ck'] === 'caption' && $a !== null ) ) {
1524					$changed = true;
1525				}
1526			}
1527		}
1528
1529		// Filter out bogus options if the image options/caption have changed.
1530		if ( $changed ) {
1531			$nopts = array_filter( $nopts, static function ( $no ) {
1532				return $no['ck'] !== 'bogus';
1533			} );
1534			// empty captions should get filtered out in this case, too (T64264)
1535			$nopts = array_filter( $nopts, static function ( $no ) {
1536				return !( $no['ck'] === 'caption' && $no['ak'] === '' );
1537			} );
1538		}
1539
1540		// sort!
1541		usort( $nopts, static function ( $a, $b ) {
1542			return $a['sortId'] <=> $b['sortId'];
1543		} );
1544
1545		// emit all the options as wikitext!
1546		$wikitext = '[[' . $resource['value'];
1547		foreach ( $nopts as $o ) {
1548			$wikitext .= '|';
1549			if ( isset( $o['v'] ) ) {
1550				$wikitext .= str_replace( '$1', $o['v'], $o['ak'] );
1551			} else {
1552				$wikitext .= $o['ak'];
1553			}
1554		}
1555		$wikitext .= ']]';
1556
1557		$state->emitChunk( new WikiLinkText(
1558			$wikitext, $node, $state->getEnv()->getSiteConfig(),
1559			// FIXME: Does this matter? Emit a constant for now, it'll all
1560			// be same in the follow up patch to consolidate the types
1561			'mw:Image'
1562		), $node );
1563	}
1564
1565}
1566