<?php

namespace Wikimedia\Parsoid\Html2Wt;

use stdClass;
use UnexpectedValueException;
use Wikimedia\Parsoid\Config\Env;
use Wikimedia\Parsoid\Core\MediaStructure;
use Wikimedia\Parsoid\DOM\Element;
use Wikimedia\Parsoid\DOM\Node;
use Wikimedia\Parsoid\Html2Wt\ConstrainedText\AutoURLLinkText;
use Wikimedia\Parsoid\Html2Wt\ConstrainedText\ExtLinkText;
use Wikimedia\Parsoid\Html2Wt\ConstrainedText\MagicLinkText;
use Wikimedia\Parsoid\Html2Wt\ConstrainedText\WikiLinkText;
use Wikimedia\Parsoid\Utils\ContentUtils;
use Wikimedia\Parsoid\Utils\DOMCompat;
use Wikimedia\Parsoid\Utils\DOMDataUtils;
use Wikimedia\Parsoid\Utils\DOMUtils;
use Wikimedia\Parsoid\Utils\PHPUtils;
use Wikimedia\Parsoid\Utils\TokenUtils;
use Wikimedia\Parsoid\Utils\UrlUtils;
use Wikimedia\Parsoid\Utils\Utils;
use Wikimedia\Parsoid\Utils\WTUtils;

/**
 * Serializes link markup.
 */
class LinkHandlerUtils {
	private static $REDIRECT_TEST_RE = '/^([ \t\n\r\0\x0b])*$/D';
	private static $MW_TITLE_WHITESPACE_RE
		= '/[ _\xA0\x{1680}\x{180E}\x{2000}-\x{200A}\x{2028}\x{2029}\x{202F}\x{205F}\x{3000}]+/u';

	/**
	 * Peel a recorded link tail and link prefix off a content string.
	 *
	 * The tail is checked (and removed) first; the prefix is then checked
	 * against whatever is left. A piece that does not match the content is
	 * reported back as the empty string.
	 *
	 * @param string $contentString
	 * @param stdClass $dp Containing ->prefix and ->tail
	 * @return stdClass With ->contentString, ->tail and ->prefix
	 */
	private static function splitLinkContentString( string $contentString, stdClass $dp ): stdClass {
		$trail = $dp->tail ?? '';
		if ( $trail !== '' && str_ends_with( $contentString, $trail ) ) {
			// strip the tail off the content
			$contentString = substr( $contentString, 0, -strlen( $trail ) );
		} else {
			$trail = '';
		}

		$lead = $dp->prefix ?? '';
		if ( $lead !== '' && str_starts_with( $contentString, $lead ) ) {
			$contentString = substr( $contentString, strlen( $lead ) );
		} else {
			$lead = '';
		}

		$parts = new stdClass;
		$parts->contentString = $contentString;
		$parts->tail = $trail;
		$parts->prefix = $lead;
		return $parts;
	}

	/**
	 * Helper function for munging protocol-less absolute URLs:
	 * If this URL is absolute, but doesn't contain a protocol,
	 * try to find a localinterwiki protocol that would work.
	 *
	 * @param Env $env
	 * @param Element $node
	 * @return string The expanded URL when a local interwiki base matches,
	 *   otherwise the href attribute as found ('' when absent)
	 */
	private static function getHref( Env $env, Element $node ): string {
		$href = $node->getAttribute( 'href' ) ?? '';

		// Only "/path" needs work; "//host/path" and everything else is returned as is.
		$isProtocolLessAbsolute = ( $href[0] ?? '' ) === '/' && ( $href[1] ?? '' ) !== '/';
		if ( !$isProtocolLessAbsolute ) {
			return $href;
		}

		// protocol-less but absolute. let's find a base href
		foreach ( $env->getSiteConfig()->interwikiMap() as $interwikiInfo ) {
			if ( !isset( $interwikiInfo['localinterwiki'], $interwikiInfo['url'] ) ) {
				continue;
			}
			$base = $interwikiInfo['url'];

			// evaluate the url relative to this base
			$expanded = UrlUtils::expandUrl( $href, $base );

			// can this match the pattern?
			$basePattern = '/^' . strtr( preg_quote( $base, '/' ), [ '\\$1' => '.*' ] ) . '$/sD';
			if ( preg_match( $basePattern, $expanded ) ) {
				return $expanded;
			}
		}

		return $href;
	}

	/**
	 * Normalize an interwiki prefix (?)
* @param string $str Raw prefix text
	 * @return string Lower-cased, trimmed prefix passed through
	 *   PHPUtils::stripPrefix() with ':'
	 */
	private static function normalizeIWP( string $str ): string {
		$lowered = strtolower( $str );
		$trimmed = trim( $lowered );
		return PHPUtils::stripPrefix( $trimmed, ':' );
	}

	/**
	 * Entity-escape a link target and report whether the result is usable
	 * as a link target.
	 *
	 * @param string $linkTarget
	 * @param SerializerState $state
	 * @return stdClass With ->linkTarget (the escaped target) and
	 *   ->invalidLink (bool)
	 */
	private static function escapeLinkTarget( string $linkTarget, SerializerState $state ): stdClass {
		// Entity-escape the content.
		$escaped = Utils::escapeWtEntities( $linkTarget );

		// Is this an invalid link?
		$rejectedByEnv = !$state->getEnv()->isValidLinkTarget( $escaped );
		// `isValidLinkTarget` omits fragments (the part after #) so,
		// even though "|" is an invalid character, we still need to ensure
		// it doesn't appear in there. The percent encoded version is fine
		// in the fragment, since it won't break the parse.
		$hasPipe = str_contains( $escaped, '|' );

		$result = new stdClass;
		$result->linkTarget = $escaped;
		$result->invalidLink = $rejectedByEnv || $hasPipe;
		return $result;
	}

	/**
	 * Get the plain text content of the node, if it can be represented as such
	 *
	 * NOTE: This function seems a little inconsistent about what's considered
	 * null and what's an empty string. For example, no children is null
	 * but a single diffMarker gets a string? One of the current callers
	 * seems to subtly depend on that though.
	 *
	 * FIXME(T254501): This function can return `$node->textContent` instead
	 * of the string concatenation once mw:DisplaySpace is preprocessed away.
*
	 * @param Node $node
	 * @return ?string Null when the node has no children, or has a child that
	 *   is neither a text node, an mw:DisplaySpace element, nor a diff marker
	 */
	private static function getContentString( Node $node ): ?string {
		if ( !$node->hasChildNodes() ) {
			return null;
		}
		$contentString = '';
		$child = $node->firstChild;
		while ( $child ) {
			if ( DOMUtils::isText( $child ) ) {
				$contentString .= $child->nodeValue;
			} elseif ( DOMUtils::hasTypeOf( $child, 'mw:DisplaySpace' ) ) {
				$contentString .= ' ';
			} elseif ( DOMUtils::isDiffMarker( $child ) ) {
				// Diff markers are skipped: they contribute no text.
			} else {
				return null;
			}
			$child = $child->nextSibling;
		}
		return $contentString;
	}

	/**
	 * Helper function for getting RT data from the tokens
	 *
	 * The returned object always carries: type, href, origHref, target, tail,
	 * prefix, linkType and content. Depending on the node it may additionally
	 * carry contentModified, contentNode, isRedirect, isInterwiki,
	 * isInterwikiLang and isLocal.
	 *
	 * @param Env $env
	 * @param Element $node
	 * @param SerializerState $state
	 * @return stdClass
	 */
	private static function getLinkRoundTripData(
		Env $env, Element $node, SerializerState $state
	): stdClass {
		$dp = DOMDataUtils::getDataParsoid( $node );
		$siteConfig = $env->getSiteConfig();
		$rtData = (object)[
			'type' => null, // could be null
			'href' => null, // filled in below
			'origHref' => null, // filled in below
			'target' => null, // filled in below
			'tail' => $dp->tail ?? '',
			'prefix' => $dp->prefix ?? '',
			'linkType' => null
		];
		$rtData->content = new stdClass;

		// Figure out the type of the link
		if ( $node->hasAttribute( 'rel' ) ) {
			$rel = $node->getAttribute( 'rel' ) ?? '';
			// Parsoid only emits and recognizes ExtLink, WikiLink, and PageProp rel values.
			// Everything else defaults to ExtLink during serialization (unless it is
			// serializable to a wikilink)
			if ( preg_match( '/\b(mw:(WikiLink|ExtLink|MediaLink|PageProp)[^\s]*)\b/', $rel, $typeMatch ) ) {
				$rtData->type = $typeMatch[1];
				// Strip link subtype info
				if ( $typeMatch[2] === 'WikiLink' || $typeMatch[2] === 'ExtLink' ) {
					$rtData->type = 'mw:' . $typeMatch[2];
				}
			}
		}

		// Default link type if nothing else is set
		if ( $rtData->type === null && !DOMUtils::selectMediaElt( $node ) ) {
			$rtData->type = 'mw:ExtLink';
		}

		// Get href, and save the token's "real" href for comparison
		$href = self::getHref( $env, $node );
		$rtData->origHref = $href;
		$rtData->href = preg_replace( '#^(\.\.?/)+#', '', $href, 1 );

		// WikiLinks should be relative (but see below); fixup the link type
		// if a WikiLink has an absolute URL.
		// (This may get converted back to a WikiLink below, in the interwiki
		// handling code.)
		if ( $rtData->type === 'mw:WikiLink' &&
			( preg_match( '#^(\w+:)?//#', $rtData->href ) ||
				substr( $rtData->origHref ?? '', 0, 1 ) === '/' )
		) {
			$rtData->type = 'mw:ExtLink';
		}

		// Now get the target from rt data
		$rtData->target = $state->serializer->serializedAttrVal( $node, 'href' );

		// Check if the link content has been modified or is newly inserted content.
		// FIXME: This will only work with selser of course. Hard to test without selser.
		if ( $state->inModifiedContent ||
			DiffUtils::hasDiffMark( $node, $env, 'subtree-changed' )
		) {
			$rtData->contentModified = true;
		}

		// Get the content string or tokens
		$contentString = self::getContentString( $node );
		if ( $contentString !== null ) {
			if ( !empty( $rtData->target['value'] ) && $rtData->target['value'] !== $contentString ) {
				// Try to identify a new potential tail
				$contentParts = self::splitLinkContentString( $contentString, $dp );
				$rtData->content->string = $contentParts->contentString;
				$rtData->tail = $contentParts->tail;
				$rtData->prefix = $contentParts->prefix;
			} else {
				$rtData->tail = '';
				$rtData->prefix = '';
				$rtData->content->string = $contentString;
			}
		} elseif ( $node->hasChildNodes() ) {
			$rtData->contentNode = $node;
		} elseif ( $rtData->type === 'mw:PageProp/redirect' ) {
			$rtData->isRedirect = true;
			$rtData->prefix = $dp->src
				?? ( ( $siteConfig->mwAliases()['redirect'][0] ?? '#REDIRECT' ) . ' ' );
		}

		// Update link type based on additional analysis.
		// What might look like external links might be serializable as a wikilink.
		// $target aliases $rtData->target: writes below update the returned object.
		$target = &$rtData->target;

		// mw:MediaLink annotations are considered authoritative
		// and interwiki link matches aren't made for these
		if ( $rtData->type === 'mw:MediaLink' ) {
			// Parse title from resource attribute (see analog in image handling)
			$resource = $state->serializer->serializedAttrVal( $node, 'resource' );
			if ( $resource['value'] === null ) {
				// from non-parsoid HTML: try to reconstruct resource from href?
				// (See similar code which tries to guess resource from <img src>)
				$mediaPrefix = $siteConfig->namespaceName( $siteConfig->namespaceId( 'media' ) );
				$slashPos = strrpos( $rtData->origHref, '/' );
				$fileName = $slashPos === false ? $rtData->origHref :
					substr( $rtData->origHref, $slashPos + 1 );
				$resource = [
					'value' => $mediaPrefix . ':' . $fileName,
					'fromsrc' => false,
					'modified' => false
				];
			}
			$rtData->target = $resource;
			$rtData->href = preg_replace( '#^(\.\.?/)+#', '', $rtData->target['value'], 1 );
			return $rtData;
		}

		// Check if the href matches any of our interwiki URL patterns
		$interWikiMatch = $siteConfig->interWikiMatcher( $href );
		if ( $interWikiMatch &&
			// Question mark is a valid title char, so it won't fail the test below,
			// but gets percent encoded on the way out since it has special
			// semantics in a url. That will break the url we're serializing, so
			// protect it.
			// FIXME: If ever the default value for $wgExternalInterwikiFragmentMode
			// changes, we can reduce this by always stripping off the fragment
			// identifier, since in "html5" mode, that isn't encoded. At present,
			// we can only do that if we know it's a local interwiki link.
			strpos( $interWikiMatch[1], '?' ) === false &&
			// Ensure we have a valid link target, otherwise falling back to extlink
			// is preferable, since it won't serialize as a link.
			(
				$interWikiMatch[1] === '' || !self::escapeLinkTarget(
					// Append the prefix since we want to validate the target
					// with respect to it being an interwiki.
					$interWikiMatch[0] . ':' . $interWikiMatch[1],
					$state
				)->invalidLink
			) &&
			// ExtLinks should have content to convert.
			(
				$rtData->type !== 'mw:ExtLink' ||
				!empty( $rtData->content->string ) ||
				!empty( $rtData->contentNode )
			) &&
			( !empty( $dp->isIW ) || !empty( $target['modified'] ) || !empty( $rtData->contentModified ) )
		) {
			// External link that is really an interwiki link. Convert it.
			// TODO: Leaving this for backwards compatibility, remove when 1.5 is no longer bound
			if ( $rtData->type === 'mw:ExtLink' ) {
				$rtData->type = 'mw:WikiLink';
			}
			$rtData->isInterwiki = true;
			$iwMap = $siteConfig->interwikiMap();
			// could this be confused with a language link?
			$iwi = $iwMap[self::normalizeIWP( $interWikiMatch[0] )] ?? null;
			$rtData->isInterwikiLang = $iwi && isset( $iwi['language'] );
			// is this our own wiki?
			$rtData->isLocal = $iwi && isset( $iwi['localinterwiki'] );
			// strip off localinterwiki prefixes
			$localPrefix = '';
			$oldPrefix = null;
			while ( true ) {
				$tmp = substr( $target['value'], strlen( $localPrefix ) );
				// A failed match leaves $oldPrefix as an empty (falsy) array.
				if ( !preg_match( '/^(:?([^:]+)):/', $tmp, $oldPrefix ) ) {
					break;
				}
				$iwi = $iwMap[Utils::normalizeNamespaceName( $oldPrefix[2] )] ?? null;
				if ( !$iwi || !isset( $iwi['localinterwiki'] ) ) {
					break;
				}
				$localPrefix .= $oldPrefix[1] . ':';
			}

			if ( !empty( $target['fromsrc'] ) && empty( $target['modified'] ) ) {
				// Leave the target alone!
			} else {
				if ( $rtData->type === 'mw:PageProp/Language' ) {
					$targetValue = implode( ':', $interWikiMatch );
					// Strip initial colon
					if ( $targetValue[0] === ':' ) {
						$targetValue = substr( $targetValue, 1 );
					}
					$target['value'] = $targetValue;
				} elseif (
					$oldPrefix && ( // Should we preserve the old prefix?
						strcasecmp( $oldPrefix[1], $interWikiMatch[0] ) === 0 ||
						// Check if the old prefix mapped to the same URL as
						// the new one. Use the old one if that's the case.
						// Example: [[w:Foo]] vs. [[:en:Foo]]
						( $iwMap[self::normalizeIWP( $oldPrefix[1] )]['url'] ?? null )
							=== ( $iwMap[self::normalizeIWP( $interWikiMatch[0] )]['url'] ?? null )
					)
				) {
					// Reuse old prefix capitalization
					// NOTE(review): the offset below skips only $oldPrefix[1], not
					// $localPrefix -- confirm that is intended when local prefixes
					// were stripped by the loop above.
					if ( Utils::decodeWtEntities( substr( $target['value'], strlen( $oldPrefix[1] ) + 1 ) )
						!== $interWikiMatch[1]
					) {
						// Modified, update target.value.
						$target['value'] = $localPrefix . $oldPrefix[1] . ':' . $interWikiMatch[1];
					}
					// Ensure that we generate an interwiki link and not a language link!
					if ( $rtData->isInterwikiLang && $target['value'][0] !== ':' ) {
						$target['value'] = ':' . $target['value'];
					}
				} else { // Else: preserve old encoding
					if ( !empty( $rtData->isLocal ) ) {
						// - interwikiMatch will be ":en", ":de", etc.
						// - This tests whether the interwiki-like link is actually
						// a local wikilink.
						$target['value'] = $interWikiMatch[1];
						$rtData->isInterwiki = $rtData->isInterwikiLang = false;
					} else {
						$target['value'] = implode( ':', $interWikiMatch );
					}
				}
			}
		}

		return $rtData;
	}

	/**
	 * The provided URL is already percent-encoded -- but it may still
	 * not be safe for wikitext. Add additional escapes to make the URL
	 * wikitext-safe. Don't touch percent escapes already in the url,
	 * though!
	 *
	 * Works in two passes: first every wikitext-unsafe character (and a "-"
	 * that precedes "{") is replaced by Utils::entityEncodeAll(); then the
	 * brackets around a leading IPv6 host are restored to literal "[" / "]".
	 *
	 * @param string $urlStr
	 * @return string
	 */
	private static function escapeExtLinkURL( string $urlStr ): string {
		// this regexp is the negation of EXT_LINK_URL_CLASS in the PHP parser
		return preg_replace(
			// IPv6 host names are bracketed with []. Entity-decode these.
			// The inner pass has already turned the brackets into entities, so
			// the entity spellings are what must be matched here; a literal
			// "[" at this position would open a character class and leave the
			// following ")" unmatched.
			// Assumes Utils::entityEncodeAll() emits hexadecimal numeric
			// entities for "[" and "]" -- TODO confirm against Utils.
			'!^([a-z][^:/]*:)?//&#x5B;([0-9a-f:.]+)&#x5D;(:\d|/|$)!iD',
			'$1//[$2]$3',
			preg_replace_callback(
				// phpcs:ignore Generic.Files.LineLength.TooLong
				'/[\]\[<>"\x00-\x20\x7F\x{A0}\x{1680}\x{180E}\x{2000}-\x{200A}\x{202F}\x{205F}\x{3000}]|-(?=\{)/u',
				static function ( $m ) {
					return Utils::entityEncodeAll( $m[0] );
				},
				$urlStr
			),
			1
		);
	}

	/**
	 * Add a colon escape to a wikilink target string if needed.
* @param Env $env
	 * @param string $linkTarget
	 * @param stdClass $linkData
	 * @return string
	 */
	private static function addColonEscape(
		Env $env, string $linkTarget, stdClass $linkData
	): string {
		$linkTitle = $env->makeTitleFromText( $linkTarget );
		if ( ( $linkTitle->getNamespace()->isCategory() || $linkTitle->getNamespace()->isFile() ) &&
			$linkData->type === 'mw:WikiLink' &&
			$linkTarget[0] !== ':' ) {
			// Escape category and file links
			return ':' . $linkTarget;
		} else {
			return $linkTarget;
		}
	}

	/**
	 * Test if something is a URL link
	 * @param Env $env
	 * @param Element $node
	 * @param stdClass $linkData
	 * @return bool
	 */
	private static function isURLLink( Env $env, Element $node, stdClass $linkData ): bool {
		$target = $linkData->target;

		// Get plain text content, if any
		$contentStr = self::getContentString( $node );

		// First check if we can serialize as an URL link
		return ( $contentStr !== null && $contentStr !== '' ) &&
			// Can we minimize this?
			( $target['value'] === $contentStr || self::getHref( $env, $node ) === $contentStr ) &&
			// protocol-relative url links not allowed in text
			// (see autourl rule in peg tokenizer, T32269)
			!str_starts_with( $contentStr, '//' ) && Utils::isProtocolValid( $contentStr, $env );
	}

	/**
	 * Figure out if we need a piped or simple link
	 * @param Env $env
	 * @param stdClass $dp
	 * @param array $target
	 * @param stdClass $linkData
	 * @return bool
	 */
	private static function isSimpleWikiLink(
		Env $env, stdClass $dp, array $target, stdClass $linkData
	): bool {
		$canUseSimple = false;
		$contentString = $linkData->content->string ?? null;

		// FIXME (SSS):
		// 1. Revisit this logic to see if all these checks
		// are still relevant or whether this can be simplified somehow.
		// 2. There are also duplicate computations for env.normalizedTitleKey(..)
		// and Util.decodeURIComponent(..) that could be removed.
		// 3. This could potentially be refactored as if-then chains.

		// Would need to pipe for any non-string content.
		// Preserve unmodified or non-minimal piped links.
		if ( $contentString !== null &&
			( !empty( $target['modified'] ) || !empty( $linkData->contentModified ) ||
				( $dp->stx ?? null ) !== 'piped'
			) &&
			// Relative links are not simple
			!str_starts_with( $contentString, './' )
		) {
			// Strip colon escapes from the original target as that is
			// stripped when deriving the content string.
			// Strip ./ prefixes as well since they are relative link prefixes
			// added to all titles.
			$strippedTargetValue = preg_replace( '#^(:|\./)#', '', $target['value'], 1 );
			$decodedTarget = Utils::decodeWtEntities( $strippedTargetValue );
			// Deal with the protocol-relative link scenario as well
			$hrefHasProto = preg_match( '#^(\w+:)?//#', $linkData->href );

			// Normalize content string and decoded target before comparison.
			// Piped links don't come down this path => it is safe to normalize both.
			$contentString = str_replace( '_', ' ', $contentString );
			$decodedTarget = str_replace( '_', ' ', $decodedTarget );

			// See if the (normalized) content matches the
			// target, either shadowed or actual.
			$canUseSimple =
				$contentString === $decodedTarget ||
				// try wrapped in forward slashes in case they were stripped
				( '/' . $contentString . '/' ) === $decodedTarget ||
				// normalize as titles and compare
				$env->normalizedTitleKey( $contentString, true )
					=== preg_replace( self::$MW_TITLE_WHITESPACE_RE, '_', $decodedTarget ) ||
				// Relative link
				(
					(
						$env->getSiteConfig()->namespaceHasSubpages( $env->getPageConfig()->getNs() ) &&
						preg_match( '#^\.\./.*[^/]$#D', $strippedTargetValue ) &&
						$contentString === $env->resolveTitle( $strippedTargetValue )
					) ||
					(
						preg_match( '#^\.\./.*?/$#D', $strippedTargetValue ) &&
						$contentString === preg_replace( '#^(?:\.\./)+(.*?)/$#D', '$1', $strippedTargetValue, 1 )
					)
				) ||
				// if content == href this could be a simple link... eg [[Foo]].
				// but if href is an absolute url with protocol, this won't
				// work: [[http://example.com]] is not a valid simple link!
				(
					!$hrefHasProto &&
					// Always compare against decoded uri because
					// <a rel="mw:WikiLink" href="7%25 Solution">7%25 Solution</a></p>
					// should serialize as [[7% Solution|7%25 Solution]]
					(
						$contentString === Utils::decodeURIComponent( $linkData->href ) ||
						// normalize with underscores for comparison with href
						$env->normalizedTitleKey( $contentString, true )
							=== Utils::decodeURIComponent( $linkData->href )
					)
				);
		}

		return $canUseSimple;
	}

	/**
	 * Serialize as wiki link
	 * @param Element $node
	 * @param SerializerState $state
	 * @param stdClass $linkData
	 */
	private static function serializeAsWikiLink(
		Element $node, SerializerState $state, stdClass $linkData
	): void {
		$contentParts = null;
		$contentSrc = '';
		$isPiped = false;
		$requiresEscaping = true;
		$env = $state->getEnv();
		$siteConfig = $env->getSiteConfig();
		$target = $linkData->target;
		$dp = DOMDataUtils::getDataParsoid( $node );

		// Decode any link that did not come from the source (data-mw/parsoid)
		// Links that come from data-mw/data-parsoid will be true titles,
		// but links that come from hrefs will need to be url-decoded.
		// Ex: <a href="/wiki/A%3Fb">Foobar</a>
		if ( empty( $target['fromsrc'] ) ) {
			// Omit fragments from decoding
			$hash = strpos( $target['value'], '#' );
			if ( $hash !== false ) {
				$target['value'] = Utils::decodeURIComponent( substr( $target['value'], 0, $hash ) )
					. substr( $target['value'], $hash );
			} else {
				$target['value'] = Utils::decodeURIComponent( $target['value'] );
			}
		}

		// Special-case handling for category links
		if ( $linkData->type === 'mw:PageProp/Category' ) {
			// Split target and sort key in $target['value'].
			// The sort key shows up as "#something" in there.
			// However, watch out for parser functions that start with "{{#"
			// The atomic group is essential to prevent "{{#" parser function prefix
			// from getting split at the "{{" and "#" where the "{{" matches the
			// [^#]* and the "#" matches after separately.
			if ( preg_match( '/^((?>{{#|[^#])*)#(.*)/', $target['value'], $targetParts ) ) {
				$target['value'] = strtr( preg_replace( '#^(\.\.?/)*#', '', $targetParts[1], 1 ), '_', ' ' );
				// FIXME: Reverse `Sanitizer.sanitizeTitleURI(strContent).replace(/#/g, '%23');`
				$strContent = Utils::decodeURIComponent( $targetParts[2] );
				$contentParts = self::splitLinkContentString( $strContent, $dp );
				$linkData->content->string = $contentParts->contentString;
				$dp->tail = $linkData->tail = $contentParts->tail;
				$dp->prefix = $linkData->prefix = $contentParts->prefix;
			} else { // No sort key, will serialize to simple link
				// Normalize the content string
				$linkData->content->string = strtr(
					PHPUtils::stripPrefix( $target['value'], './' ), '_', ' '
				);
			}

			// Special-case handling for template-affected sort keys
			// FIXME: sort keys cannot be modified yet, but if they are,
			// we need to fully shadow the sort key.
			// if ( !target.modified ) {
			// The target and source key was not modified
			$sortKeySrc = $state->serializer->serializedAttrVal( $node, 'mw:sortKey' );
			if ( isset( $sortKeySrc['value'] ) ) {
				$linkData->contentNode = null;
				$linkData->content->string = $sortKeySrc['value'];
				// TODO: generalize this flag. It is already used by
				// getAttributeShadowInfo. Maybe use the same
				// structure as its return value?
				$linkData->content->fromsrc = true;
			}
			// }
		} else {
			if ( $linkData->type === 'mw:PageProp/Language' ) {
				// Fix up the content string
				// TODO: see if linkData can be cleaner!
				if ( !isset( $linkData->content->string ) ) {
					$linkData->content->string = Utils::decodeWtEntities( $target['value'] );
				}
			}
		}

		// The string value of the content, if it is plain text.
		$linkTarget = null;
		$escapedTgt = null;
		if ( !empty( $linkData->isRedirect ) ) {
			$linkTarget = $target['value'];
			if ( !empty( $target['modified'] ) || empty( $target['fromsrc'] ) ) {
				$linkTarget = strtr( preg_replace( '#^(\.\.?/)*#', '', $linkTarget, 1 ), '_', ' ' );
				$escapedTgt = self::escapeLinkTarget( $linkTarget, $state );
				$linkTarget = $escapedTgt->linkTarget;
				// Determine if it's a redirect to a category, in which case
				// it needs a ':' on front to distinguish from a category link.
				if ( preg_match( '/^([^:]+)[:]/', $linkTarget, $categoryMatch ) ) {
					$ns = $siteConfig->namespaceId( Utils::normalizeNamespaceName( $categoryMatch[1] ) );
					if ( $ns === $siteConfig->canonicalNamespaceId( 'category' ) ) {
						// Check that the next node isn't a category link,
						// in which case we don't want the ':'.
						$nextNode = $node->nextSibling;
						if ( !(
							$nextNode && $nextNode instanceof Element && DOMCompat::nodeName( $nextNode ) === 'link' &&
							$nextNode->getAttribute( 'rel' ) === 'mw:PageProp/Category' &&
							$nextNode->getAttribute( 'href' ) === $node->getAttribute( 'href' )
						) ) {
							$linkTarget = ':' . $linkTarget;
						}
					}
				}
			}
		} elseif ( self::isSimpleWikiLink( $env, $dp, $target, $linkData ) ) {
			// Simple case
			if ( empty( $target['modified'] ) && empty( $linkData->contentModified ) ) {
				$linkTarget = PHPUtils::stripPrefix( $target['value'], './' );
			} else {
				// If token has templated attrs or is a subpage, use target.value
				// since content string will be drastically different.
				if ( WTUtils::hasExpandedAttrsType( $node ) ||
					preg_match( '#(^|/)\.\./#', $target['value'] )
				) {
					$linkTarget = PHPUtils::stripPrefix( $target['value'], './' );
				} else {
					$escapedTgt = self::escapeLinkTarget( $linkData->content->string, $state );
					if ( !$escapedTgt->invalidLink ) {
						$linkTarget = self::addColonEscape( $env, $escapedTgt->linkTarget, $linkData );
					} else {
						$linkTarget = $escapedTgt->linkTarget;
					}
				}
				if ( !empty( $linkData->isInterwikiLang ) &&
					$linkTarget[0] !== ':' &&
					$linkData->type !== 'mw:PageProp/Language'
				) {
					// ensure interwiki links can't be confused with
					// interlanguage links.
					$linkTarget = ':' . $linkTarget;
				}
			}
		} elseif ( self::isURLLink( $state->getEnv(), $node, $linkData )
			/* && empty( $linkData->isInterwiki ) */
		) {
			// Uncomment the above check if we want [[wikipedia:Foo|http://en.wikipedia.org/wiki/Foo]]
			// for '<a href="http://en.wikipedia.org/wiki/Foo">http://en.wikipedia.org/wiki/Foo</a>'
			$linkData->linkType = 'mw:URLLink';
		} else {
			// Emit piped wikilink syntax
			$isPiped = true;

			// First get the content source
			if ( !empty( $linkData->contentNode ) ) {
				$cs = $state->serializeLinkChildrenToString(
					$linkData->contentNode,
					[ $state->serializer->wteHandlers, 'wikilinkHandler' ]
				);
				// strip off the tail and handle the pipe trick
				$contentParts = self::splitLinkContentString( $cs, $dp );
				$contentSrc = $contentParts->contentString;
				$dp->tail = $contentParts->tail;
				$linkData->tail = $contentParts->tail;
				$dp->prefix = $contentParts->prefix;
				$linkData->prefix = $contentParts->prefix;
				$requiresEscaping = false;
			} else {
				$contentSrc = $linkData->content->string ?? '';
				$requiresEscaping = empty( $linkData->content->fromsrc );
			}

			if ( $contentSrc === '' && $linkData->type !== 'mw:PageProp/Category' ) {
				// Protect empty link content from PST pipe trick
				$contentSrc = '<nowiki/>';
				$requiresEscaping = false;
			}

			$linkTarget = $target['value'];
			if ( !empty( $target['modified'] ) || empty( $target['fromsrc'] ) ) {
				// Links starting with ./ shouldn't get _ replaced with ' '
				$linkContentIsRelative = str_starts_with( $linkData->content->string ?? '', './' );
				$linkTarget = preg_replace( '#^(\.\.?/)*#', '', $linkTarget, 1 );
				if ( empty( $linkData->isInterwiki ) && !$linkContentIsRelative ) {
					$linkTarget = strtr( $linkTarget, '_', ' ' );
				}
				$escapedTgt = self::escapeLinkTarget( $linkTarget, $state );
				$linkTarget = $escapedTgt->linkTarget;
			}

			// If we are reusing the target from source, we don't
			// need to worry about colon-escaping because it will
			// be in the right form already.
			//
			// Trying to eliminate this check and always check for
			// colon-escaping seems a bit tricky when the reused
			// target has encoded entities that won't resolve to
			// valid titles.
			if ( ( !$escapedTgt || !$escapedTgt->invalidLink ) && empty( $target['fromsrc'] ) ) {
				$linkTarget = self::addColonEscape( $env, $linkTarget, $linkData );
			}
		}
		if ( $linkData->linkType === 'mw:URLLink' ) {
			$state->emitChunk( new AutoURLLinkText( $node->textContent, $node ), $node );
			return;
		}

		if ( !empty( $linkData->isRedirect ) ) {
			// Drop duplicates
			if ( $state->redirectText !== null ) {
				return;
			}

			// Buffer redirect text if it is not in start of file position
			if ( !preg_match( self::$REDIRECT_TEST_RE, $state->out . $state->currLine->text ) ) {
				$state->redirectText = $linkData->prefix . '[[' . $linkTarget . ']]';
				$state->emitChunk( '', $node ); // Flush separators for this node
				return;
			}

			// Set to some non-null string
			$state->redirectText = 'unbuffered';
		}

		$pipedText = null;
		if ( $escapedTgt && $escapedTgt->invalidLink ) {
			// If the link target was invalid, instead of emitting an invalid link,
			// omit the link and serialize just the content instead. But, log the
			// invalid html for Parsoid clients to investigate later.
			$state->getEnv()->log(
				'error/html2wt/link', 'Bad title text', DOMCompat::getOuterHTML( $node )
			);

			// For non-piped content, use the original invalid link text
			$pipedText = $isPiped ? $contentSrc : $linkTarget;
			$state->escapeText = $requiresEscaping;
			$state->emitChunk( $linkData->prefix . $pipedText . $linkData->tail, $node );
		} else {
			if ( $isPiped && $requiresEscaping ) {
				// We are definitely not in sol context since content
				// will be preceded by "[[" or "[" text in target wikitext.
				$pipedText = '|' . $state->serializer->wteHandlers
					->escapeLinkContent( $state, $contentSrc, false, $node, false );
			} elseif ( $isPiped ) {
				$pipedText = '|' . $contentSrc;
			} else {
				$pipedText = '';
			}
			if ( $isPiped ) {
				$state->singleLineContext->disable();
			}
			$state->emitChunk( new WikiLinkText(
				$linkData->prefix . '[[' . $linkTarget . $pipedText . ']]' . $linkData->tail,
				$node, $siteConfig, $linkData->type
			), $node );
			if ( $isPiped ) {
				$state->singleLineContext->pop();
			}
		}
	}

	/**
	 * Serialize as external link
	 *
	 * Emits, in order of preference: a bare URL link, a magic link
	 * (ISBN/RFC/PMID), a [[#fragment|text]] link for pure-hash targets, or
	 * a bracketed [url text] external link.
	 *
	 * @param Element $node
	 * @param SerializerState $state
	 * @param stdClass $linkData
	 */
	private static function serializeAsExtLink(
		Element $node, SerializerState $state, stdClass $linkData
	): void {
		$target = $linkData->target;
		$urlStr = $target['value'];
		if ( !empty( $target['modified'] ) || empty( $target['fromsrc'] ) ) {
			// We expect modified hrefs to be percent-encoded already, so
			// don't need to encode them here any more. Unmodified hrefs are
			// just using the original encoding anyway.
			// BUT we do have to encode certain special wikitext
			// characters (like []) which aren't necessarily
			// percent-encoded because they are valid in URLs and HTML5
			$urlStr = self::escapeExtLinkURL( $urlStr );
		}

		if ( self::isURLLink( $state->getEnv(), $node, $linkData ) ) {
			// Serialize as URL link
			$state->emitChunk( new AutoURLLinkText( $urlStr, $node ), $node );
			return;
		}

		$siteConfig = $state->getEnv()->getSiteConfig();

		// TODO: match vs. interwikis too
		$magicLinkMatch = $siteConfig->getExtResourceURLPatternMatcher()(
			Utils::decodeURI( $linkData->origHref )
		);
		$pureHashMatch = substr( $urlStr, 0, 1 ) === '#';
		// Fully serialize the content
		$contentStr = $state->serializeLinkChildrenToString(
			$node,
			[ $state->serializer->wteHandlers, $pureHashMatch ? 'wikilinkHandler' : 'aHandler' ]
		);
		// First check for ISBN/RFC/PMID links. We rely on selser to
		// preserve non-minimal forms.
		if ( $magicLinkMatch ) {
			$serialized = $siteConfig->makeExtResourceURL(
				$magicLinkMatch, $target['value'], $contentStr
			);
			if ( $serialized[0] === '[' ) {
				// Serialization as a magic link failed (perhaps the
				// content string wasn't appropriate).
				$state->emitChunk(
					( $magicLinkMatch[0] === 'ISBN' ) ?
						new WikiLinkText( $serialized, $node, $siteConfig, 'mw:WikiLink' ) :
						new ExtLinkText( $serialized, $node, $siteConfig, 'mw:ExtLink' ),
					$node
				);
			} else {
				$state->emitChunk( new MagicLinkText( $serialized, $node ), $node );
			}
			return;
		} else { // There is an interwiki for RFCs, but strangely none for PMIDs.
			// serialize as auto-numbered external link
			// [http://example.com]
			$linktext = null;
			$class = null;
			// Compare against '' explicitly: a plain truthiness test would
			// treat the legitimate link text "0" as missing and drop it.
			$hasContent = $contentStr !== '';
			// If it's just anchor text, serialize as an internal link.
			if ( $pureHashMatch ) {
				$class = WikiLinkText::class;
				$linktext = '[[' . $urlStr . ( $hasContent ? '|' . $contentStr : '' ) . ']]';
			} else {
				$class = ExtLinkText::class;
				$linktext = '[' . $urlStr . ( $hasContent ? ' ' . $contentStr : '' ) . ']';
			}
			$state->emitChunk( new $class( $linktext, $node, $siteConfig, $linkData->type ), $node );
			return;
		}
	}

	/**
	 * Main link handler.
	 * @param SerializerState $state
	 * @param Element $node
	 */
	public static function linkHandler( SerializerState $state, Element $node ): void {
		// TODO: handle internal/external links etc using RDFa and dataAttribs
		// Also convert unannotated html links without advanced attributes to
		// external wiki links for html import. Might want to consider converting
		// relative links without path component and file extension to wiki links.
		$env = $state->getEnv();
		$siteConfig = $env->getSiteConfig();

		// Get the rt data from the token and tplAttrs
		$linkData = self::getLinkRoundTripData( $env, $node, $state );
		$linkType = $linkData->type;
		if ( $siteConfig->getExtResourceURLPatternMatcher()( Utils::decodeURI( $linkData->origHref ) ) ) {
			// Override the 'rel' type if this is a magic link
			$linkType = 'mw:ExtLink';
		}
		if ( $linkType !== null && isset( $linkData->target['value'] ) ) {
			// We have a type and target info
			if ( $linkType === 'mw:WikiLink' || $linkType === 'mw:MediaLink' ||
				preg_match( TokenUtils::SOL_TRANSPARENT_LINK_REGEX, $linkType )
			) {
				// [[..]] links: normal, category, redirect, or lang links
				// (except images)
				self::serializeAsWikiLink( $node, $state, $linkData );
				return;
			} elseif ( $linkType === 'mw:ExtLink' ) {
				// [..] links, autolinks, ISBN, RFC, PMID
				self::serializeAsExtLink( $node, $state, $linkData );
				return;
			} else {
				throw new UnexpectedValueException(
					'Unhandled link serialization scenario: ' .
DOMCompat::getOuterHTML( $node ) 906 ); 907 } 908 } else { 909 $safeAttr = array_flip( [ 910 'href', 'rel', 'class', 'title', DOMDataUtils::DATA_OBJECT_ATTR_NAME 911 ] ); 912 913 $isComplexLink = false; 914 foreach ( DOMCompat::attributes( $node ) as $attr ) { 915 // XXX: Don't drop rel and class in every case once a tags are 916 // actually supported in the MW default config? 917 if ( $attr->name && !isset( $safeAttr[$attr->name] ) ) { 918 $isComplexLink = true; 919 break; 920 } 921 } 922 923 if ( $isComplexLink ) { 924 $env->log( 'error/html2wt/link', 'Encountered', DOMCompat::getOuterHTML( $node ), 925 '-- serializing as extlink and dropping <a> attributes unsupported in wikitext.' 926 ); 927 } else { 928 $media = DOMUtils::selectMediaElt( $node ); // TODO: Handle missing media too 929 $isFigure = ( $media instanceof Element && $media->parentNode === $node ); 930 if ( $isFigure ) { 931 // this is a basic html figure: <a><img></a> 932 self::figureHandler( $state, $node, new MediaStructure( $media, $node ) ); 933 return; 934 } 935 } 936 937 // href is already percent-encoded, etc., but it might contain 938 // spaces or other wikitext nasties. escape the nasties. 939 $hrefStr = self::escapeExtLinkURL( self::getHref( $env, $node ) ); 940 $handler = [ $state->serializer->wteHandlers, 'aHandler' ]; 941 $str = $state->serializeLinkChildrenToString( $node, $handler ); 942 $chunk = null; 943 if ( !$hrefStr ) { 944 // Without an href, we just emit the string as text. 945 // However, to preserve targets for anchor links, 946 // serialize as a span with a name. 947 if ( $node->hasAttribute( 'name' ) ) { 948 $name = $node->getAttribute( 'name' ); 949 $doc = $node->ownerDocument; 950 $span = $doc->createElement( 'span' ); 951 $span->setAttribute( 'name', $name ); 952 $span->appendChild( $doc->createTextNode( $str ) ); 953 $chunk = DOMCompat::getOuterHTML( $span ); 954 } else { 955 $chunk = $str; 956 } 957 } else { 958 $chunk = new ExtLinkText( '[' . $hrefStr . ' ' . $str . 
']', 959 $node, $siteConfig, 'mw:ExtLink' 960 ); 961 } 962 $state->emitChunk( $chunk, $node ); 963 } 964 } 965 966 /** 967 * Main figure handler. 968 * 969 * @param SerializerState $state 970 * @param Element $node 971 * @param ?MediaStructure $ms 972 */ 973 public static function figureHandler( 974 SerializerState $state, Element $node, ?MediaStructure $ms 975 ): void { 976 $env = $state->getEnv(); 977 978 if ( !$ms ) { 979 $env->log( 980 'error/html2wt/figure', 981 "Couldn't parse media structure: ", 982 DOMCompat::getOuterHTML( $node ) 983 ); 984 $state->emitChunk( '', $node ); 985 return; 986 } 987 988 $outerElt = $ms->containerElt ?? $ms->mediaElt; 989 $linkElt = $ms->linkElt; 990 $elt = $ms->mediaElt; 991 $captionElt = $ms->captionElt; 992 993 $format = WTSUtils::getMediaFormat( $outerElt ); 994 995 // Try to identify the local title to use for this image. 996 $resource = $state->serializer->serializedImageAttrVal( $outerElt, $elt, 'resource' ); 997 if ( !isset( $resource['value'] ) ) { 998 // from non-parsoid HTML: try to reconstruct resource from src? 999 // (this won't work for manual-thumb images) 1000 if ( !$elt->hasAttribute( 'src' ) ) { 1001 $env->log( 'error/html2wt/figure', 1002 'In WSP.figureHandler, img does not have resource or src:', 1003 DOMCompat::getOuterHTML( $node ) 1004 ); 1005 $state->emitChunk( '', $node ); 1006 return; 1007 } 1008 $src = $elt->getAttribute( 'src' ) ?? 
''; 1009 if ( preg_match( '/^https?:/', $src ) ) { 1010 // external image link, presumably $wgAllowExternalImages=true 1011 $state->emitChunk( new AutoURLLinkText( $src, $node ), $node ); 1012 return; 1013 } 1014 $resource = [ 1015 'value' => $src, 1016 'fromsrc' => false, 1017 'modified' => false 1018 ]; 1019 } 1020 if ( empty( $resource['fromsrc'] ) ) { 1021 $resource['value'] = preg_replace( '#^(\.\.?/)+#', '', $resource['value'], 1 ); 1022 } 1023 1024 $nopts = []; 1025 $outerDP = DOMDataUtils::getDataParsoid( $outerElt ); 1026 $outerDMW = DOMDataUtils::getDataMw( $outerElt ); 1027 $mwAliases = $state->getEnv()->getSiteConfig()->mwAliases(); 1028 1029 // Return ref to the array element in case it is modified 1030 $getOpt = static function & ( $key ) use ( &$outerDP ): ?array { 1031 $null = null; 1032 if ( empty( $outerDP->optList ) ) { 1033 return $null; 1034 } 1035 foreach ( $outerDP->optList as $opt ) { 1036 if ( ( $opt['ck'] ?? null ) === $key ) { 1037 return $opt; 1038 } 1039 } 1040 return $null; 1041 }; 1042 // Return ref to the array element in case it is modified 1043 $getLastOpt = static function & ( $key ) use ( &$outerDP ) : ?array { 1044 $null = null; 1045 $opts = $outerDP->optList ?? []; 1046 for ( $i = count( $opts ) - 1; $i >= 0; $i-- ) { 1047 if ( ( $opts[$i]['ck'] ?? null ) === $key ) { 1048 return $opts[$i]; 1049 } 1050 } 1051 return $null; 1052 }; 1053 1054 // Identify a page # to use. 1055 $page = null; 1056 $pageFromHref = preg_match( 1057 '#[?]page=(\d+)$#D', 1058 ( $linkElt ? $linkElt->getAttribute( 'href' ) : null ) ?? '', 1059 $matches ) ? $matches[1] : null; 1060 $pageFromDataMw = WTSUtils::getAttrFromDataMw( $outerDMW, 'page', true ); 1061 if ( $pageFromDataMw !== null ) { 1062 // FIXME: if $pageFromHref is null but $pageFromDataMw is 1063 // set, then we go ahead and serialize the page parameter 1064 // as unmodified. 
This helps transition old RESTBase 1065 // content where the ?page suffix on the URL was missing, 1066 // but eventually $restBaseMigrationHack should be left 1067 // false always. (T259931) 1068 $restBaseMigrationHack = 1069 ( $pageFromHref === null && $pageFromDataMw[1]->txt ); 1070 1071 if ( 1072 trim( $pageFromDataMw[1]->txt ) === $pageFromHref || 1073 $restBaseMigrationHack 1074 ) { 1075 $page = $state->serializer->getAttributeValueAsShadowInfo( $outerElt, 'page' ); 1076 if ( !$page ) { 1077 $page = [ 1078 'value' => $pageFromDataMw[1]->txt, 1079 'modified' => false, 1080 'fromsrc' => false, 1081 'fromDataMW' => true, 1082 ]; 1083 } 1084 } 1085 } 1086 if ( !$page && $pageFromHref !== null ) { 1087 $page = [ 1088 'value' => $pageFromHref, 1089 'modified' => true, 1090 'fromsrc' => false, 1091 'fromDataMW' => false, 1092 ]; 1093 } 1094 1095 // Try to identify the local title to use for the link. 1096 $link = null; 1097 1098 $linkFromDataMw = WTSUtils::getAttrFromDataMw( $outerDMW, 'link', true ); 1099 if ( $linkFromDataMw !== null ) { 1100 // "link" attribute on the `outerElt` takes precedence 1101 if ( isset( $linkFromDataMw[1]->html ) ) { 1102 $link = $state->serializer->getAttributeValueAsShadowInfo( $outerElt, 'link' ); 1103 } else { 1104 $link = [ 1105 'value' => "link={$linkFromDataMw[1]->txt}", 1106 'modified' => false, 1107 'fromsrc' => false, 1108 'fromDataMW' => true 1109 ]; 1110 } 1111 } elseif ( $linkElt && $linkElt->hasAttribute( 'href' ) ) { 1112 $link = $state->serializer->serializedImageAttrVal( $outerElt, $linkElt, 'href' ); 1113 if ( empty( $link['fromsrc'] ) ) { 1114 // strip page parameter if present on href 1115 $strippedHref = preg_replace( '#[?]page=\d+$#D', '', $linkElt->getAttribute( 'href' ) ?? 
'' ); 1116 if ( $strippedHref === $elt->getAttribute( 'resource' ) ) { 1117 // default link: same place as resource 1118 $link = $resource; 1119 } 1120 $link['value'] = preg_replace( '#^(\.\.?/)+#', '', $link['value'], 1 ); 1121 } 1122 } else { 1123 // Otherwise, just try and get it from data-mw 1124 $link = $state->serializer->getAttributeValueAsShadowInfo( $outerElt, 'href' ); 1125 } 1126 1127 if ( $link && empty( $link['modified'] ) && empty( $link['fromsrc'] ) ) { 1128 $linkOpt = $getOpt( 'link' ); 1129 if ( $linkOpt ) { 1130 $link['fromsrc'] = true; 1131 $link['value'] = $linkOpt['ak']; 1132 } 1133 } 1134 1135 // Reconstruct the caption 1136 if ( !$captionElt && is_string( $outerDMW->caption ?? null ) ) { 1137 // IMPORTANT: Assign to a variable to prevent the fragment 1138 // from getting GCed before we are done with it. 1139 $fragment = ContentUtils::createAndLoadDocumentFragment( 1140 $outerElt->ownerDocument, $outerDMW->caption, 1141 [ 'markNew' => true ] 1142 ); 1143 // FIXME: We should just be able to serialize the children of the 1144 // fragment, however, we need some way of marking this as being 1145 // inModifiedContent so that any bare text is assured to be escaped 1146 $captionElt = $outerElt->ownerDocument->createElement( 'div' ); 1147 DOMDataUtils::getDataParsoid( $captionElt )->tmp->isNew = true; 1148 DOMUtils::migrateChildren( $fragment, $captionElt ); 1149 // Needs a parent node in order for WTS to be happy 1150 $fragment->appendChild( $captionElt ); 1151 } 1152 1153 $caption = null; 1154 if ( $captionElt ) { 1155 $caption = $state->serializeCaptionChildrenToString( 1156 $captionElt, [ $state->serializer->wteHandlers, 'mediaOptionHandler' ] 1157 ); 1158 } 1159 1160 // Fetch the alt (if any) 1161 $alt = $state->serializer->serializedImageAttrVal( $outerElt, $elt, 'alt' ); 1162 // Fetch the lang (if any) 1163 $lang = $state->serializer->serializedImageAttrVal( $outerElt, $elt, 'lang' ); 1164 1165 // Ok, start assembling options, beginning with 
link & alt & lang 1166 // Other media don't have links in output. 1167 $linkCond = DOMCompat::nodeName( $elt ) === 'img'; 1168 if ( $linkCond && $link ) { 1169 // Check whether the link goes to the default place, in which 1170 // case an explicit link tag isn't needed. 1171 // The link may be external, or may include wikitext template markup, 1172 // therefore check first that it parses to a title. 1173 $linkTitle = $env->normalizedTitleKey( 1174 Utils::decodeURIComponent( $link['value'] ), true 1175 ); 1176 $resourceTitle = $env->normalizedTitleKey( 1177 Utils::decodeURIComponent( $resource['value'] ), true 1178 ); 1179 if ( 1180 $link['value'] === $resource['value'] || 1181 ( $linkTitle !== null && $linkTitle === $resourceTitle ) 1182 ) { 1183 $linkCond = false; // No explicit link attribute needed 1184 } 1185 } 1186 1187 // "alt" for non-image is handle below 1188 $altCond = $alt['value'] !== null && DOMCompat::nodeName( $elt ) === 'img'; 1189 1190 // This loop handles media options which *mostly* correspond 1-1 with 1191 // HTML attributes. `img_$name` is the name of the media option, 1192 // and $value is the Parsoid "shadow info" for the attribute. 1193 // $cond tells us whether we need to explicitly output this option; 1194 // if it is false we are using an implicit default. 1195 // `lang` and `alt` are fairly straightforward. `link` and `page` 1196 // are a little trickier, since we need to massage/fake the shadow 1197 // info because they don't come *directly* from the attribute. 1198 // link comes from the combination of a[href], img[src], and 1199 // img[resource], etc; page comes from the query part of a[href] etc. 
1200 foreach ( [ 1201 [ 'name' => 'link', 'value' => $link, 'cond' => $linkCond ], 1202 [ 'name' => 'alt', 'value' => $alt, 'cond' => $altCond ], 1203 [ 'name' => 'page', 'value' => $page, 'cond' => isset( $page['value'] ) ], 1204 [ 'name' => 'lang', 'value' => $lang, 'cond' => isset( $lang['value'] ) ] 1205 ] as $o ) { 1206 if ( !$o['cond'] ) { 1207 continue; 1208 } 1209 if ( $o['value'] && !empty( $o['value']['fromsrc'] ) ) { 1210 $nopts[] = [ 1211 'ck' => $o['name'], 1212 'ak' => [ $o['value']['value'] ], 1213 ]; 1214 } else { 1215 $value = $o['value'] ? $o['value']['value'] : ''; 1216 if ( $o['value'] && in_array( $o['name'], [ 'link', 'alt' ], true ) ) { 1217 // see WikiLinkHandler::isWikitextOpt(): link and alt are allowed 1218 // to contain arbitrary wikitext, even though it is stripped 1219 // to a string before emitting. 1220 $value = $state->serializer->wteHandlers->escapeLinkContent( 1221 $state, $value, false, $node, true 1222 ); 1223 } 1224 $nopts[] = [ 1225 'ck' => $o['name'], 1226 'v' => $value, 1227 'ak' => $mwAliases['img_' . $o['name']], 1228 ]; 1229 } 1230 } 1231 1232 // Now we handle media options which all come from space-separated 1233 // values in a single HTML attribute, `class`. (But note that there 1234 // can also be "extra" classes added by `img_class` as well.) 1235 $classes = DOMCompat::getClassList( $outerElt ); 1236 $extra = []; // 'extra' classes 1237 $val = null; 1238 1239 foreach ( $classes as $c ) { 1240 switch ( $c ) { 1241 case 'mw-halign-none': 1242 case 'mw-halign-right': 1243 case 'mw-halign-left': 1244 case 'mw-halign-center': 1245 $val = substr( $c, 10 ); // strip mw-halign- prefix 1246 $nopts[] = [ 1247 'ck' => $val, 1248 'ak' => $mwAliases['img_' . 
$val], 1249 ]; 1250 break; 1251 1252 case 'mw-valign-top': 1253 case 'mw-valign-middle': 1254 case 'mw-valign-baseline': 1255 case 'mw-valign-sub': 1256 case 'mw-valign-super': 1257 case 'mw-valign-text-top': 1258 case 'mw-valign-bottom': 1259 case 'mw-valign-text-bottom': 1260 $val = strtr( substr( $c, 10 ), '-', '_' ); // strip mw-valign and '-' to '_' 1261 $nopts[] = [ 1262 'ck' => $val, 1263 'ak' => $mwAliases['img_' . $val], 1264 ]; 1265 break; 1266 1267 case 'mw-image-border': 1268 $nopts[] = [ 1269 'ck' => 'border', 1270 'ak' => $mwAliases['img_border'], 1271 ]; 1272 break; 1273 1274 case 'mw-default-size': 1275 case 'mw-default-audio-height': 1276 // handled below 1277 break; 1278 1279 default: 1280 $extra[] = $c; 1281 break; 1282 } 1283 } 1284 1285 if ( count( $extra ) ) { 1286 $nopts[] = [ 1287 'ck' => 'class', 1288 'v' => implode( ' ', $extra ), 1289 'ak' => $mwAliases['img_class'], 1290 ]; 1291 } 1292 1293 // Now we handle parameters which don't have a representation 1294 // as HTML attributes; they are set only from the data-mw 1295 // values. (In theory they could perhaps be reverse engineered 1296 // from the thumbnail URL, but that would be fragile and expose 1297 // thumbnail implementation to the editor so we don't do that.) 1298 $mwParams = [ 1299 [ 'prop' => 'thumb', 'ck' => 'manualthumb', 'alias' => 'img_manualthumb' ], 1300 // mw:Video specific 1301 [ 'prop' => 'starttime', 'ck' => 'starttime', 'alias' => 'timedmedia_starttime' ], 1302 [ 'prop' => 'endtime', 'ck' => 'endtime', 'alias' => 'timedmedia_endtime' ], 1303 [ 'prop' => 'thumbtime', 'ck' => 'thumbtime', 'alias' => 'timedmedia_thumbtime' ] 1304 ]; 1305 1306 // `img_link` and `img_alt` are only surfaced as HTML attributes 1307 // for image media. For all other media we treat them as set only 1308 // from data-mw. 
1309 if ( DOMCompat::nodeName( $elt ) !== 'img' ) { 1310 $mwParams[] = [ 'prop' => 'link', 'ck' => 'link', 'alias' => 'img_link' ]; 1311 $mwParams[] = [ 'prop' => 'alt', 'ck' => 'alt', 'alias' => 'img_alt' ]; 1312 } 1313 1314 foreach ( $mwParams as $o ) { 1315 $v = $outerDMW->{$o['prop']} ?? null; 1316 if ( $v === null ) { 1317 $a = WTSUtils::getAttrFromDataMw( $outerDMW, $o['ck'], true ); 1318 if ( $a !== null && !isset( $a[1]->html ) ) { 1319 $v = $a[1]->txt; 1320 } 1321 } 1322 if ( $v !== null ) { 1323 $ak = $state->serializer->getAttributeValue( 1324 $outerElt, $o['ck'] 1325 ) ?? $mwAliases[$o['alias']]; 1326 $nopts[] = [ 1327 'ck' => $o['ck'], 1328 'ak' => $ak, 1329 'v' => $v 1330 ]; 1331 // Piggyback this here ... 1332 if ( $o['prop'] === 'thumb' ) { 1333 $format = ''; 1334 } 1335 } 1336 } 1337 1338 // These media options come from the HTML `typeof` attribute. 1339 switch ( $format ) { 1340 case 'Thumb': 1341 $nopts[] = [ 1342 'ck' => 'thumbnail', 1343 'ak' => $state->serializer->getAttributeValue( 1344 $outerElt, 'thumbnail' 1345 ) ?? $mwAliases['img_thumbnail'], 1346 ]; 1347 break; 1348 case 'Frame': 1349 $nopts[] = [ 1350 'ck' => 'framed', 1351 'ak' => $state->serializer->getAttributeValue( 1352 $outerElt, 'framed' 1353 ) ?? $mwAliases['img_framed'], 1354 ]; 1355 break; 1356 case 'Frameless': 1357 $nopts[] = [ 1358 'ck' => 'frameless', 1359 'ak' => $state->serializer->getAttributeValue( 1360 $outerElt, 'frameless' 1361 ) ?? $mwAliases['img_frameless'], 1362 ]; 1363 break; 1364 } 1365 1366 // Now handle the size-related options. This is complicated! 1367 // We consider the `height`, `data-height`, `width`, and 1368 // `data-width` attributes, as well as the `typeof` and the `class`. 1369 1370 // Get the user-specified height from wikitext 1371 $wh = $state->serializer->serializedImageAttrVal( 1372 $outerElt, $elt, $ms->isRedLink() ? 
'data-height' : 'height' 1373 ); 1374 // Get the user-specified width from wikitext 1375 $ww = $state->serializer->serializedImageAttrVal( 1376 $outerElt, $elt, $ms->isRedLink() ? 'data-width' : 'width' 1377 ); 1378 1379 $sizeUnmodified = !empty( $ww['fromDataMW'] ) || 1380 ( empty( $ww['modified'] ) && empty( $wh['modified'] ) ); 1381 $upright = $getOpt( 'upright' ); 1382 1383 // XXX: Infer upright factor from default size for all thumbs by default? 1384 // Better for scaling with user prefs, but requires knowledge about 1385 // default used in VE. 1386 if ( $sizeUnmodified && $upright && 1387 // Only serialize upright where it is actually respected 1388 // This causes some dirty diffs, but makes sure that we don't 1389 // produce nonsensical output after a type switch. 1390 // TODO: Only strip if type was actually modified. 1391 in_array( $format, [ 'Frameless', 'Thumb' ], true ) 1392 ) { 1393 // preserve upright option 1394 $nopts[] = [ 1395 'ck' => $upright['ck'], 1396 'ak' => [ $upright['ak'] ], 1397 ]; 1398 }// FIXME: don't use ak here! 1399 1400 if ( !( DOMCompat::getClassList( $outerElt )->contains( 'mw-default-size' ) ) ) { 1401 $size = $getLastOpt( 'width' ); 1402 $sizeString = (string)( $size['ak'] ?? '' ); 1403 if ( $sizeString === '' && !empty( $ww['fromDataMW'] ) ) { 1404 $sizeString = (string)( $ww['value'] ?? 
'' ); 1405 } 1406 if ( $sizeUnmodified && $sizeString !== '' ) { 1407 // preserve original width/height string if not touched 1408 $nopts[] = [ 1409 'ck' => 'width', 1410 'v' => $sizeString, // original size string 1411 'ak' => [ '$1' ] 1412 ]; 1413 } else { // don't add px or the like 1414 $bbox = null; 1415 // Serialize to a square bounding box 1416 if ( isset( $ww['value'] ) && preg_match( '/^\d+/', $ww['value'] ) ) { 1417 $bbox = intval( $ww['value'] ); 1418 } 1419 if ( isset( $wh['value'] ) && preg_match( '/^\d+/', $wh['value'] ) && 1420 // As with "mw-default-size", editing clients should remove the 1421 // "mw-default-audio-height" if they want to factor a defined 1422 // height into the bounding box size. However, note that, at 1423 // present, a defined height for audio is ignored while parsing, 1424 // so this only has the effect of modifying the width. 1425 ( 1426 DOMCompat::nodeName( $elt ) !== 'audio' || 1427 !DOMCompat::getClassList( $outerElt )->contains( 'mw-default-audio-height' ) 1428 ) 1429 ) { 1430 $height = intval( $wh['value'] ); 1431 if ( $bbox === null || $height > $bbox ) { 1432 $bbox = $height; 1433 } 1434 } 1435 if ( $bbox !== null ) { 1436 $nopts[] = [ 1437 'ck' => 'width', 1438 // MediaWiki interprets 100px as a width 1439 // restriction only, so we need to make the bounding 1440 // box explicitly square (100x100px). The 'px' is 1441 // added by the alias though, and can be localized. 1442 'v' => $bbox . 'x' . $bbox, 1443 'ak' => $mwAliases['img_width'], 1444 ]; 1445 } 1446 } 1447 }// adds the 'px' suffix 1448 1449 $opts = $outerDP->optList ?? []; // original wikitext options 1450 1451 // Add bogus options from old optlist in order to round-trip cleanly (T64500) 1452 foreach ( $opts as $o ) { 1453 if ( ( $o['ck'] ?? null ) === 'bogus' ) { 1454 $nopts[] = [ 1455 'ck' => 'bogus', 1456 'ak' => [ $o['ak'] ], 1457 ]; 1458 } 1459 } 1460 1461 // Put the caption last, by default. 
1462 if ( is_string( $caption ) ) { 1463 $nopts[] = [ 1464 'ck' => 'caption', 1465 'ak' => [ $caption ], 1466 ]; 1467 } 1468 1469 // ok, sort the new options to match the order given in the old optlist 1470 // and try to match up the aliases used 1471 $changed = false; 1472 foreach ( $nopts as &$no ) { 1473 // Make sure we have an array here. Default in data-parsoid is 1474 // actually a string. 1475 // FIXME: don't reuse ak for two different things! 1476 if ( !is_array( $no['ak'] ) ) { 1477 $no['ak'] = [ $no['ak'] ]; 1478 } 1479 1480 $no['sortId'] = count( $opts ); 1481 $idx = -1; 1482 foreach ( $opts as $i => $o ) { 1483 if ( ( $o['ck'] ?? null ) === $no['ck'] && 1484 // for bogus options, make sure the source matches too. 1485 ( $o['ck'] !== 'bogus' || $o['ak'] === $no['ak'][0] ) 1486 ) { 1487 $idx = $i; 1488 break; 1489 } 1490 } 1491 if ( $idx < 0 ) { 1492 // Preferred words are first in the alias list 1493 // (but not in old versions of mediawiki). 1494 $no['ak'] = $no['ak'][0]; 1495 $changed = true; 1496 continue; 1497 } 1498 1499 $no['sortId'] = $idx; 1500 // use a matching alias, if there is one 1501 $a = null; 1502 foreach ( $no['ak'] as $b ) { 1503 // note the trim() here; that allows us to snarf eccentric 1504 // whitespace from the original option wikitext 1505 $b2 = $b; 1506 if ( isset( $no['v'] ) ) { 1507 $b2 = str_replace( '$1', $no['v'], $b ); 1508 } 1509 if ( $b2 === trim( implode( ',', (array)$opts[$idx]['ak'] ) ) ) { 1510 $a = $b; 1511 break; 1512 } 1513 } 1514 // use the alias (incl whitespace) from the original option wikitext 1515 // if found; otherwise use the last alias given (English default by 1516 // convention that works everywhere). 
1517 // TODO: use first alias (localized) instead for RTL languages (T53852) 1518 if ( $a !== null && $no['ck'] !== 'caption' ) { 1519 $no['ak'] = $opts[$idx]['ak']; 1520 unset( $no['v'] ); // prevent double substitution 1521 } else { 1522 $no['ak'] = PHPUtils::lastItem( $no['ak'] ); 1523 if ( !( $no['ck'] === 'caption' && $a !== null ) ) { 1524 $changed = true; 1525 } 1526 } 1527 } 1528 1529 // Filter out bogus options if the image options/caption have changed. 1530 if ( $changed ) { 1531 $nopts = array_filter( $nopts, static function ( $no ) { 1532 return $no['ck'] !== 'bogus'; 1533 } ); 1534 // empty captions should get filtered out in this case, too (T64264) 1535 $nopts = array_filter( $nopts, static function ( $no ) { 1536 return !( $no['ck'] === 'caption' && $no['ak'] === '' ); 1537 } ); 1538 } 1539 1540 // sort! 1541 usort( $nopts, static function ( $a, $b ) { 1542 return $a['sortId'] <=> $b['sortId']; 1543 } ); 1544 1545 // emit all the options as wikitext! 1546 $wikitext = '[[' . $resource['value']; 1547 foreach ( $nopts as $o ) { 1548 $wikitext .= '|'; 1549 if ( isset( $o['v'] ) ) { 1550 $wikitext .= str_replace( '$1', $o['v'], $o['ak'] ); 1551 } else { 1552 $wikitext .= $o['ak']; 1553 } 1554 } 1555 $wikitext .= ']]'; 1556 1557 $state->emitChunk( new WikiLinkText( 1558 $wikitext, $node, $state->getEnv()->getSiteConfig(), 1559 // FIXME: Does this matter? Emit a constant for now, it'll all 1560 // be same in the follow up patch to consolidate the types 1561 'mw:Image' 1562 ), $node ); 1563 } 1564 1565} 1566