1<?php 2 3namespace Wikimedia\Parsoid\Html2Wt; 4 5use Closure; 6use DOMElement; 7use DOMNode; 8use Exception; 9use stdClass; 10use Wikimedia\Assert\Assert; 11use Wikimedia\Parsoid\Config\Env; 12use Wikimedia\Parsoid\Config\WikitextConstants; 13use Wikimedia\Parsoid\Html2Wt\ConstrainedText\ConstrainedText; 14use Wikimedia\Parsoid\Html2Wt\DOMHandlers\DOMHandler; 15use Wikimedia\Parsoid\Html2Wt\DOMHandlers\DOMHandlerFactory; 16use Wikimedia\Parsoid\Tokens\KV; 17use Wikimedia\Parsoid\Tokens\TagTk; 18use Wikimedia\Parsoid\Tokens\Token; 19use Wikimedia\Parsoid\Utils\ContentUtils; 20use Wikimedia\Parsoid\Utils\DOMCompat; 21use Wikimedia\Parsoid\Utils\DOMDataUtils; 22use Wikimedia\Parsoid\Utils\DOMUtils; 23use Wikimedia\Parsoid\Utils\PHPUtils; 24use Wikimedia\Parsoid\Utils\TokenUtils; 25use Wikimedia\Parsoid\Utils\Utils; 26use Wikimedia\Parsoid\Utils\WTUtils; 27 28/** 29 * Wikitext to HTML serializer. 30 * Serializes a chunk of tokens or an HTML DOM to MediaWiki's wikitext flavor. 31 * 32 * This serializer is designed to eventually 33 * - accept arbitrary HTML and 34 * - serialize that to wikitext in a way that round-trips back to the same 35 * HTML DOM as far as possible within the limitations of wikitext. 36 * 37 * Not much effort has been invested so far on supporting 38 * non-Parsoid/VE-generated HTML. Some of this involves adaptively switching 39 * between wikitext and HTML representations based on the values of attributes 40 * and DOM context. A few special cases are already handled adaptively 41 * (multi-paragraph list item contents are serialized as HTML tags for 42 * example, generic A elements are serialized to HTML A tags), but in general 43 * support for this is mostly missing. 44 * 45 * Example issue: 46 * ``` 47 * <h1><p>foo</p></h1> will serialize to =\nfoo\n= whereas the 48 * correct serialized output would be: =<p>foo</p>= 49 * ``` 50 * 51 * What to do about this? 
* - add a generic 'can this HTML node be serialized to wikitext in this
 *   context' detection method and use that to adaptively switch between
 *   wikitext and HTML serialization.
 *
 */
class WikitextSerializer {

	/**
	 * Attributes that are never serialized. Used as a lookup set
	 * (attribute name => true) via isset().
	 * @var array<string,bool>
	 */
	private const IGNORED_ATTRIBUTES = [
		'data-parsoid' => true,
		'data-ve-changed' => true,
		'data-parsoid-changed' => true,
		'data-parsoid-diff' => true,
		'data-parsoid-serialize' => true,
		DOMDataUtils::DATA_OBJECT_ATTR_NAME => true,
	];

	/** @var array<string,string> attribute name => value regexp */
	private const PARSOID_ATTRIBUTES = [
		'about' => '/^#mwt\d+$/D',
		'typeof' => '/(^|\s)mw:[^\s]+/',
	];

	// PORT-FIXME do different whitespace semantics matter?

	/**
	 * @var string Regexp matching a newline followed only by whitespace and/or
	 *   Utils::COMMENT_REGEXP_FRAGMENT matches up to the end of the string.
	 */
	private const TRAILING_COMMENT_OR_WS_AFTER_NL_REGEXP
		= '/\n(\s|' . Utils::COMMENT_REGEXP_FRAGMENT . ')*$/D';

	/**
	 * @var string Regexp splitting a transclusion format string into six groups:
	 *   leading newline, start, parameter name, parameter value, end, trailing newline.
	 */
	private const FORMATSTRING_REGEXP =
		'/^(\n)?(\{\{ *_+)(\n? *\|\n? *_+ *= *)(_+)(\n? *\}\})(\n)?$/D';

	/**
	 * @var string Regexp matching a string that consists solely of whitespace
	 *   and/or Utils::COMMENT_REGEXP_FRAGMENT matches.
	 */
	private const COMMENT_OR_WS_REGEXP = '/^(\s|' . Utils::COMMENT_REGEXP_FRAGMENT . ')*$/D';

	/** @var string Regexp for testing whether nowiki added around heading-like wikitext is needed */
	private const HEADING_NOWIKI_REGEXP = '/^(?:' . Utils::COMMENT_REGEXP_FRAGMENT . ')*'
		. '<nowiki>(=+[^=]+=+)<\/nowiki>(.+)$/D';

	/** @var string[] Separator regexps, keyed by name */
	private static $separatorREs = [
		'pureSepRE' => '/^[ \t\r\n]*$/D',
		'sepPrefixWithNlsRE' => '/^[ \t]*\n+[ \t\r\n]*/',
		'sepSuffixWithNlsRE' => '/\n[ \t\r\n]*$/D',
	];

	/** @var WikitextEscapeHandlers */
	public $wteHandlers;

	/** @var Env */
	public $env;

	/** @var SerializerState */
	private $state;

	/** @var Separators */
	private $separators;

	/**
	 * @var array
	 *   - env: (Env)
	 *   - rtTestMode: (boolean)
	 *   - logType: (string)
	 */
	private $options;

	/** @var string Log type for trace() */
	private $logType;

	/**
	 * @param array $options List of options for serialization:
	 *   - env: (Env) (required)
	 *   - rtTestMode: (boolean) defaults to the site configuration's rtTestMode()
	 *   - logType: (string) defaults to 'trace/wts'
	 */
	public function __construct( $options ) {
		$this->env = $options['env'];
		// Defaults go first: array_merge() lets later arrays overwrite earlier
		// ones, so caller-supplied 'rtTestMode' / 'logType' must come last or
		// they would be silently replaced by the defaults.
		$this->options = array_merge( [
			'rtTestMode' => $this->env->getSiteConfig()->rtTestMode(),
			'logType' => 'trace/wts',
		], $options );
		$this->logType = $this->options['logType'];
		$this->state = new SerializerState( $this, $this->options );
		$this->separators = new Separators( $this->env, $this->state );
		$this->wteHandlers = new WikitextEscapeHandlers( $this->options );
	}

	/**
	 * Main link handler.
	 * @param DOMElement $node
	 * Used in multiple tag handlers (<a> and <link>), and hence added as top-level method
	 * PORT-TODO: rename to something like handleLink()?
	 */
	public function linkHandler( DOMElement $node ): void {
		LinkHandlerUtils::linkHandler( $this->state, $node );
	}

	/**
	 * Main figure handler.
*
	 * All figures have a fixed structure:
	 * ```
	 * <figure or figure-inline typeof="mw:Image...">
	 *  <a or span><img ...><a or span>
	 *  <figcaption>....</figcaption>
	 * </figure or figure-inline>
	 * ```
	 * Pull out this fixed structure, being as generous as possible with
	 * possibly-broken HTML.
	 *
	 * Kept as a top-level method because it is shared by several tag
	 * handlers (<figure>, and <a> by way of linkHandler).
	 * PORT-TODO: rename to something like handleFigure()?
	 *
	 * @param DOMElement $node
	 */
	public function figureHandler( DOMElement $node ): void {
		$state = $this->state;
		LinkHandlerUtils::figureHandler( $state, $node );
	}

	/**
	 * Delegates serialization of a language variant node to
	 * LanguageVariantHandler, using this serializer's state.
	 *
	 * @param DOMNode $node
	 * @return void
	 */
	public function languageVariantHandler( DOMNode $node ): void {
		$state = $this->state;
		LanguageVariantHandler::handleLanguageVariant( $state, $node );
	}

	/**
	 * Work out the separator constraints between two nodes and merge them
	 * with the constraints already held in state, so they can be emitted
	 * when the next content emits source.
	 *
	 * @param DOMNode $nodeA
	 * @param DOMHandler $handlerA
	 * @param DOMNode $nodeB
	 * @param DOMHandler $handlerB
	 */
	public function updateSeparatorConstraints(
		DOMNode $nodeA, DOMHandler $handlerA, DOMNode $nodeB, DOMHandler $handlerB
	): void {
		$separators = $this->separators;
		$separators->updateSeparatorConstraints( $nodeA, $handlerA, $nodeB, $handlerB );
	}

	/**
	 * Build the separator to emit, from the collected (and merged)
	 * constraints and the existing separator text. Called when new output
	 * is triggered.
	 *
	 * @param DOMNode $node
	 * @return string|null
	 */
	public function buildSep( DOMNode $node ): ?string {
		$sep = $this->separators->buildSep( $node );
		return $sep;
	}

	/**
	 * Escape wikitext-like strings in '$text' so that $text renders as a plain string
	 * when rendered as HTML. The escaping is done based on the context in which $text
	 * is present (ex: start-of-line, in a link, etc.)
*
	 * @param SerializerState $state
	 * @param string $text
	 * @param array $opts
	 *   - node: (DOMNode)
	 *   - isLastChild: (bool)
	 * @return string
	 */
	public function escapeWikiText( SerializerState $state, string $text, array $opts ): string {
		$handlers = $this->wteHandlers;
		return $handlers->escapeWikitext( $state, $text, $opts );
	}

	/**
	 * Serialize a DOM element with a fresh serializer instance that shares
	 * this serializer's log type.
	 *
	 * @param array $opts Options for the new serializer; 'logType' is overwritten.
	 * @param DOMElement $elt
	 * @return ConstrainedText|string
	 */
	public function domToWikitext( array $opts, DOMElement $elt ) {
		$opts['logType'] = $this->logType;
		return ( new self( $opts ) )->serializeDOM( $elt );
	}

	/**
	 * Parse an HTML string into a DOM (with 'markNew' set) and serialize it
	 * through domToWikitext().
	 *
	 * @param array $opts
	 * @param string $html
	 * @return ConstrainedText|string
	 */
	public function htmlToWikitext( array $opts, string $html ) {
		return $this->domToWikitext(
			$opts,
			ContentUtils::ppToDOM( $this->env, $html, [ 'markNew' => true ] )
		);
	}

	/**
	 * Look up the wikitext for an attribute name. If data-mw records this
	 * key as generated content (it carries an 'html' form), that HTML is
	 * serialized back to wikitext; otherwise $key is returned unchanged.
	 *
	 * @param DOMElement $node
	 * @param string $key
	 * @return string
	 */
	public function getAttributeKey( DOMElement $node, string $key ): string {
		$dataMwAttribs = DOMDataUtils::getDataMw( $node )->attribs ?? [];
		foreach ( $dataMwAttribs as $attr ) {
			// PORT-FIXME: bool check might not be safe. Need documentation on attrib format.
			$isGeneratedKey = ( $attr[0]->txt ?? null ) === $key && isset( $attr[0]->html );
			if ( !$isGeneratedKey ) {
				continue;
			}
			// This attribute's key is generated content:
			// serialize the HTML back to generator wikitext.
			$serializerOpts = [
				'env' => $this->env,
				'onSOL' => false,
			];
			return $this->htmlToWikitext( $serializerOpts, $attr[0]->html );
		}
		return $key;
	}

	/**
	 * @param DOMElement $node
	 * @param string $key Attribute name.
	 * @param mixed $value Fallback value to use if the attribute is not present.
* @return ConstrainedText|string
	 */
	public function getAttributeValue( DOMElement $node, string $key, $value ) {
		$dataMwAttribs = DOMDataUtils::getDataMw( $node )->attribs ?? [];
		foreach ( $dataMwAttribs as $attr ) {
			// PORT-FIXME: not type safe. Need documentation on attrib format.
			$keyMatches = $attr[0] === $key || ( $attr[0]->txt ?? null ) === $key;
			if ( !$keyMatches ) {
				continue;
			}

			// Only generated values (those with an .html form) are handled
			// here; the value may just be in .txt form.
			//
			// The test is against null, not emptiness: html:"" serializes
			// to "" and is returned. This is used to suppress the =".."
			// string in the attribute in scenarios where the template
			// generates a "k=v" string.
			// Ex: <div {{1x|1=style='color:red'}}>foo</div>
			$generatedHtml = $attr[1]->html ?? null;
			if ( $generatedHtml === null ) {
				continue;
			}

			// Serialize the generated HTML back to generator wikitext.
			$serializerOpts = [
				'env' => $this->env,
				'onSOL' => false,
				'inAttribute' => true,
			];
			return $this->htmlToWikitext( $serializerOpts, $generatedHtml );
		}
		return $value;
	}

	/**
	 * Wrap the result of getAttributeValue() in a shadow-info tuple, or
	 * return null when data-mw holds no generated value for $key.
	 *
	 * @param DOMElement $node
	 * @param string $key
	 * @return array|null A tuple in {@link WTSUtils::getShadowInfo()} format,
	 *   with an extra 'fromDataMW' flag.
	 */
	public function getAttributeValueAsShadowInfo( DOMElement $node, string $key ): ?array {
		$generated = $this->getAttributeValue( $node, $key, null );
		return $generated === null
			? null
			: [
				'value' => $generated,
				'modified' => false,
				'fromsrc' => true,
				'fromDataMW' => true,
			];
	}

	/**
	 * @param DOMElement $dataMWnode
	 * @param DOMElement $htmlAttrNode
	 * @param string $key
	 * @return array A tuple in {@link WTSUtils::getShadowInfo()} format,
*   possibly with an extra 'fromDataMW' flag.
	 */
	public function serializedImageAttrVal(
		DOMElement $dataMWnode, DOMElement $htmlAttrNode, string $key
	): array {
		// Prefer a generated value recorded in data-mw; otherwise fall back
		// to the shadow info of the HTML attribute itself.
		$v = $this->getAttributeValueAsShadowInfo( $dataMWnode, $key );
		return $v ?: WTSUtils::getAttributeShadowInfo( $htmlAttrNode, $key );
	}

	/**
	 * Same as serializedImageAttrVal(), with one node acting as both the
	 * data-mw source and the HTML attribute source.
	 *
	 * @param DOMElement $node
	 * @param string $name
	 * @return array
	 */
	public function serializedAttrVal( DOMElement $node, string $name ): array {
		return $this->serializedImageAttrVal( $node, $node, $name );
	}

	/**
	 * Serialize the opening HTML tag of $node.
	 *
	 * @param DOMElement $node
	 * @param bool $wrapperUnmodified If true, the original source between
	 *   dsr->start and dsr->innerStart() is reused verbatim.
	 * @return string
	 */
	public function serializeHTMLTag( DOMElement $node, bool $wrapperUnmodified ): string {
		// TODO(arlolra): As of 1.3.0, html pre is considered an extension
		// and wrapped in encapsulation.  When that version is no longer
		// accepted for serialization, we can remove this backwards
		// compatibility code.
		//
		// 'inHTMLPre' flag has to be updated always,
		// even when we are selsering in the wrapperUnmodified case.
		$token = WTSUtils::mkTagTk( $node );
		if ( $token->getName() === 'pre' ) {
			// html-syntax pre is very similar to nowiki
			$this->state->inHTMLPre = true;
		}

		if ( $wrapperUnmodified ) {
			$dsr = DOMDataUtils::getDataParsoid( $node )->dsr;
			return $this->state->getOrigSrc( $dsr->start, $dsr->innerStart() ) ?? '';
		}

		$da = $token->dataAttribs;
		if ( !empty( $da->autoInsertedStart ) ) {
			return '';
		}

		$close = '';
		if ( ( Utils::isVoidElement( $token->getName() ) && empty( $da->noClose ) ) ||
			!empty( $da->selfClose )
		) {
			$close = ' /';
		}

		$sAttribs = $this->serializeAttributes( $node, $token );
		if ( strlen( $sAttribs ) > 0 ) {
			$sAttribs = ' ' . $sAttribs;
		}

		// srcTagName cannot be '' so, it is okay to use ?? operator
		$tokenName = $da->srcTagName ?? $token->getName();
		$ret = "<{$tokenName}{$sAttribs}{$close}>";

		if ( strtolower( $tokenName ) === 'nowiki' ) {
			$ret = WTUtils::escapeNowikiTags( $ret );
		}

		return $ret;
	}

	/**
	 * Serialize the closing HTML tag of $node.
	 *
	 * @param DOMElement $node
	 * @param bool $wrapperUnmodified If true, the original source between
	 *   dsr->innerEnd() and dsr->end is reused verbatim.
	 * @return string Empty for auto-inserted end tags, void elements and
	 *   self-closed tags.
	 */
	public function serializeHTMLEndTag( DOMElement $node, $wrapperUnmodified ): string {
		if ( $wrapperUnmodified ) {
			$dsr = DOMDataUtils::getDataParsoid( $node )->dsr;
			return $this->state->getOrigSrc( $dsr->innerEnd(), $dsr->end ) ?? '';
		}

		$token = WTSUtils::mkEndTagTk( $node );
		if ( $token->getName() === 'pre' ) {
			$this->state->inHTMLPre = false;
		}

		// srcTagName cannot be '' so, it is okay to use ?? operator
		$tokenName = $token->dataAttribs->srcTagName ?? $token->getName();
		$ret = '';

		if ( empty( $token->dataAttribs->autoInsertedEnd )
			&& !Utils::isVoidElement( $token->getName() )
			&& empty( $token->dataAttribs->selfClose )
		) {
			$ret = "</{$tokenName}>";
		}

		if ( strtolower( $tokenName ) === 'nowiki' ) {
			$ret = WTUtils::escapeNowikiTags( $ret );
		}

		return $ret;
	}

	/**
	 * Serialize the attributes of $token into a space-separated
	 * `key="value"` string.
	 *
	 * Values are wrapped in double quotes, so any double quote inside a
	 * value is emitted as `&quot;` to keep the attribute well-formed.
	 *
	 * @param DOMElement $node
	 * @param Token $token
	 * @param bool $isWt If true, values not taken from source are emitted
	 *   without wikitext-entity escaping.
	 * @return string
	 */
	public function serializeAttributes( DOMElement $node, Token $token, bool $isWt = false ): string {
		$attribs = $token->attribs;

		$out = [];
		foreach ( $attribs as $kv ) {
			$k = $kv->k;
			$v = null;
			$vInfo = null;

			// Unconditionally ignore
			// (all of the IGNORED_ATTRIBUTES should be filtered out earlier,
			// but ignore them here too just to make sure.)
			if ( isset( self::IGNORED_ATTRIBUTES[$k] ) || $k === 'data-mw' ) {
				continue;
			}

			// Ignore parsoid-like ids. They may have been left behind
			// by clients and shouldn't be serialized. This can also happen
			// in v2/v3 API when there is no matching data-parsoid entry found
			// for this id.
			if ( $k === 'id' && preg_match( '/^mw[\w-]{2,}$/D', $kv->v ) ) {
				if ( WTUtils::isNewElt( $node ) ) {
					$this->env->log( 'warn/html2wt',
						'Parsoid id found on element without a matching data-parsoid '
						. 'entry: ID=' . $kv->v . '; ELT=' . DOMCompat::getOuterHTML( $node )
					);
				} else {
					$vInfo = $token->getAttributeShadowInfo( $k );
					if ( !$vInfo['modified'] && $vInfo['fromsrc'] ) {
						$out[] = $k . '=' . '"' . preg_replace( '/"/', '&quot;', $vInfo['value'] ) . '"';
					}
				}
				continue;
			}

			// Parsoid auto-generates ids for headings and they should
			// be stripped out, except if this is not auto-generated id.
			if ( $k === 'id' && preg_match( '/h[1-6]/', $node->nodeName ) ) {
				if ( !empty( DOMDataUtils::getDataParsoid( $node )->reusedId ) ) {
					$vInfo = $token->getAttributeShadowInfo( $k );
					// PORT-FIXME: is this safe? value could be a token or token array
					$out[] = $k . '=' . '"' . preg_replace( '/"/', '&quot;', $vInfo['value'] ) . '"';
				}
				continue;
			}

			// Strip Parsoid-inserted class="mw-empty-elt" attributes
			if ( $k === 'class'
				&& isset( WikitextConstants::$Output['FlaggedEmptyElts'][$node->nodeName] )
			) {
				$kv->v = preg_replace( '/\bmw-empty-elt\b/', '', $kv->v, 1 );
				if ( !$kv->v ) {
					continue;
				}
			}

			// Strip other Parsoid-generated values
			//
			// FIXME: Given that we are currently escaping about/typeof keys
			// that show up in wikitext, we could unconditionally strip these
			// away right now.
			$parsoidValueRegExp = self::PARSOID_ATTRIBUTES[$k] ?? null;
			if ( $parsoidValueRegExp && preg_match( $parsoidValueRegExp, $kv->v ) ) {
				$v = preg_replace( $parsoidValueRegExp, '', $kv->v );
				if ( $v ) {
					$out[] = $k . '=' . '"' . $v . '"';
				}
				continue;
			}

			if ( strlen( $k ) > 0 ) {
				$vInfo = $token->getAttributeShadowInfo( $k );
				$v = $vInfo['value'];
				// Deal with k/v's that were template-generated
				$kk = $this->getAttributeKey( $node, $k );
				// Pass in kv.k, not k since k can potentially
				// be original wikitext source for 'k' rather than
				// the string value of the key.
				$vv = $this->getAttributeValue( $node, $kv->k, $v );
				// Remove encapsulation from protected attributes
				// in pegTokenizer.pegjs:generic_newline_attribute
				$kk = preg_replace( '/^data-x-/i', '', $kk, 1 );
				// PORT-FIXME: is this type safe? $vv could be a ConstrainedText
				if ( strlen( $vv ) > 0 ) {
					if ( !$vInfo['fromsrc'] && !$isWt ) {
						// Escape wikitext entities, and '>' as well
						$vv = preg_replace( '/>/', '&gt;', Utils::escapeWtEntities( $vv ) );
					}
					$out[] = $kk . '=' . '"' . preg_replace( '/"/', '&quot;', $vv ) . '"';
				} elseif ( preg_match( '/[{<]/', $kk ) ) {
					// Templated, <*include*>, or <ext-tag> generated
					$out[] = $kk;
				} else {
					$out[] = $kk . '=""';
				}
				continue;
			// PORT-FIXME: is this type safe? $k->v could be a Token or Token array
			} elseif ( strlen( $kv->v ) ) {
				// not very likely..
				$out[] = $kv->v;
			}
		}

		// SSS FIXME: It can be reasonably argued that we can permanently delete
		// dangerous and unacceptable attributes in the interest of safety/security
		// and the resultant dirty diffs should be acceptable.  But, this is
		// something to do in the future once we have passed the initial tests
		// of parsoid acceptance.
		//
		// 'a' data attribs -- look for attributes that were removed
		// as part of sanitization and add them back
		$dataAttribs = $token->dataAttribs;
		if ( isset( $dataAttribs->a ) && isset( $dataAttribs->sa ) ) {
			$aKeys = array_keys( $dataAttribs->a );
			foreach ( $aKeys as $k ) {
				// Attrib not present -- sanitized away!
				if ( !KV::lookupKV( $attribs, $k ) ) {
					$v = $dataAttribs->sa[$k] ?? null;
					// PORT-FIXME check type
					if ( $v !== null && $v !== '' ) {
						$out[] = $k . '=' . '"' . preg_replace( '/"/', '&quot;', $v ) . '"';
					} else {
						// at least preserve the key
						$out[] = $k;
					}
				}
			}
		}
		// XXX: round-trip optional whitespace / line breaks etc
		return implode( ' ', $out );
	}

	/**
	 * Emit the recorded LI-hack source (data-parsoid 'liHackSrc') for $node
	 * when the node is in a position where the hack applies.
	 *
	 * @param DOMElement $node
	 */
	public function handleLIHackIfApplicable( DOMElement $node ): void {
		$liHackSrc = DOMDataUtils::getDataParsoid( $node )->liHackSrc ?? null;
		$prev = DOMUtils::previousNonSepSibling( $node );

		// If we are dealing with an LI hack, then we must ensure that
		// we are dealing with either
		//
		//   1. A node with no previous sibling inside of a list.
		//
		//   2. A node whose previous sibling is a list element.
		if ( $liHackSrc !== null
			// Case 1
			&& ( ( $prev === null && DOMUtils::isList( $node->parentNode ) )
				// Case 2
				|| ( $prev !== null && DOMUtils::isListItem( $prev ) ) )
		) {
			$this->state->emitChunk( $liHackSrc, $node );
		}
	}

	/**
	 * Substitute $value into the first run of underscores ("hole") in
	 * $format. A value shorter than the hole is right-padded with spaces
	 * to the hole's width; an empty value simply removes the hole.
	 *
	 * @param string $format
	 * @param string $value
	 * @param bool $forceTrim Whether to trim() $value before substituting.
	 * @return string
	 */
	private function formatStringSubst( string $format, string $value, bool $forceTrim ): string {
		// PORT-FIXME: JS is more agressive and removes various unicode whitespaces
		// (most notably nbsp). Does that matter?
		if ( $forceTrim ) {
			$value = trim( $value );
		}
		return preg_replace_callback( '/_+/', function ( $m ) use ( $value ) {
			if ( $value === '' ) {
				return $value;
			}
			$hole = $m[0];
			$holeLen = strlen( $hole );
			$valueLen = mb_strlen( $value );
			return $holeLen <= $valueLen ? $value : $value .
str_repeat( ' ', $holeLen - $valueLen );
		}, $format, 1 );
	}

	/**
	 * Generates a template parameter sort function that tries to preserve existing ordering
	 * but also to follow the order prescribed by the templatedata.
	 *
	 * The returned comparator orders two parameter keys by, in turn:
	 *   1. the position of the nearest parameter from the original wikitext,
	 *   2. the templatedata 'paramOrder' (aliases sort with their main name),
	 *   3. the position in $dataMwKeys, which keeps the sort stable.
	 *
	 * @param array $dpArgInfo Per-parameter info from data-parsoid; each entry
	 *   is read for its 'k' property.
	 * @param array|null $tplData Templatedata; 'paramOrder' and
	 *   'params'[..]['aliases'] are read when present.
	 * @param array $dataMwKeys Parameter keys in their current data-mw order.
	 * @return Closure
	 * PORT-FIXME: there's probably a better way to do this
	 */
	private function createParamComparator(
		array $dpArgInfo, ?array $tplData, array $dataMwKeys
	): Closure {
		// Record order of parameters in new data-mw.
		// This has to be a map keyed by parameter name: the comparator looks
		// entries up by name, so a plain list of [key, order] pairs would
		// never match and the tertiary sort key would be lost.
		$newOrder = [];
		foreach ( $dataMwKeys as $i => $key ) {
			$newOrder[$key] = [ 'order' => $i ];
		}
		// Record order of parameters in templatedata (if present)
		$tplDataOrder = [];
		$aliasMap = [];
		$keys = [];
		if ( $tplData && isset( $tplData['paramOrder'] ) ) {
			foreach ( $tplData['paramOrder'] as $i => $key ) {
				$tplDataOrder[$key] = [ 'order' => $i ];
				$aliasMap[$key] = [ 'key' => $key, 'order' => -1 ];
				$keys[] = $key;
				// Aliases have the same sort order as the main name.
				$aliases = $tplData['params'][$key]['aliases'] ?? [];
				foreach ( $aliases as $j => $alias ) {
					$aliasMap[$alias] = [ 'key' => $key, 'order' => $j ];
				}
			}
		}
		// Record order of parameters in original wikitext (from data-parsoid)
		$origOrder = [];
		foreach ( $dpArgInfo as $i => $argInfo ) {
			$origOrder[$argInfo->k] = [ 'order' => $i, 'dist' => 0 ];
		}
		// Canonical parameter key gets the same order as an alias parameter
		// found in the original wikitext.
		foreach ( $dpArgInfo as $i => $argInfo ) {
			$canon = $aliasMap[$argInfo->k] ?? null;
			if ( $canon !== null && !array_key_exists( $canon['key'], $origOrder ) ) {
				$origOrder[$canon['key']] = $origOrder[$argInfo->k];
			}
		}
		// Find the closest "original parameter" for each templatedata parameter,
		// so that newly-added parameters are placed near the parameters which
		// templatedata says they should be adjacent to.
		$nearestOrder = $origOrder;
		// $nearestOrder is captured by reference because the reducer updates it
		// as a side effect; the reduce results themselves are discarded.
		$reduceF = function ( $acc, $val ) use ( &$origOrder, &$nearestOrder ) {
			if ( isset( $origOrder[$val] ) ) {
				$acc = $origOrder[$val];
			}
			if ( !( isset( $nearestOrder[$val] ) && $nearestOrder[$val]['dist'] < $acc['dist'] ) ) {
				$nearestOrder[$val] = $acc;
			}
			return [ 'order' => $acc['order'], 'dist' => $acc['dist'] + 1 ];
		};
		// Find closest original parameter before the key.
		// @phan-suppress-next-line PhanPluginUseReturnValueInternalKnown
		array_reduce( $keys, $reduceF, [ 'order' => -1, 'dist' => 2 * count( $keys ) ] );
		// Find closest original parameter after the key.
		// @phan-suppress-next-line PhanPluginUseReturnValueInternalKnown
		array_reduce( array_reverse( $keys ), $reduceF,
			[ 'order' => count( $origOrder ), 'dist' => count( $keys ) ] );

		// Helper function to return a large number if the given key isn't
		// in the sort order map
		$big = max( count( $nearestOrder ), count( $newOrder ) );
		$defaultGet = function ( $map, $key1, $key2 = null ) use ( $big ) {
			$key = ( !$key2 || isset( $map[$key1] ) ) ? $key1 : $key2;
			return $map[$key]['order'] ?? $big;
		};

		// All maps are final at this point, so they are captured by value.
		return function ( $a, $b ) use (
			$aliasMap, $defaultGet, $nearestOrder, $tplDataOrder, $newOrder
		) {
			$aCanon = $aliasMap[$a] ?? [ 'key' => $a, 'order' => -1 ];
			$bCanon = $aliasMap[$b] ?? [ 'key' => $b, 'order' => -1 ];
			// primary key is `nearestOrder` (nearest original parameter)
			$aOrder = $defaultGet( $nearestOrder, $a, $aCanon['key'] );
			$bOrder = $defaultGet( $nearestOrder, $b, $bCanon['key'] );
			if ( $aOrder !== $bOrder ) {
				return $aOrder <=> $bOrder;
			}
			// secondary key is templatedata order
			if ( $aCanon['key'] === $bCanon['key'] ) {
				return $aCanon['order'] <=> $bCanon['order'];
			}
			$aOrder = $defaultGet( $tplDataOrder, $aCanon['key'] );
			$bOrder = $defaultGet( $tplDataOrder, $bCanon['key'] );
			if ( $aOrder !== $bOrder ) {
				return $aOrder <=> $bOrder;
			}
			// tertiary key is original input order (makes sort stable)
			$aOrder = $defaultGet( $newOrder, $a );
			$bOrder = $defaultGet( $newOrder, $b );
			return $aOrder <=> $bOrder;
		};
	}

	/**
	 * Serialize part of a templatelike expression.
	 * @param SerializerState $state
	 * @param string $buf
	 * @param DOMElement $node
	 * @param string $type The type of the part to be serialized. One of template, templatearg,
	 *   parserfunction.
	 * @param stdClass $part The expression fragment to serialize. See $srcParts
	 *   in serializeFromParts() for format.
	 * @param ?array $tplData Templatedata, see
	 *   https://github.com/wikimedia/mediawiki-extensions-TemplateData/blob/master/Specification.md
	 * @param mixed $prevPart Previous part. See $srcParts in serializeFromParts(). PORT-FIXME type?
	 * @param mixed $nextPart Next part. See $srcParts in serializeFromParts(). PORT-FIXME type?
	 * @return string
	 */
	private function serializePart(
		SerializerState $state, string $buf, DOMElement $node, string $type, stdClass $part,
		?array $tplData, $prevPart, $nextPart
	): string {
		// Parse custom format specification, if present.
		$defaultBlockSpc = "{{_\n| _ = _\n}}"; // "block"
		$defaultInlineSpc = '{{_|_=_}}'; // "inline"

		$format = isset( $tplData['format'] ) ?
strtolower( $tplData['format'] ) : null; 732 if ( $format === 'block' ) { 733 $format = $defaultBlockSpc; 734 } elseif ( $format === 'inline' ) { 735 $format = $defaultInlineSpc; 736 } 737 // Check format string for validity. 738 preg_match( self::FORMATSTRING_REGEXP, $format, $parsedFormat ); 739 if ( !$parsedFormat ) { 740 preg_match( self::FORMATSTRING_REGEXP, $defaultInlineSpc, $parsedFormat ); 741 $format = null; // Indicates that no valid custom format was present. 742 } 743 $formatSOL = $parsedFormat[1] ?? ''; 744 $formatStart = $parsedFormat[2] ?? ''; 745 $formatParamName = $parsedFormat[3] ?? ''; 746 $formatParamValue = $parsedFormat[4] ?? ''; 747 $formatEnd = $parsedFormat[5] ?? ''; 748 $formatEOL = $parsedFormat[6] ?? ''; 749 $forceTrim = ( $format !== null ) || WTUtils::isNewElt( $node ); 750 751 // Shoehorn formatting of top-level templatearg wikitext into this code. 752 if ( $type === 'templatearg' ) { 753 $formatStart = preg_replace( '/{{/', '{{{', $formatStart, 1 ); 754 $formatEnd = preg_replace( '/}}/', '}}}', $formatEnd, 1 ); 755 } 756 757 // handle SOL newline requirement 758 if ( $formatSOL && !preg_match( '/\n$/D', ( $prevPart !== null ) ? $buf : $state->sep->src ) ) { 759 $buf .= "\n"; 760 } 761 762 // open the transclusion 763 $tgt = $part->target; 764 '@phan-var stdClass $tgt'; 765 $buf .= $this->formatStringSubst( $formatStart, $tgt->wt, $forceTrim ); 766 767 // Trim whitespace from data-mw keys to deal with non-compliant 768 // clients. Make sure param info is accessible for the stripped key 769 // since later code will be using the stripped key always. 770 $tplKeysFromDataMw = array_map( function ( $key ) use ( $part ) { 771 // PORT-FIXME do we care about different whitespace semantics for trim? 
772 $strippedKey = trim( $key ); 773 if ( $key !== $strippedKey ) { 774 $part->params->{$strippedKey} = $part->params->{$key}; 775 } 776 return $strippedKey; 777 }, array_keys( get_object_vars( $part->params ) ) ); 778 if ( !$tplKeysFromDataMw ) { 779 return $buf . $formatEnd; 780 } 781 782 $env = $this->env; 783 784 // Per-parameter info from data-parsoid for pre-existing parameters 785 $dp = DOMDataUtils::getDataParsoid( $node ); 786 $dpArgInfo = isset( $part->i ) ? ( $dp->pi[$part->i] ?? [] ) : []; 787 788 // Build a key -> arg info map 789 $dpArgInfoMap = array_column( $dpArgInfo, null, 'k' ); 790 791 // 1. Process all parameters and build a map of 792 // arg-name -> [serializeAsNamed, name, value] 793 // 794 // 2. Serialize tpl args in required order 795 // 796 // 3. Format them according to formatParamName/formatParamValue 797 798 $kvMap = []; 799 foreach ( $tplKeysFromDataMw as $key ) { 800 $param = $part->params->{$key}; 801 $argInfo = $dpArgInfoMap[$key] ?? []; 802 803 // TODO: Other formats? 804 // Only consider the html parameter if the wikitext one 805 // isn't present at all. If it's present but empty, 806 // that's still considered a valid parameter. 807 if ( property_exists( $param, 'wt' ) ) { 808 $value = $param->wt; 809 } else { 810 $value = $this->htmlToWikitext( [ 'env' => $env ], $param->html ); 811 } 812 813 Assert::invariant( is_string( $value ), "For param: $key, wt property should be a string ' 814 . 'but got: $value" ); 815 816 $serializeAsNamed = !empty( $argInfo->named ); 817 818 // The name is usually equal to the parameter key, but 819 // if there's a key.wt attribute, use that. 820 $name = null; 821 if ( isset( $param->key->wt ) ) { 822 $name = $param->key->wt; 823 // And make it appear even if there wasn't 824 // data-parsoid information. 825 $serializeAsNamed = true; 826 } else { 827 $name = $key; 828 } 829 830 // Use 'k' as the key, not 'name'. 
831 // 832 // The normalized form of 'k' is used as the key in both 833 // data-parsoid and data-mw. The full non-normalized form 834 // is present in '$param->key->wt' 835 $kvMap[$key] = [ 'serializeAsNamed' => $serializeAsNamed, 'name' => $name, 'value' => $value ]; 836 } 837 838 $argOrder = array_keys( $kvMap ); 839 usort( $argOrder, $this->createParamComparator( $dpArgInfo, $tplData, $argOrder ) ); 840 841 $argIndex = 1; 842 $numericIndex = 1; 843 844 $numPositionalArgs = array_reduce( $dpArgInfo, function ( $n, $pi ) use ( $part ) { 845 return ( isset( $part->params->{$pi->k} ) && empty( $pi->named ) ) ? $n + 1 : $n; 846 }, 0 ); 847 848 $argBuf = []; 849 foreach ( $argOrder as $param ) { 850 $kv = $kvMap[$param]; 851 // Add nowiki escapes for the arg value, as required 852 $escapedValue = $this->wteHandlers->escapeTplArgWT( $kv['value'], [ 853 'serializeAsNamed' => $kv['serializeAsNamed'] || $param !== $numericIndex, 854 'type' => $type, 855 'argPositionalIndex' => $numericIndex, 856 'numPositionalArgs' => $numPositionalArgs, 857 'argIndex' => $argIndex++, 858 'numArgs' => count( $tplKeysFromDataMw ), 859 ] ); 860 if ( $escapedValue['serializeAsNamed'] ) { 861 // WS trimming for values of named args 862 // PORT-FIXME check different whitespace trimming semantics 863 $argBuf[] = [ 'dpKey' => $param, 'name' => $kv['name'], 'value' => trim( $escapedValue['v'] ) ]; 864 } else { 865 $numericIndex++; 866 // No WS trimming for positional args 867 $argBuf[] = [ 'dpKey' => $param, 'name' => null, 'value' => $escapedValue['v'] ]; 868 } 869 } 870 871 // If no explicit format is provided, default format is: 872 // - 'inline' for new args 873 // - whatever format is available from data-parsoid for old args 874 // (aka, overriding formatParamName/formatParamValue) 875 // 876 // If an unedited node OR if paramFormat is unspecified, 877 // this strategy prevents unnecessary normalization 878 // of edited transclusions which don't have valid 879 // templatedata formatting 
information. 880 881 // "magic case": If the format string ends with a newline, an extra newline is added 882 // between the template name and the first parameter. 883 884 foreach ( $argBuf as $arg ) { 885 $name = $arg['name']; 886 $val = $arg['value']; 887 if ( $name === null ) { 888 // We are serializing a positional parameter. 889 // Whitespace is significant for these and 890 // formatting would change semantics. 891 $name = ''; 892 $modFormatParamName = '|_'; 893 $modFormatParamValue = '_'; 894 } elseif ( $name === '' ) { 895 // No spacing for blank parameters ({{foo|=bar}}) 896 // This should be an edge case and probably only for 897 // inline-formatted templates, but we are consciously 898 // forcing this default here. Can revisit if this is 899 // ever a problem. 900 $modFormatParamName = '|_='; 901 $modFormatParamValue = '_'; 902 } else { 903 // Preserve existing spacing, esp if there was a comment 904 // embedded in it. Otherwise, follow TemplateData's lead. 905 // NOTE: In either case, we are forcibly normalizing 906 // non-block-formatted transclusions into block formats 907 // by adding missing newlines. 908 $spc = $dpArgInfoMap[$arg['dpKey']]->spc ?? null; 909 if ( $spc && ( !$format || preg_match( Utils::COMMENT_REGEXP, $spc[3] ?? '' ) ) ) { 910 $nl = ( substr( $formatParamName, 0, 1 ) === "\n" ) ? "\n" : ''; 911 $modFormatParamName = $nl . '|' . $spc[0] . '_' . $spc[1] . '=' . $spc[2]; 912 $modFormatParamValue = '_' . $spc[3]; 913 } else { 914 $modFormatParamName = $formatParamName; 915 $modFormatParamValue = $formatParamValue; 916 } 917 } 918 919 // Don't create duplicate newlines. 
$trailing = preg_match( self::TRAILING_COMMENT_OR_WS_AFTER_NL_REGEXP, $buf );
			if ( $trailing && substr( $formatParamName, 0, 1 ) === "\n" ) {
				$modFormatParamName = substr( $formatParamName, 1 );
			}

			$buf .= $this->formatStringSubst( $modFormatParamName, $name, $forceTrim );
			$buf .= $this->formatStringSubst( $modFormatParamValue, $val, $forceTrim );
		}

		// Don't create duplicate newlines.
		if ( preg_match( self::TRAILING_COMMENT_OR_WS_AFTER_NL_REGEXP, $buf )
			&& substr( $formatEnd, 0, 1 ) === "\n"
		) {
			$buf .= substr( $formatEnd, 1 );
		} else {
			$buf .= $formatEnd;
		}

		if ( $formatEOL ) {
			if ( $nextPart === null ) {
				// This is the last part of the block. Add the \n only
				// if the next non-comment node is not a text node
				// or if the text node doesn't have a leading \n.
				$next = DOMUtils::nextNonDeletedSibling( $node );
				while ( $next && DOMUtils::isComment( $next ) ) {
					$next = DOMUtils::nextNonDeletedSibling( $next );
				}
				if ( !DOMUtils::isText( $next ) || substr( $next->nodeValue, 0, 1 ) !== "\n" ) {
					$buf .= "\n";
				}
			} elseif ( !is_string( $nextPart ) || substr( $nextPart, 0, 1 ) !== "\n" ) {
				// If nextPart is another template, and it wants a leading nl,
				// this \n we add here will count towards that because of the
				// formatSOL check at the top.
				$buf .= "\n";
			}
		}

		return $buf;
	}

	/**
	 * Serialize a template from its parts.
	 *
	 * Each part is either a plain string (appended verbatim) or an object
	 * carrying a `templatearg` or `template` property, which is handed to
	 * serializePart() together with its neighbouring parts.
	 *
	 * @param SerializerState $state
	 * @param DOMElement $node
	 * @param (stdClass|string)[] $srcParts Parts in source order
	 * @return string
	 */
	public function serializeFromParts(
		SerializerState $state, DOMElement $node, array $srcParts
	): string {
		$env = $this->env;
		// TemplateData is only consulted for new or modified transclusions.
		$useTplData = WTUtils::isNewElt( $node ) || DiffUtils::hasDiffMarkers( $node, $env );
		$buf = '';
		foreach ( $srcParts as $i => $part ) {
			$prevPart = $srcParts[$i - 1] ?? null;
			$nextPart = $srcParts[$i + 1] ?? null;
			$tplArg = $part->templatearg ?? null;
			if ( $tplArg ) {
				$buf = $this->serializePart( $state, $buf, $node, 'templatearg',
					$tplArg, null, $prevPart, $nextPart );
				continue;
			}

			$tpl = $part->template ?? null;
			if ( !$tpl ) {
				// Neither a template arg nor a template: emit as-is.
				$buf .= $part;
				continue;
			}

			// transclusion: tpl or parser function
			$tplHref = $tpl->target->href ?? null;
			$isTpl = is_string( $tplHref );
			$type = $isTpl ? 'template' : 'parserfunction';

			// While the API supports fetching multiple template data objects in one call,
			// we will fetch one at a time to benefit from cached responses.
			//
			// Fetch template data for the template
			$tplData = null;
			if ( $isTpl && $useTplData && !$env->noDataAccess() ) {
				$title = preg_replace( '#^\./#', '', $tplHref, 1 );
				try {
					$tplData = $env->getDataAccess()->fetchTemplateData( $env->getPageConfig(), $title );
				} catch ( Exception $err ) {
					// Log the error, and use default serialization mode.
					// Better to misformat a transclusion than to lose an edit.
					$env->log( 'error/html2wt/tpldata', $err );
				}
			}
			// If the template doesn't exist, or does but has no TemplateData, ignore it
			if ( !empty( $tplData['missing'] ) || !empty( $tplData['notemplatedata'] ) ) {
				$tplData = null;
			}
			$buf = $this->serializePart( $state, $buf, $node, $type, $tpl, $tplData, $prevPart, $nextPart );
		}
		return $buf;
	}

	/**
	 * Build the opening tag of an extension from the node's data-mw.
	 *
	 * The tag is emitted self-closed (` />`) when data-mw has no non-empty
	 * body, and open (`>`) otherwise.
	 *
	 * @param DOMElement $node
	 * @param SerializerState $state
	 * @return string
	 */
	public function serializeExtensionStartTag( DOMElement $node, SerializerState $state ): string {
		$dataMw = DOMDataUtils::getDataMw( $node );
		$extName = $dataMw->name;

		// Serialize extension attributes in normalized form as:
		// key='value'
		// FIXME: with no dataAttribs, shadow info will mark it as new
		$attrs = (array)( $dataMw->attrs ?? [] );
		$extTok = new TagTk( $extName, array_map( function ( $key ) use ( $attrs ) {
			return new KV( $key, $attrs[$key] );
		}, array_keys( $attrs ) ) );

		// Carry over about/typeof from the node onto the token.
		if ( $node->hasAttribute( 'about' ) ) {
			$extTok->addAttribute( 'about', $node->getAttribute( 'about' ) );
		}
		if ( $node->hasAttribute( 'typeof' ) ) {
			$extTok->addAttribute( 'typeof', $node->getAttribute( 'typeof' ) );
		}

		$attrStr = $this->serializeAttributes( $node, $extTok );
		$src = '<' . $extName;
		if ( $attrStr ) {
			$src .= ' ' . $attrStr;
		}
		return $src . ( !empty( $dataMw->body ) ? '>' : ' />' );
	}

	/**
	 * Fallback serialization of an extension node: start tag, followed by
	 * the `extsrc` recorded in data-mw (if any) and the closing tag.
	 *
	 * @param DOMElement $node
	 * @param SerializerState $state
	 * @return string
	 */
	public function defaultExtensionHandler( DOMElement $node, SerializerState $state ): string {
		$dataMw = DOMDataUtils::getDataMw( $node );
		$src = $this->serializeExtensionStartTag( $node, $state );
		if ( !isset( $dataMw->body ) ) {
			return $src; // We self-closed this already.
		} elseif ( is_string( $dataMw->body->extsrc ?? null ) ) {
			$src .= $dataMw->body->extsrc;
		} else {
			// No source to emit: log it and still close the tag below.
			$state->getEnv()->log( 'error/html2wt/ext', 'Extension src unavailable for: '
				. DOMCompat::getOuterHTML( $node ) );
		}
		return $src . '</' . $dataMw->name . '>';
	}

	/**
	 * Consolidate separator handling when emitting text.
	 *
	 * Separator-like leading/trailing whitespace is split off $res and
	 * routed through the separator state; the remainder is emitted as a
	 * chunk, with wikitext escaping unless $omitEscaping is set.
	 *
	 * @param string $res
	 * @param DOMNode $node
	 * @param bool $omitEscaping
	 */
	private function serializeText( string $res, DOMNode $node, bool $omitEscaping ): void {
		$state = $this->state;

		// Deal with trailing separator-like text (at least 1 newline and other whitespace)
		preg_match( self::$separatorREs['sepSuffixWithNlsRE'], $res, $newSepMatch );
		$res = preg_replace( self::$separatorREs['sepSuffixWithNlsRE'], '', $res, 1 );

		if ( !$state->inIndentPre ) {
			// Strip leading newlines and other whitespace
			if ( preg_match( self::$separatorREs['sepPrefixWithNlsRE'], $res, $match ) ) {
				$state->appendSep( $match[0] );
				$res = substr( $res, strlen( $match[0] ) );
			}
		}

		if ( $omitEscaping ) {
			$state->emitChunk( $res, $node );
		} else {
			// Always escape entities
			$res = Utils::escapeWtEntities( $res );

			// If not in pre context, escape wikitext
			// XXX refactor: Handle this with escape handlers instead!
			$state->escapeText = ( $state->onSOL || !$state->currNodeUnmodified ) && !$state->inHTMLPre;
			$state->emitChunk( $res, $node );
			$state->escapeText = false;
		}

		// Move trailing newlines into the next separator.
		// SSS FIXME: when a separator source is already present the stripped
		// NLs are simply dropped -- what should we be doing with them?
		if ( $newSepMatch && !$state->sep->src ) {
			$state->appendSep( $newSepMatch[0] );
		}
	}

	/**
	 * Serialize the content of a text node (with escaping enabled).
	 * @param DOMNode $node
	 * @return DOMNode|null The node's next sibling
	 */
	private function serializeTextNode( DOMNode $node ): ?DOMNode {
		$this->serializeText( $node->nodeValue, $node, false );
		return $node->nextSibling;
	}

	/**
	 * Emit non-separator wikitext that does not need to be escaped.
	 * @param string $res
	 * @param DOMNode $node
	 */
	public function emitWikitext( string $res, DOMNode $node ): void {
		$this->serializeText( $res, $node, true );
	}

	/**
	 * DOM-based serialization.
	 *
	 * In selser mode, an unmodified node with usable DSR is emitted straight
	 * from the original source; otherwise the node is passed to its DOM
	 * handler.
	 *
	 * @param DOMElement $node
	 * @param DOMHandler $domHandler
	 * @return DOMNode|null
	 */
	private function serializeDOMNode( DOMElement $node, DOMHandler $domHandler ) {
		// To serialize a node from source, the node should satisfy these
		// conditions:
		//
		// 1. It should not have a diff marker or be in a modified subtree
		//    WTS should not be in a subtree with a modification flag that
		//    applies to every node of a subtree (rather than an indication
		//    that some node in the subtree is modified).
		//
		// 2. It should continue to be valid in any surrounding edited context
		//    For some nodes, modification of surrounding context
		//    can change serialized output of this node
		//    (ex: <td>s and whether you emit | or || for them)
		//
		// 3. It should have valid, usable DSR
		//
		// 4. Either it has non-zero positive DSR width, or meets one of the
		//    following:
		//
		//    4a. It is content like <p><br/><p> or an automatically-inserted
		//        wikitext <references/> (HTML <ol>) (will have dsr-width 0)
		//    4b. it is fostered content (will have dsr-width 0)
		//    4c. it is misnested content (will have dsr-width 0)
		//
		// SSS FIXME: Additionally, we can guard against buggy DSR with
		// some sanity checks. We can test that non-sep src content
		// leading wikitext markup corresponds to the node type.
		//
		// Ex: If node.nodeName is 'UL', then src[0] should be '*'
		//
		// TO BE DONE

		$state = $this->state;
		$wrapperUnmodified = false;
		$dp = DOMDataUtils::getDataParsoid( $node );

		if ( $state->selserMode
			&& !$state->inModifiedContent
			&& WTSUtils::origSrcValidInEditedContext( $state->getEnv(), $node )
			&& Utils::isValidDSR( $dp->dsr ?? null )
			&& ( $dp->dsr->end > $dp->dsr->start
				// FIXME: <p><br/></p>
				// nodes that have dsr width 0 because currently,
				// we emit newlines outside the p-nodes. So, this check
				// tries to handle that scenario.
				|| ( $dp->dsr->end === $dp->dsr->start &&
					( preg_match( '/^(p|br)$/D', $node->nodeName )
						|| !empty( DOMDataUtils::getDataMw( $node )->autoGenerated ) ) )
				|| !empty( $dp->fostered )
				|| !empty( $dp->misnested )
			)
		) {
			if ( !DiffUtils::hasDiffMarkers( $node, $this->env ) ) {
				// If this HTML node will disappear in wikitext because of
				// zero width, then the separator constraints will carry over
				// to the node's children.
				//
				// Since we dont recurse into 'node' in selser mode, we update the
				// separator constraintInfo to apply to 'node' and its first child.
				//
				// We could clear constraintInfo altogether which would be
				// correct (but could normalize separators and introduce dirty
				// diffs unnecessarily).

				$state->currNodeUnmodified = true;

				if ( WTUtils::isZeroWidthWikitextElt( $node )
					&& $node->hasChildNodes()
					&& ( $state->sep->constraints['constraintInfo']['sepType'] ?? null ) === 'sibling'
				) {
					$state->sep->constraints['constraintInfo']['onSOL'] = $state->onSOL;
					$state->sep->constraints['constraintInfo']['sepType'] = 'parent-child';
					$state->sep->constraints['constraintInfo']['nodeA'] = $node;
					$state->sep->constraints['constraintInfo']['nodeB'] = $node->firstChild;
				}

				$out = $state->getOrigSrc( $dp->dsr->start, $dp->dsr->end ) ?? '';

				$this->trace( 'ORIG-src with DSR', function () use ( $dp, $out ) {
					return '[' . $dp->dsr->start . ',' . $dp->dsr->end . '] = '
						. PHPUtils::jsonEncode( $out );
				} );

				// When reusing source, we should only suppress serializing
				// to a single line for the cases we've allowed in
				// normal serialization.
				$suppressSLC = WTUtils::isFirstEncapsulationWrapperNode( $node )
					|| in_array( $node->nodeName, [ 'dl', 'ul', 'ol' ], true )
					|| ( $node->nodeName === 'table'
						&& $node->parentNode->nodeName === 'dd'
						&& DOMUtils::previousNonSepSibling( $node ) === null );

				// Use selser to serialize this text!  The original
				// wikitext is `out`.  But first allow
				// `ConstrainedText.fromSelSer` to figure out the right
				// type of ConstrainedText chunk(s) to use to represent
				// `out`, based on the node type.  Since we might actually
				// have to break this wikitext into multiple chunks,
				// `fromSelSer` returns an array.
				if ( $suppressSLC ) {
					$state->singleLineContext->disable();
				}
				foreach ( ConstrainedText::fromSelSer( $out, $node, $dp, $state->getEnv() ) as $ct ) {
					$state->emitChunk( $ct, $ct->node );
				}
				if ( $suppressSLC ) {
					$state->singleLineContext->pop();
				}

				// Skip over encapsulated content since it has already been
				// serialized.
				if ( WTUtils::isFirstEncapsulationWrapperNode( $node ) ) {
					return WTUtils::skipOverEncapsulatedContent( $node );
				} else {
					return $node->nextSibling;
				}
			}

			if ( DiffUtils::onlySubtreeChanged( $node, $this->env )
				&& WTSUtils::hasValidTagWidths( $dp->dsr ?? null )
				// In general, we want to avoid nodes with auto-inserted
				// start/end tags since dsr for them might not be entirely
				// trustworthy. But, since wikitext does not have closing tags
				// for tr/td/th in the first place, dsr for them can be trusted.
				//
				// SSS FIXME: I think this is only for b/i tags for which we do
				// dsr fixups. It may be okay to use this for other tags.
				&& ( ( empty( $dp->autoInsertedStart ) && empty( $dp->autoInsertedEnd ) )
					|| preg_match( '/^(td|th|tr)$/D', $node->nodeName ) )
			) {
				$wrapperUnmodified = true;
			}
		}

		$state->currNodeUnmodified = false;

		// Remember the current flag so it can be restored after handling
		// a node that carries an inserted diff mark.
		$currentModifiedState = $state->inModifiedContent;

		$inModifiedContent = $state->selserMode && DiffUtils::hasInsertedDiffMark( $node, $this->env );

		if ( $inModifiedContent ) {
			$state->inModifiedContent = true;
		}

		$next = $domHandler->handle( $node, $state, $wrapperUnmodified );

		if ( $inModifiedContent ) {
			$state->inModifiedContent = $currentModifiedState;
		}

		return $next;
	}

	/**
	 * Internal worker. Recursively serialize a DOM subtree.
	 *
	 * Element nodes go through serializeDOMNode(), text nodes through
	 * serializeTextNode(); comments, diff markers and pure-separator text
	 * are folded into the separator state and return early.
	 *
	 * @private
	 * @param DOMNode $node
	 * @return DOMNode|null
	 */
	public function serializeNode( DOMNode $node ): ?DOMNode {
		$domHandler = $method = null;
		$domHandlerFactory = new DOMHandlerFactory();
		$state = $this->state;

		if ( $state->selserMode ) {
			$this->trace(
				function () use ( $node ) {
					return WTSUtils::traceNodeName( $node );
				},
				'; prev-unmodified: ', $state->prevNodeUnmodified,
				'; SOL: ', $state->onSOL );
		} else {
			$this->trace(
				function () use ( $node ) {
					return WTSUtils::traceNodeName( $node );
				},
				'; SOL: ', $state->onSOL );
		}

		switch ( $node->nodeType ) {
			case XML_ELEMENT_NODE:
				'@phan-var DOMElement $node';/** @var DOMElement $node */
				// Ignore DiffMarker metas, but clear unmodified node state
				if ( DOMUtils::isDiffMarker( $node ) ) {
					$state->updateModificationFlags( $node );
					// `state.sep.lastSourceNode` is cleared here so that removed
					// separators between otherwise unmodified nodes don't get
					// restored.
					$state->updateSep( $node );
					return $node->nextSibling;
				}
				$domHandler = $domHandlerFactory->getDOMHandler( $node );
				Assert::invariant( $domHandler !== null, 'No dom handler found for '
					. DOMCompat::getOuterHTML( $node ) );
				$method = [ $this, 'serializeDOMNode' ];
				break;
			case XML_TEXT_NODE:
				// This code assumes that the DOM is in normalized form with no
				// run of text nodes.
				// Accumulate whitespace from the text node into state.sep.src
				$text = $node->nodeValue;
				if ( !$state->inIndentPre
					// PORT-FIXME: original uses this->state->serializer->separatorREs
					// but that does not seem useful
					&& preg_match( self::$separatorREs['pureSepRE'], $text )
				) {
					$state->appendSep( $text );
					return $node->nextSibling;
				}
				if ( $state->selserMode ) {
					$prev = $node->previousSibling;
					if ( !$state->inModifiedContent && (
						( !$prev && DOMUtils::isBody( $node->parentNode ) ) ||
						( $prev && !DOMUtils::isDiffMarker( $prev ) )
					) ) {
						$state->currNodeUnmodified = true;
					} else {
						$state->currNodeUnmodified = false;
					}
				}

				$domHandler = new DOMHandler( false );
				$method = [ $this, 'serializeTextNode' ];
				break;
			case XML_COMMENT_NODE:
				// Merge this into separators
				$state->appendSep( WTSUtils::commentWT( $node->nodeValue ) );
				return $node->nextSibling;
			default:
				// The condition must be `false` for the assertion to fire;
				// passing the message as the first argument (a truthy string)
				// would let execution fall through with no handler selected.
				Assert::invariant( false, 'Unhandled node type: ' . $node->nodeType );
		}

		$prev = DOMUtils::previousNonSepSibling( $node ) ?: $node->parentNode;
		$this->updateSeparatorConstraints(
			$prev, $domHandlerFactory->getDOMHandler( $prev ),
			$node, $domHandler
		);

		$nextNode = call_user_func( $method, $node, $domHandler );

		$next = DOMUtils::nextNonSepSibling( $node ) ?: $node->parentNode;
		$this->updateSeparatorConstraints(
			$node, $domHandler,
			$next, $domHandlerFactory->getDOMHandler( $next )
		);

		// Update modification flags
		$state->updateModificationFlags( $node );

		return $nextNode;
	}

	/**
	 * Undo heading-related nowiki escaping on a line when the text following
	 * the matched heading-like prefix is more than comments/whitespace.
	 *
	 * No-op unless the state recorded heading escapes.
	 *
	 * @param string $line
	 * @return string
	 */
	private function stripUnnecessaryHeadingNowikis( string $line ): string {
		$state = $this->state;
		if ( !$state->hasHeadingEscapes ) {
			return $line;
		}

		$escaper = function ( string $wt ) use ( $state ) {
			return $state->serializer->wteHandlers->escapedText( $state, false, $wt, false, true );
		};

		preg_match( self::HEADING_NOWIKI_REGEXP, $line, $match );
		if ( $match && !preg_match( self::COMMENT_OR_WS_REGEXP, $match[2] ) ) {
			// The nowikiing was spurious since the trailing = is not in EOL position
			return $escaper( $match[1] ) . $match[2];
		} else {
			// All is good.
			return $line;
		}
	}

	/**
	 * Post-pass over the whole serialized output that removes or shrinks
	 * `<nowiki>` wrappers around leading whitespace where they are not
	 * needed. Rewrites `$this->state->out` in place.
	 */
	private function stripUnnecessaryIndentPreNowikis(): void {
		$env = $this->env;
		// FIXME: The solTransparentWikitextRegexp includes redirects, which really
		// only belong at the SOF and should be unique. See the "New redirect" test.
		// PORT-FIXME do the different whitespace semantics matter?
		$noWikiRegexp = '@^'
			. PHPUtils::reStrip( $env->getSiteConfig()->solTransparentWikitextNoWsRegexp(), '@' )
			. '((?i:<nowiki>\s+</nowiki>))([^\n]*(?:\n|$))' . '@Dm';
		$pieces = preg_split( $noWikiRegexp, $this->state->out, -1, PREG_SPLIT_DELIM_CAPTURE );
		$out = $pieces[0];
		// Each match contributes a fixed-size group of pieces: three captures
		// followed by the text up to the next match.
		// NOTE(review): the stride of 4 presumes the sol-transparent regexp
		// contributes exactly one capture group -- confirm against SiteConfig.
		$numPieces = count( $pieces );
		for ( $i = 1; $i < $numPieces; $i += 4 ) {
			$out .= $pieces[$i];
			$nowiki = $pieces[$i + 1];
			$rest = $pieces[$i + 2];
			// Ignore comments
			preg_match_all( '/<[^!][^<>]*>/', $rest, $htmlTags );

			// Not required if just sol transparent wt.
			$reqd = !preg_match( $env->getSiteConfig()->solTransparentWikitextRegexp(), $rest );

			if ( $reqd ) {
				foreach ( $htmlTags[0] as $rawTagName ) {
					// Strip </, attributes, and > to get the tagname
					$tagName = preg_replace( '/<\/?|\s.*|>/', '', $rawTagName );
					if ( !isset( WikitextConstants::$HTML['HTML5Tags'][$tagName] ) ) {
						// If we encounter any tag that is not a html5 tag,
						// it could be an extension tag. We could do a more complex
						// regexp or tokenize the string to determine if any block tags
						// show up outside the extension tag. But, for now, we just
						// conservatively bail and leave the nowiki as is.
						$reqd = true;
						break;
					} elseif ( TokenUtils::isBlockTag( $tagName ) ) {
						// FIXME: Extension tags shadowing html5 tags might not
						// have block semantics.
						// Block tags on a line suppress nowikis
						$reqd = false;
					}
				}
			}

			// PORT-FIXME do the different whitespace semantics matter?
			if ( !$reqd ) {
				$nowiki = preg_replace( '#^<nowiki>(\s+)</nowiki>#', '$1', $nowiki, 1 );
			} elseif ( $env->shouldScrubWikitext() ) {
				$solTransparentWikitextNoWsRegexpFragment = PHPUtils::reStrip(
					$env->getSiteConfig()->solTransparentWikitextNoWsRegexp(), '/' );
				$wsReplacementRE = '/^(' . $solTransparentWikitextNoWsRegexpFragment . ')?\s+/';
				// Replace all leading whitespace
				do {
					$oldRest = $rest;
					$rest = preg_replace( $wsReplacementRE, '$1', $rest );
				} while ( $rest !== $oldRest );

				// Protect against sol-sensitive wikitext characters
				$solCharsTest = '/^' . $solTransparentWikitextNoWsRegexpFragment . '[=*#:;]/';
				$nowiki = preg_replace( '#^<nowiki>(\s+)</nowiki>#',
					preg_match( $solCharsTest, $rest ) ? '<nowiki/>' : '', $nowiki, 1 );
			}
			$out = $out . $nowiki . $rest . $pieces[$i + 3];
		}
		$this->state->out = $out;
	}

	/**
	 * This implements a heuristic to strip two common sources of <nowiki/>s.
	 * When <i> and <b> tags are matched up properly,
	 * - any single ' char before <i> or <b> does not need <nowiki/> protection.
	 * - any single ' char before </i> or </b> does not need <nowiki/> protection.
	 *
	 * The line is returned untouched whenever the bracket/quote/tag pairing
	 * check fails. At most one <nowiki/> (the last eligible one) is removed.
	 *
	 * @param string $line
	 * @return string
	 */
	private function stripUnnecessaryQuoteNowikis( string $line ): string {
		if ( !$this->state->hasQuoteNowikis ) {
			return $line;
		}

		// Optimization: We are interested in <nowiki/>s before quote chars.
		// So, skip this if we don't have both.
		if ( !( preg_match( '#<nowiki\s*/>#', $line ) && preg_match( "/'/", $line ) ) ) {
			return $line;
		}

		// * Split out all the [[ ]] {{ }} '' ''' ''''' <..> </...>
		//   parens in the regexp mean that the split segments will
		//   be spliced into the result array as the odd elements.
		// * If we match up the tags properly and we see opening
		//   <i> / <b> / <i><b> tags preceded by a '<nowiki/>, we
		//   can remove all those nowikis.
		//   Ex: '<nowiki/>''foo'' bar '<nowiki/>'''baz'''
		// * If we match up the tags properly and we see closing
		//   <i> / <b> / <i><b> tags preceded by a '<nowiki/>, we
		//   can remove all those nowikis.
		//   Ex: ''foo'<nowiki/>'' bar '''baz'<nowiki/>'''
		// phpcs:ignore Generic.Files.LineLength.TooLong
		$p = preg_split( "#('''''|'''|''|\[\[|\]\]|\{\{|\}\}|<\w+(?:\s+[^>]*?|\s*?)/?>|</\w+\s*>)#", $line, -1, PREG_SPLIT_DELIM_CAPTURE );

		// Which nowiki do we strip out?
		$nowikiIndex = -1;

		// Verify that everything else is properly paired up.
		$stack = [];
		$quotesOnStack = 0;
		$n = count( $p );
		$nonHtmlTag = null;
		for ( $j = 1; $j < $n; $j += 2 ) {
			// For HTML tags, pull out just the tag name for clearer code below.
			preg_match( '#^<(/?\w+)#', $p[$j], $matches );
			$tag = mb_strtolower( $matches[1] ?? $p[$j] );
			$tagLen = strlen( $tag );
			$selfClose = false;
			if ( preg_match( '#/>$#D', $p[$j] ) ) {
				$tag .= '/';
				$selfClose = true;
			}

			// Ignore non-html-tag (<nowiki> OR extension tag) blocks
			if ( !$nonHtmlTag ) {
				if ( isset( $this->env->getSiteConfig()->getExtensionTagNameMap()[$tag] ) ) {
					$nonHtmlTag = $tag;
					continue;
				}
			} else {
				if ( $tagLen > 0 && $tag[0] === '/' && substr( $tag, 1 ) === $nonHtmlTag ) {
					$nonHtmlTag = null;
				}
				continue;
			}

			if ( $tag === ']]' ) {
				if ( array_pop( $stack ) !== '[[' ) {
					return $line;
				}
			} elseif ( $tag === '}}' ) {
				if ( array_pop( $stack ) !== '{{' ) {
					return $line;
				}
			} elseif ( $tagLen > 0 && $tag[0] === '/' ) { // closing html tag
				// match html/ext tags
				$openTag = array_pop( $stack );
				if ( $tag !== ( '/' . $openTag ) ) {
					return $line;
				}
			} elseif ( $tag === 'nowiki/' ) {
				// We only want to process:
				// - trailing single quotes (bar')
				// - or single quotes by themselves without a preceding '' sequence
				if ( substr( $p[$j - 1], -1 ) === "'"
					&& !( $p[$j - 1] === "'" && $j > 1 && substr( $p[$j - 2], -2 ) === "''" )
					// Consider <b>foo<i>bar'</i>baz</b> or <b>foo'<i>bar'</i>baz</b>.
					// The <nowiki/> before the <i> or </i> cannot be stripped
					// if the <i> is embedded inside another quote.
					&& ( $quotesOnStack === 0
						// The only strippable scenario with a single quote elt on stack
						// is: ''bar'<nowiki/>''
						//   -> ["", "''", "bar'", "<nowiki/>", "", "''"]
						|| ( $quotesOnStack === 1
							&& $j + 2 < $n
							&& $p[$j + 1] === ''
							&& $p[$j + 2][0] === "'"
							&& $p[$j + 2] === PHPUtils::lastItem( $stack ) ) )
				) {
					$nowikiIndex = $j;
				}
				continue;
			} elseif ( $selfClose || $tag === 'br' ) {
				// Skip over self-closing tags or what should have been self-closed.
				// ( While we could do this for all void tags defined in
				//   mediawiki.wikitext.constants.js, <br> is the most common
				//   culprit. )
				continue;
			} elseif ( $tagLen > 0 && $tag[0] === "'" && PHPUtils::lastItem( $stack ) === $tag ) {
				// A quote sequence identical to the one on top of the stack closes it.
				array_pop( $stack );
				$quotesOnStack--;
			} else {
				$stack[] = $tag;
				if ( $tagLen > 0 && $tag[0] === "'" ) {
					$quotesOnStack++;
				}
			}
		}

		// Anything left unclosed means the pairing check failed.
		if ( count( $stack ) ) {
			return $line;
		}

		if ( $nowikiIndex !== -1 ) {
			// We can only remove the final trailing nowiki.
			//
			// HTML  : <i>'foo'</i>
			// line  : ''<nowiki/>'foo'<nowiki/>''
			$p[$nowikiIndex] = '';
			return implode( '', $p );
		} else {
			return $line;
		}
	}

	/**
	 * Serialize an HTML DOM document.
	 * WARNING: You probably want to use {@link FromHTML::serializeDOM} instead.
	 *
	 * Normalizes the DOM, runs the serializer over $body, then applies the
	 * nowiki-stripping post-passes and prepends any buffered redirect text.
	 *
	 * @param DOMElement $body
	 * @param bool $selserMode
	 * @return ConstrainedText|string
	 */
	public function serializeDOM( DOMElement $body, bool $selserMode = false ) {
		Assert::invariant( DOMUtils::isBody( $body ), 'Expected a body node.' );
		// `editedDoc` is simply body's ownerDocument.  However, since we make
		// recursive calls to WikitextSerializer.prototype.serializeDOM with elements from dom fragments
		// from data-mw, we need this to be set prior to the initial call.
		// It's mainly required for correct serialization of citations in some
		// scenarios (Ex: <ref> nested in <references>).
		Assert::invariant( $this->env->getPageConfig()->editedDoc !== null, 'Should be set.' );

		if ( !$selserMode ) {
			// Strip <section> tags
			// Selser mode will have done that already before running dom-diff
			ContentUtils::stripSectionTagsAndFallbackIds( $body );
		}

		$this->logType = $selserMode ? 'trace/selser' : 'trace/wts';

		$state = $this->state;
		$state->initMode( $selserMode );

		$domNormalizer = new DOMNormalizer( $state );
		$domNormalizer->normalize( $body );

		if ( $this->env->hasDumpFlag( 'dom:post-normal' ) ) {
			$options = [ 'storeDiffMark' => true, 'env' => $this->env ];
			ContentUtils::dumpDOM( $body, 'DOM: post-normal', $options );
		}

		$state->kickOffSerialize( $body );

		if ( $state->hasIndentPreNowikis ) {
			// FIXME: Perhaps this can be done on a per-line basis
			// rather than do one post-pass on the entire document.
			$this->stripUnnecessaryIndentPreNowikis();
		}

		$splitLines = $state->selserMode
			|| $state->hasQuoteNowikis
			|| $state->hasSelfClosingNowikis
			|| $state->hasHeadingEscapes;

		if ( $splitLines ) {
			$state->out = implode( "\n", array_map( function ( $line ) {
				$line = $this->stripUnnecessaryQuoteNowikis( $line );

				// Strip (useless) trailing <nowiki/>s
				// Interim fix till we stop introducing them in the first place.
				//
				// Don't strip |param = <nowiki/> since that pattern is used
				// in transclusions and where the trailing <nowiki /> is a valid
				// template arg. So, use a conservative regexp to detect that usage.
				$line = preg_replace( '#^([^=]*?)(?:<nowiki\s*/>\s*)+$#D', '$1', $line, 1 );

				$line = $this->stripUnnecessaryHeadingNowikis( $line );
				return $line;
			}, explode( "\n", $state->out ) ) );
		}

		if ( $state->redirectText && $state->redirectText !== 'unbuffered' ) {
			// A limit of 2 splits off the first line; a limit of 1 would
			// return the entire output as a single element.
			$firstLine = explode( "\n", $state->out, 2 )[0];
			$nl = preg_match( '/^(\s|$)/D', $firstLine ) ? '' : "\n";
			$state->out = $state->redirectText . $nl . $state->out;
		}

		return $state->out;
	}

	/**
	 * Log the given arguments under the serializer's current log type.
	 *
	 * @note Porting note: this replaces the pattern $serializer->env->log( $serializer->logType, ... )
	 * @param mixed ...$args
	 * @deprecated Use PSR-3 logging instead
	 */
	public function trace( ...$args ) {
		$this->env->log( $this->logType, ...$args );
	}

}