<?php

declare( strict_types = 1 );

namespace Wikimedia\Parsoid\Html2Wt;

use Wikimedia\Assert\Assert;
use Wikimedia\Parsoid\Config\Env;
use Wikimedia\Parsoid\Config\WikitextConstants;
use Wikimedia\Parsoid\Core\DomSourceRange;
use Wikimedia\Parsoid\DOM\Comment;
use Wikimedia\Parsoid\DOM\DocumentFragment;
use Wikimedia\Parsoid\DOM\Element;
use Wikimedia\Parsoid\DOM\Node;
use Wikimedia\Parsoid\Html2Wt\DOMHandlers\DOMHandler;
use Wikimedia\Parsoid\Utils\DOMCompat;
use Wikimedia\Parsoid\Utils\DOMDataUtils;
use Wikimedia\Parsoid\Utils\DOMUtils;
use Wikimedia\Parsoid\Utils\PHPUtils;
use Wikimedia\Parsoid\Utils\TokenUtils;
use Wikimedia\Parsoid\Utils\Utils;
use Wikimedia\Parsoid\Utils\WTUtils;

class Separators {
	/*
	 * This regexp looks for leading whitespace on the last line of a separator string.
	 * So, only comments (single or multi-line) or other newlines can precede that
	 * whitespace-of-interest. But, also account for any whitespace preceding newlines
	 * since that needs to be skipped over (Ex: " \n ").
	 */
	private const INDENT_PRE_WS_IN_SEP_REGEXP =
		'/^((?: *\n|(?:' . Utils::COMMENT_REGEXP_FRAGMENT . '))*)( +)([^\n]*)$/D';

	/**
	 * @var SerializerState
	 */
	private $state;

	/**
	 * @var Env
	 */
	private $env;

	/**
	 * Reduce a constraints array to a compact form that is safe to log.
	 *
	 * Only the 'a', 'b', 'min' and 'max' entries are kept (null when absent).
	 * A non-empty 'constraintInfo' entry is summarised: its DOM nodes are
	 * replaced by their node names so that whole subtrees don't end up in
	 * log files / test output.
	 *
	 * @param array $constraints
	 * @return array
	 */
	private static function loggableConstraints( array $constraints ): array {
		$summary = [];
		foreach ( [ 'a', 'b', 'min', 'max' ] as $key ) {
			$summary[$key] = $constraints[$key] ?? null;
		}

		$info = $constraints['constraintInfo'] ?? null;
		if ( empty( $info ) ) {
			return $summary;
		}

		$summary['constraintInfo'] = [
			'onSOL' => $info['onSOL'] ?? false,
			'sepType' => $info['sepType'] ?? null,
			'nodeA' => DOMCompat::nodeName( $info['nodeA'] ),
			'nodeB' => DOMCompat::nodeName( $info['nodeB'] ),
		];
		return $summary;
	}

	/**
	 * Total length of the separator text (inter-element whitespace and
	 * comments) made up by $n and all of its preceding siblings.
	 *
	 * @param Node $n
	 * @return int|null The accumulated length, or null if some preceding
	 *   sibling is neither inter-element whitespace nor a comment.
	 */
	private static function precedingSeparatorTextLen( Node $n ): ?int {
		// Given the CSS white-space property (specifically its "pre" and
		// "pre-line" values), any sane HTML editor has to preserve IEW in
		// HTML documents in order to preserve rendering. An editor would
		// only change IEW drastically when the user explicitly asks for it
		// (Ex: pretty-printing of raw source code).
		//
		// For now, we exploit this. The result is only used to extrapolate
		// DSR values and extract a separator string from source, and is
		// only used locally. In addition, the extracted text is verified
		// for being a valid separator.
		//
		// So, at worst, this creates a local dirty diff around separators
		// and at best, it gets us a clean diff.

		$total = 0;
		for ( $cur = $n; $cur; $cur = $cur->previousSibling ) {
			if ( DOMUtils::isIEW( $cur ) ) {
				$total += strlen( $cur->nodeValue );
				continue;
			}
			if ( $cur instanceof Comment ) {
				$total += WTUtils::decodedCommentLength( $cur );
				continue;
			}
			// The input node itself may be anything; only its preceding
			// siblings are required to be separator content.
			if ( $cur !== $n ) {
				return null;
			}
		}

		return $total;
	}

	/**
	 * Helper for updateSeparatorConstraints.
	 *
	 * Combines the newline requirements of nodeA ($aCons) and nodeB ($bCons)
	 * into a single min/max structure. When the two sides contradict each
	 * other, the conflict is logged and nodeB's value is used.
	 *
	 * @param Node $nodeA
	 * @param array $aCons
	 * @param Node $nodeB
	 * @param array $bCons
	 * @return array Array with 'min', 'max' and an empty 'constraintInfo'
	 */
	private function getSepNlConstraints(
		Node $nodeA, array $aCons, Node $nodeB, array $bCons
	): array {
		$env = $this->state->getEnv();

		// Start from nodeA's bounds; nodeB's bounds are folded in below.
		$min = $aCons['min'] ?? null;
		$max = $aCons['max'] ?? null;

		if ( isset( $bCons['min'] ) ) {
			$bMin = $bCons['min'];
			if ( $max !== null && $max < $bMin ) {
				// Conflict, warn and let nodeB win.
				$env->log(
					'info/html2wt',
					'Incompatible constraints 1:',
					DOMCompat::nodeName( $nodeA ),
					DOMCompat::nodeName( $nodeB ),
					self::loggableConstraints(
						[ 'min' => $min, 'max' => $max, 'constraintInfo' => [] ]
					)
				);
				$min = $bMin;
				$max = $bMin;
			} else {
				$min = max( $min ?? 0, $bMin );
			}
		}

		if ( isset( $bCons['max'] ) ) {
			$bMax = $bCons['max'];
			if ( ( $min ?? 0 ) > $bMax ) {
				// Conflict, warn and let nodeB win.
				$env->log(
					'info/html2wt',
					'Incompatible constraints 2:',
					DOMCompat::nodeName( $nodeA ),
					DOMCompat::nodeName( $nodeB ),
					self::loggableConstraints(
						[ 'min' => $min, 'max' => $max, 'constraintInfo' => [] ]
					)
				);
				$min = $bMax;
				$max = $bMax;
			} else {
				$max = min( $max ?? $bMax, $bMax );
			}
		}

		// Anything more than two lines will trigger paragraphs, so default to
		// two if nothing is specified. (FIXME: This is a conservative strategy
		// since strictly speaking, this is not always true. This is more a
		// cautious fallback to handle cases where some DOM handler is missing
		// a necessary max constraint.)
		$max ??= 2;

		// Never hand back a range whose lower bound exceeds its upper bound.
		if ( ( $min ?? 0 ) > $max ) {
			$max = $min;
		}

		return [
			'min' => $min,
			'max' => $max,
			'constraintInfo' => [],
		];
	}

	/**
	 * Create a separator given a (potentially empty) separator text and newline constraints.
182 * 183 * @param Node $node 184 * @param string $sep 185 * @param array $nlConstraints 186 * @return string 187 */ 188 private function makeSeparator( Node $node, string $sep, array $nlConstraints ): string { 189 $origSep = $sep; 190 $sepType = $nlConstraints['constraintInfo']['sepType'] ?? null; 191 192 // Split on comment/ws-only lines, consuming subsequent newlines since 193 // those lines are ignored by the PHP parser 194 // Ignore lines with ws and a single comment in them 195 $splitRe = implode( [ "#(?:\n(?:[ \t]*?", 196 Utils::COMMENT_REGEXP_FRAGMENT, 197 "[ \t]*?)+(?=\n))+|", 198 Utils::COMMENT_REGEXP_FRAGMENT, 199 "#" 200 ] ); 201 $sepNlCount = substr_count( implode( preg_split( $splitRe, $sep ) ), "\n" ); 202 $minNls = $nlConstraints['min'] ?? 0; 203 204 if ( $this->state->atStartOfOutput && $minNls > 0 ) { 205 // Skip first newline as we are in start-of-line context 206 $minNls--; 207 } 208 209 if ( $minNls > 0 && $sepNlCount < $minNls ) { 210 // Append newlines 211 $nlBuf = []; 212 for ( $i = 0; $i < ( $minNls - $sepNlCount ); $i++ ) { 213 $nlBuf[] = "\n"; 214 } 215 216 /* ------------------------------------------------------------------ 217 * The following two heuristics try to do a best-guess on where to 218 * add the newlines relative to nodeA and nodeB that best matches 219 * wikitext output expectations. 220 * 221 * 1. In a parent-child separator scenario, where the first child of 222 * nodeA is not an element, it could have contributed to the separator. 223 * In that case, the newlines should be prepended because they 224 * usually correspond to the parent's constraints, 225 * and the separator was plucked from the child. 226 * 227 * Try html2wt on this snippet: 228 * 229 * a<p><!--cmt-->b</p> 230 * 231 * 2. In a sibling scenario, if nodeB is a literal-HTML element, nodeA is 232 * forcing the newline and hence the newline should be emitted right 233 * after it. 
234 * 235 * Try html2wt on this snippet: 236 * 237 * <p>foo</p> <p data-parsoid='{"stx":"html"}'>bar</p> 238 * -------------------------------------------------------------------- */ 239 $constraintInfo = $nlConstraints['constraintInfo'] ?? []; 240 $sepType = $constraintInfo['sepType'] ?? null; 241 $nodeA = $constraintInfo['nodeA'] ?? null; 242 $nodeB = $constraintInfo['nodeB'] ?? null; 243 if ( 244 $sepType === 'parent-child' && 245 !DOMUtils::isContentNode( DOMUtils::firstNonDeletedChild( $nodeA ) ) && 246 !( 247 isset( WikitextConstants::$HTML['ChildTableTags'][DOMCompat::nodeName( $nodeB )] ) && 248 !WTUtils::isLiteralHTMLNode( $nodeB ) 249 ) 250 ) { 251 $sep = implode( $nlBuf ) . $sep; 252 } elseif ( $sepType === 'sibling' && WTUtils::isLiteralHTMLNode( $nodeB ) ) { 253 $sep = implode( $nlBuf ) . $sep; 254 } else { 255 $sep .= implode( $nlBuf ); 256 } 257 } elseif ( isset( $nlConstraints['max'] ) && $sepNlCount > $nlConstraints['max'] && ( 258 // In selser mode, if the current node is an unmodified rendering-transparent node 259 // of a sibling pair, leave the separator alone since the excess newlines aren't 260 // going to change the semantics of how this node will be parsed in wt->html direction. 261 // This will instead eliminate a dirty diff on the page. 262 !$this->state->selserMode || 263 $sepType !== 'sibling' || 264 !$this->state->currNodeUnmodified || 265 !WTUtils::isRenderingTransparentNode( $node ) 266 ) ) { 267 // Strip some newlines outside of comments. 268 // 269 // Capture separators in a single array with a capturing version of 270 // the split regexp, so that we can work on the non-separator bits 271 // when stripping newlines. 272 // 273 // Dirty-diff minimizing heuristic: Strip newlines away from an unmodified node. 274 // If both nodes are unmodified, this dirties the separator before the current node. 275 // If both nodes are modified, this dirties the separator after the previous node. 276 $allBits = preg_split( '#(' . 
PHPUtils::reStrip( $splitRe, '#' ) . ')#',
				$sep, -1, PREG_SPLIT_DELIM_CAPTURE );
			$newBits = [];
			$n = $sepNlCount - $nlConstraints['max'];

			$stripAtEnd = $this->state->prevNodeUnmodified;
			while ( $n > 0 ) {
				$bit = $stripAtEnd ? array_pop( $allBits ) : array_shift( $allBits );
				while ( $bit && preg_match( $splitRe, $bit ) ) {
					// Retain comment-only lines as is
					$newBits[] = $bit;
					$bit = $stripAtEnd ? array_pop( $allBits ) : array_shift( $allBits );
				}
				// @phan-suppress-next-line PhanPluginLoopVariableReuse
				while ( $n > 0 && str_contains( $bit, "\n" ) ) {
					$bit = preg_replace( '/\n([^\n]*)/', '$1', $bit, 1 );
					$n--;
				}
				$newBits[] = $bit;
			}
			if ( $stripAtEnd ) {
				$newBits = array_merge( $allBits, array_reverse( $newBits ) );
			} else {
				PHPUtils::pushArray( $newBits, $allBits );
			}
			$sep = implode( $newBits );
		}

		$this->state->getEnv()->log(
			'debug/wts/sep',
			'make-new |',
			static function () use ( $nlConstraints, $sepNlCount, $minNls, $sep, $origSep ) {
				$constraints = Utils::clone( $nlConstraints );
				unset( $constraints['constraintInfo'] );
				return PHPUtils::jsonEncode( $sep ) . ', ' . PHPUtils::jsonEncode( $origSep ) . ', ' .
					$minNls . ', ' . $sepNlCount . ', ' . PHPUtils::jsonEncode( $constraints );
			}
		);

		return $sep;
	}

	/**
	 * Merge two sets of newline constraints into one.
	 *
	 * The result takes the larger of the two minimums and the smaller of the
	 * two maximums (a missing min counts as 0, a missing max as 2). If that
	 * leaves min above max, max is raised to min and the conflict is logged.
	 *
	 * @param Env $env
	 * @param array $oldConstraints
	 * @param array $newConstraints
	 * @return array Array with 'min', 'max' and an empty 'constraintInfo'
	 */
	private static function mergeConstraints(
		Env $env, array $oldConstraints, array $newConstraints
	): array {
		$min = max( $oldConstraints['min'] ?? 0, $newConstraints['min'] ?? 0 );
		$max = min( $oldConstraints['max'] ?? 2, $newConstraints['max'] ?? 2 );
		$conflicting = $min > $max;

		$merged = [
			'min' => $min,
			'max' => $conflicting ? $min : $max,
			'constraintInfo' => [],
		];

		if ( $conflicting ) {
			$env->log(
				'info/html2wt',
				'Incompatible constraints (merge):',
				$merged,
				self::loggableConstraints( $oldConstraints ),
				self::loggableConstraints( $newConstraints )
			);
		}

		return $merged;
	}

	/**
	 * Short, JSON-encoded description of a node for debug logs: the outer
	 * HTML for elements (falling back to the node value when that is empty),
	 * the node value for everything else, cut to 40 characters.
	 *
	 * @param Node $node
	 * @return string
	 */
	public static function debugOut( Node $node ): string {
		$text = $node instanceof Element ? DOMCompat::getOuterHTML( $node ) : '';
		if ( !$text ) {
			$text = $node->nodeValue;
		}
		return mb_substr( PHPUtils::jsonEncode( $text ), 0, 40 );
	}

	/**
	 * Figure out separator constraints and merge them with existing constraints
	 * in state so that they can be emitted when the next content emits source.
	 *
	 * The relationship between the two nodes (parent-child, child-parent or
	 * sibling) decides which handler callbacks supply the constraints. The
	 * resulting constraints, together with information about how they were
	 * obtained, are stored on $state->sep->constraints.
	 *
	 * @param Node $nodeA
	 * @param DOMHandler $sepHandlerA
	 * @param Node $nodeB
	 * @param DOMHandler $sepHandlerB
	 */
	public function updateSeparatorConstraints(
		Node $nodeA, DOMHandler $sepHandlerA, Node $nodeB, DOMHandler $sepHandlerB
	): void {
		$state = $this->state;

		// In every branch, nodeA's handler is consulted before nodeB's.
		if ( $nodeB->parentNode === $nodeA ) {
			// parent-child separator, nodeA parent of nodeB
			'@phan-var Element|DocumentFragment $nodeA'; // @var Element|DocumentFragment $nodeA
			$sepType = 'parent-child';
			$aCons = $sepHandlerA->firstChild( $nodeA, $nodeB, $state );
			$bCons = $nodeB instanceof Element ? $sepHandlerB->before( $nodeB, $nodeA, $state ) : [];
		} elseif ( $nodeA->parentNode === $nodeB ) {
			// parent-child separator, nodeB parent of nodeA
			'@phan-var Element|DocumentFragment $nodeB'; // @var Element|DocumentFragment $nodeB
			$sepType = 'child-parent';
			$aCons = $nodeA instanceof Element ? $sepHandlerA->after( $nodeA, $nodeB, $state ) : [];
			$bCons = $sepHandlerB->lastChild( $nodeB, $nodeA, $state );
		} else {
			// sibling separator
			$sepType = 'sibling';
			$aCons = $nodeA instanceof Element ? $sepHandlerA->after( $nodeA, $nodeB, $state ) : [];
			$bCons = $nodeB instanceof Element ? $sepHandlerB->before( $nodeB, $nodeA, $state ) : [];
		}
		$nlConstraints = $this->getSepNlConstraints( $nodeA, $aCons, $nodeB, $bCons );

		// Fold the new constraints into whatever is already pending.
		$state->sep->constraints = empty( $state->sep->constraints )
			? $nlConstraints
			: self::mergeConstraints( $this->env, $state->sep->constraints, $nlConstraints );

		$this->env->log(
			'debug/wts/sep',
			static function () use ( $sepType, $nodeA, $nodeB, $state ) {
				return implode( ' | ', [
					'constraint',
					$sepType,
					'<' . DOMCompat::nodeName( $nodeA ) . ',' . DOMCompat::nodeName( $nodeB ) . '>',
					PHPUtils::jsonEncode( $state->sep->constraints ),
					self::debugOut( $nodeA ),
					self::debugOut( $nodeB ),
				] );
			}
		);

		// Recorded after logging so that the DOM nodes are not JSON-encoded above.
		$state->sep->constraints['constraintInfo'] = [
			'onSOL' => $state->onSOL,
			// force SOL state when separator is built/emitted
			'forceSOL' => $sepHandlerB->forceSOL(),
			'sepType' => $sepType,
			'nodeA' => $nodeA,
			'nodeB' => $nodeB,
		];
	}

	/**
	 * @param Env $env
	 * @param SerializerState $state
	 */
	public function __construct( Env $env, SerializerState $state ) {
		$this->state = $state;
		$this->env = $env;
	}

	/**
	 * Make sure leading whitespace on the last line of a separator cannot
	 * trigger an indent-pre when the separator is emitted.
	 *
	 * @param string $sep
	 * @param array $nlConstraints
	 * @return string
	 */
	private function makeSepIndentPreSafe(
		string $sep, array $nlConstraints
	): string {
		$state = $this->state;
		$constraintInfo = $nlConstraints['constraintInfo'] ?? [];
		$sepType = $constraintInfo['sepType'] ?? null;
		$nodeA = $constraintInfo['nodeA'] ??
null; 450 $nodeB = $constraintInfo['nodeB'] ?? null; 451 $forceSOL = ( $constraintInfo['forceSOL'] ?? false ) && $sepType !== 'child-parent'; 452 $origNodeB = $nodeB; 453 454 // Ex: "<div>foo</div>\n <span>bar</span>" 455 // 456 // We also should test for onSOL state to deal with HTML like 457 // <ul> <li>foo</li></ul> 458 // and strip the leading space before non-indent-pre-safe tags 459 if ( 460 !$state->inPHPBlock && 461 !$state->inIndentPre && 462 preg_match( self::INDENT_PRE_WS_IN_SEP_REGEXP, $sep ) && ( 463 str_contains( $sep, "\n" ) || !empty( $constraintInfo['onSOL'] ) || $forceSOL 464 ) 465 ) { 466 // 'sep' is the separator before 'nodeB' and it has leading spaces on a newline. 467 // We have to decide whether that leading space will trigger indent-pres in wikitext. 468 // The decision depends on where this separator will be emitted relative 469 // to 'nodeA' and 'nodeB'. 470 471 $isIndentPreSafe = false; 472 473 // Example sepType scenarios: 474 // 475 // 1. sibling 476 // <div>foo</div> 477 // <span>bar</span> 478 // The span will be wrapped in an indent-pre if the leading space 479 // is not stripped since span is not a block tag 480 // 481 // 2. child-parent 482 // <span>foo 483 // </span>bar 484 // The " </span>bar" will be wrapped in an indent-pre if the 485 // leading space is not stripped since span is not a block tag 486 // 487 // 3. parent-child 488 // <div>foo 489 // <span>bar</span> 490 // </div> 491 // 492 // In all cases, only block-tags prevent indent-pres. 493 // (except for a special case for <br> nodes) 494 if ( $nodeB && WTSUtils::precedingSpaceSuppressesIndentPre( $nodeB, $origNodeB ) ) { 495 $isIndentPreSafe = true; 496 } elseif ( $sepType === 'sibling' || $nodeA && DOMUtils::atTheTop( $nodeA ) ) { 497 Assert::invariant( !DOMUtils::atTheTop( $nodeA ) || $sepType === 'parent-child', __METHOD__ ); 498 499 // 'nodeB' is the first non-separator child of 'nodeA'. 
500 // 501 // Walk past sol-transparent nodes in the right-sibling chain 502 // of 'nodeB' till we establish indent-pre safety. 503 while ( $nodeB && 504 ( DOMUtils::isDiffMarker( $nodeB ) || WTUtils::emitsSolTransparentSingleLineWT( $nodeB ) ) 505 ) { 506 $nodeB = $nodeB->nextSibling; 507 } 508 509 $isIndentPreSafe = !$nodeB || WTSUtils::precedingSpaceSuppressesIndentPre( $nodeB, $origNodeB ); 510 } 511 512 // Check whether nodeB is nested inside an element that suppresses 513 // indent-pres. 514 if ( $nodeB && !$isIndentPreSafe && !DOMUtils::atTheTop( $nodeB ) ) { 515 $parentB = $nodeB->parentNode; // could be nodeA 516 while ( WTUtils::isZeroWidthWikitextElt( $parentB ) ) { 517 $parentB = $parentB->parentNode; 518 } 519 520 // The token stream paragraph wrapper (and legacy doBlockLevels) 521 // tracks this separately with $inBlockquote 522 $isIndentPreSafe = DOMUtils::hasNameOrHasAncestorOfName( 523 $parentB, 'blockquote' 524 ); 525 526 // First scope wins 527 while ( !$isIndentPreSafe && !DOMUtils::atTheTop( $parentB ) ) { 528 if ( 529 TokenUtils::tagOpensBlockScope( DOMCompat::nodeName( $parentB ) ) && 530 // Only html p-tag is indent pre suppressing 531 ( DOMCompat::nodeName( $parentB ) !== 'p' || WTUtils::isLiteralHTMLNode( $parentB ) ) 532 ) { 533 $isIndentPreSafe = true; 534 break; 535 } elseif ( TokenUtils::tagClosesBlockScope( DOMCompat::nodeName( $parentB ) ) ) { 536 break; 537 } 538 $parentB = $parentB->parentNode; 539 } 540 } 541 542 $stripLeadingSpace = ( !empty( $constraintInfo['onSOL'] ) || $forceSOL ) && 543 $nodeB && !WTUtils::isLiteralHTMLNode( $nodeB ) && 544 isset( WikitextConstants::$HTMLTagsRequiringSOLContext[DOMCompat::nodeName( $nodeB )] ); 545 if ( !$isIndentPreSafe || $stripLeadingSpace ) { 546 // Wrap non-nl ws from last line, but preserve comments. 547 // This avoids triggering indent-pres. 
$sep = preg_replace_callback(
					self::INDENT_PRE_WS_IN_SEP_REGEXP,
					static function ( $matches ) use ( $stripLeadingSpace, $state ) {
						if ( !$stripLeadingSpace ) {
							// Since we nowiki-ed, we are no longer in sol state
							$state->onSOL = false;
							$state->hasIndentPreNowikis = true;
							$space = '<nowiki>' . $matches[2] . '</nowiki>';
						}
						return ( $matches[1] ?? '' ) . ( $space ?? '' ) . ( $matches[3] ?? '' );
					},
					$sep
				);
			}
		}

		$state->getEnv()->log(
			'debug/wts/sep',
			'ipre-safe |',
			static function () use ( $sep, $nlConstraints ) {
				$constraints = Utils::clone( $nlConstraints );
				unset( $constraints['constraintInfo'] );
				return PHPUtils::jsonEncode( $sep ) . ', ' . PHPUtils::jsonEncode( $constraints );
			}
		);

		return $sep;
	}

	/**
	 * Serializing auto inserted content should invalidate the original separator.
	 *
	 * Returns a copy of the node's DSR in which the open / close width is
	 * nulled out when the corresponding tag was auto-inserted. The DSR stored
	 * on the node itself is left untouched.
	 *
	 * @param Element $node
	 * @return DomSourceRange|null Null if the node carries no DSR
	 */
	private static function handleAutoInserted( Element $node ): ?DomSourceRange {
		$dp = DOMDataUtils::getDataParsoid( $node );
		$origDsr = $dp->dsr ?? null;
		if ( $origDsr === null ) {
			return null;
		}

		$copy = clone $origDsr;
		if ( !empty( $dp->autoInsertedStart ) ) {
			$copy->openWidth = null;
		}
		if ( !empty( $dp->autoInsertedEnd ) ) {
			$copy->closeWidth = null;
		}
		return $copy;
	}

	/**
	 * $node is embedded inside a parent node that has its leading/trailing whitespace
	 * trimmed in the wt->html direction. This method attempts to recover the trimmed
	 * leading whitespace using the DSR information recorded on that parent.
	 *
	 * In some cases, $node might have an additional "data-mw-selser-wrapper" span
	 * that is added by SelSer - look past those wrappers.
	 *
	 * The recovery is attempted in two different ways:
	 * 1. If we have additional DSR fields about leading/trailing WS
	 *    (represented by $state->haveTrimmedWsDSR), that info is used.
	 * 2. If not, we simply inspect source at $dsr->innerStart and if it
	 *    happens to be whitespace, we use that.
	 *
	 * @param Node $node
	 * @return ?string The recovered space/tab run, or null if nothing is recoverable
	 */
	private function fetchLeadingTrimmedSpace( Node $node ): ?string {
		$origNode = $node;
		$parentNode = $node->parentNode;

		// Skip past the artificial span wrapper
		if ( $parentNode instanceof Element && $parentNode->hasAttribute( 'data-mw-selser-wrapper' ) ) {
			$node = $parentNode;
			$parentNode = $parentNode->parentNode;
		}

		// Leading trimmed whitespace only makes sense for first child.
		// Ignore comments (which are part of separators) + deletion markers.
		if ( DOMUtils::previousNonSepSibling( $node ) ) {
			return null;
		}

		'@phan-var Element|DocumentFragment $parentNode'; // @var Element|DocumentFragment $parentNode
		// Only parents whose wikitext syntax trims whitespace are of interest.
		if ( !isset( WikitextConstants::$WikitextTagsWithTrimmableWS[DOMCompat::nodeName( $parentNode )] ) ) {
			return null;
		}
		// A text node that still starts with whitespace has nothing to recover.
		if ( !( $origNode instanceof Element ) && preg_match( '/^[ \t]/', $origNode->nodeValue ) ) {
			return null;
		}

		// Don't reintroduce whitespace that's already been captured as a DisplaySpace
		if ( DOMUtils::hasTypeOf( $origNode, 'mw:DisplaySpace' ) ) {
			return null;
		}

		// FIXME: Is this complexity worth some minor dirty diff on this test?
		// ParserTest: "3. List embedded in a formatting tag in a misnested way"
		// I've not added an equivalent check in the trailing whitespace case.
		if ( $origNode instanceof Element &&
			isset( DOMDataUtils::getDataParsoid( $origNode )->autoInsertedStart ) &&
			strspn( $origNode->firstChild->textContent ?? '', " \t" ) >= 1
		) {
			return null;
		}

		$state = $this->state;
		$dsr = DOMDataUtils::getDataParsoid( $parentNode )->dsr ?? null;
		if ( !Utils::isValidDSR( $dsr, true ) ) {
			return null;
		}

		if ( $state->haveTrimmedWsDSR && (
			$dsr->leadingWS > 0 || ( $dsr->leadingWS === 0 && $dsr->trailingWS > 0 )
		) ) {
			// Strategy 1: the DSR says exactly how much leading WS was trimmed.
			$sep = $state->getOrigSrc( $dsr->innerStart(), $dsr->innerStart() + $dsr->leadingWS ) ?? '';
			return strspn( $sep, " \t" ) === strlen( $sep ) ? $sep : null;
		}

		// Strategy 2: peek at the first character of the parent's content.
		$offset = $dsr->innerStart();
		if ( $offset >= $dsr->innerEnd() ) {
			return null;
		}
		$sep = $state->getOrigSrc( $offset, $offset + 1 ) ?? '';
		return preg_match( '/[ \t]/', $sep ) ? $sep : null;
	}

	/**
	 * $node is embedded inside a parent node that has its leading/trailing whitespace
	 * trimmed in the wt->html direction. This method attempts to recover the trimmed
	 * trailing whitespace using the DSR information recorded on that parent.
	 *
	 * In some cases, $node might have an additional "data-mw-selser-wrapper" span
	 * that is added by SelSer - look past those wrappers.
	 *
	 * The recovery is attempted in two different ways:
	 * 1. If we have additional DSR fields about leading/trailing WS
	 *    (represented by $state->haveTrimmedWsDSR), that info is used.
	 * 2. If not, we simply inspect source at $dsr->innerEnd and if it
	 *    happens to be whitespace, we use that.
	 *
	 * @param Node $node
	 * @return ?string The recovered space/tab run, or null if nothing is recoverable
	 */
	private function fetchTrailingTrimmedSpace( Node $node ): ?string {
		$origNode = $node;
		$parentNode = $node->parentNode;

		// Skip past the artificial span wrapper
		if ( $parentNode instanceof Element && $parentNode->hasAttribute( 'data-mw-selser-wrapper' ) ) {
			$node = $parentNode;
			$parentNode = $parentNode->parentNode;
		}

		// Trailing trimmed whitespace only makes sense for last child.
		// Ignore comments (which are part of separators) + deletion markers.
		if ( DOMUtils::nextNonSepSibling( $node ) ) {
			return null;
		}

		'@phan-var Element|DocumentFragment $parentNode'; // @var Element|DocumentFragment $parentNode
		// Only parents whose wikitext syntax trims whitespace are of interest.
		if ( !isset( WikitextConstants::$WikitextTagsWithTrimmableWS[DOMCompat::nodeName( $parentNode )] ) ) {
			return null;
		}
		// A text node that still ends with whitespace has nothing to recover.
		if ( !( $origNode instanceof Element ) && preg_match( '/[ \t]$/', $origNode->nodeValue ) ) {
			return null;
		}

		// Don't reintroduce whitespace that's already been captured as a DisplaySpace
		if ( DOMUtils::hasTypeOf( $origNode, 'mw:DisplaySpace' ) ) {
			return null;
		}

		$state = $this->state;
		$dsr = DOMDataUtils::getDataParsoid( $parentNode )->dsr ?? null;
		if ( !Utils::isValidDSR( $dsr, true ) ) {
			return null;
		}

		if ( $state->haveTrimmedWsDSR && (
			$dsr->trailingWS > 0 || ( $dsr->trailingWS === 0 && $dsr->leadingWS > 0 )
		) ) {
			// Strategy 1: the DSR says exactly how much trailing WS was trimmed.
			$sep = $state->getOrigSrc( $dsr->innerEnd() - $dsr->trailingWS, $dsr->innerEnd() ) ?? '';
			return preg_match( '/^[ \t]*$/', $sep ) ? $sep : null;
		}

		// Strategy 2: peek at the last character of the parent's content.
		$offset = $dsr->innerEnd() - 1;
		// The > instead of >= is to deal with an edge case
		// = = where that single space is captured by the
		// leading-space recovery in fetchLeadingTrimmedSpace.
		if ( $offset <= $dsr->innerStart() ) {
			return null;
		}
		$sep = $state->getOrigSrc( $offset, $offset + 1 ) ?? '';
		return preg_match( '/[ \t]/', $sep ) ? $sep : null;
	}

	/**
	 * Recover whitespace that was trimmed from the start or end of a node's
	 * content in the wt->html direction.
	 *
	 * Recovery is only attempted when whitespace heuristics are enabled and
	 * we are in selser mode; otherwise null is returned.
	 *
	 * @param Node $node
	 * @param bool $leading
	 *   if true, leading whitespace trimmed before $node is looked up
	 *   if false, trailing whitespace trimmed after the last non-deleted
	 *   child of $node is looked up
	 * @return string|null
	 */
	public function recoverTrimmedWhitespace( Node $node, bool $leading ): ?string {
		// Deal with scenarios where leading / trailing whitespace were trimmed.
		// We now need to figure out if we need to add any leading / trailing WS back.
		if ( !$this->state->useWhitespaceHeuristics || !$this->state->selserMode ) {
			return null;
		}

		if ( $leading ) {
			return $this->fetchLeadingTrimmedSpace( $node );
		}

		$lastChild = DOMUtils::lastNonDeletedChild( $node );
		return $lastChild ? $this->fetchTrailingTrimmedSpace( $lastChild ) : null;
	}

	/**
	 * Emit a separator based on the collected (and merged) constraints
	 * and existing separator text. Called when new output is triggered.
	 * @param Node $node
	 * @return string|null
	 */
	public function buildSep( Node $node ): ?string {
		$state = $this->state;
		$sepType = $state->sep->constraints['constraintInfo']['sepType'] ?? null;
		$sep = null;
		$origNode = $node;
		$prevNode = $state->sep->lastSourceNode;
		$dsrA = null;
		$dsrB = null;

		/* ----------------------------------------------------------------------
		 * Assuming we have access to the original source, we can use DSR offsets
		 * to extract separators from source only if:
		 * - we are in selser mode AND
		 * - this node is not part of a newly inserted subtree (marked 'modified')
		 *   for which DSR isn't available
		 * - neither node is adjacent to a deleted block node
		 *   (see the long comment in SerializerState::emitChunk in the middle)
		 *
		 * In other scenarios, DSR values on "adjacent" nodes in the edited DOM
		 * may not reflect deleted content between them.
792 * ---------------------------------------------------------------------- */ 793 $origSepNeeded = $node !== $prevNode && $state->selserMode; 794 $origSepNeededAndUsable = 795 $origSepNeeded && !$state->inModifiedContent && 796 !WTSUtils::nextToDeletedBlockNodeInWT( $prevNode, true ) && 797 !WTSUtils::nextToDeletedBlockNodeInWT( $node, false ) && 798 WTSUtils::origSrcValidInEditedContext( $state->getEnv(), $prevNode ) && 799 WTSUtils::origSrcValidInEditedContext( $state->getEnv(), $node ); 800 801 if ( $origSepNeededAndUsable ) { 802 if ( $prevNode instanceof Element ) { 803 $dsrA = self::handleAutoInserted( $prevNode ); 804 } elseif ( !( $prevNode instanceof DocumentFragment ) ) { 805 // Check if $prevNode is the last child of a zero-width element, 806 // and use that for dsr purposes instead. Typical case: text in p. 807 if ( 808 !$prevNode->nextSibling && 809 $prevNode->parentNode !== $node && 810 $prevNode->parentNode instanceof Element && 811 ( DOMDataUtils::getDataParsoid( $prevNode->parentNode )->dsr->closeWidth ?? null ) === 0 812 ) { 813 $dsrA = self::handleAutoInserted( $prevNode->parentNode ); 814 } elseif ( 815 // Can we extrapolate DSR from $prevNode->previousSibling? 816 // Yes, if $prevNode->parentNode didn't have its children edited. 817 $prevNode->previousSibling instanceof Element && 818 !DiffUtils::directChildrenChanged( $prevNode->parentNode, $this->env ) 819 ) { 820 $endDsr = DOMDataUtils::getDataParsoid( $prevNode->previousSibling )->dsr->end ?? 
null; 821 $correction = null; 822 if ( is_int( $endDsr ) ) { 823 if ( DOMUtils::isComment( $prevNode ) ) { 824 '@phan-var Comment $prevNode'; // @var Comment $prevNode 825 $correction = WTUtils::decodedCommentLength( $prevNode ); 826 } else { 827 $correction = strlen( $prevNode->nodeValue ); 828 } 829 $dsrA = new DomSourceRange( 830 $endDsr, 831 $endDsr + $correction + WTUtils::indentPreDSRCorrection( $prevNode ), 832 0, 833 0 834 ); 835 } 836 } 837 } 838 839 if ( !$dsrA ) { 840 // nothing to do -- no reason to compute dsrB if dsrA is null 841 } elseif ( $node instanceof Element ) { 842 // $node is parent of $prevNode 843 if ( $prevNode->parentNode === $node ) { 844 '@phan-var Element|DocumentFragment $node'; // @var Element|DocumentFragment $node 845 // FIXME: Maybe we shouldn't set dsr in the dsr pass if both aren't valid? 846 // 847 // When we are in the lastChild sep scenario and the parent doesn't have 848 // useable dsr, if possible, walk up the ancestor nodes till we find 849 // a dsr-bearing node 850 // 851 // This fix is needed to handle trailing newlines in this wikitext: 852 // [[File:foo.jpg|thumb|300px|foo\n{{1x|A}}\n{{1x|B}}\n{{1x|C}}\n\n]] 853 while ( 854 !$node->nextSibling && 855 !DOMUtils::atTheTop( $node ) && 856 ( 857 empty( DOMDataUtils::getDataParsoid( $node )->dsr ) || 858 DOMDataUtils::getDataParsoid( $node )->dsr->start === null || 859 DOMDataUtils::getDataParsoid( $node )->dsr->end === null 860 ) 861 ) { 862 $node = $node->parentNode; 863 } 864 } 865 866 // The top node could be a document fragment 867 $dsrB = $node instanceof Element ? self::handleAutoInserted( $node ) : null; 868 } elseif ( !( $node instanceof DocumentFragment ) ) { 869 // $node is text/comment. Can we extrapolate DSR from $node->parentNode? 870 // Yes, if this is the child of a zero-width element and 871 // is only preceded by separator elements. 872 // 873 // 1. text in p. 874 // 2. 
ws-only child of a node with auto-inserted start tag 875 // Ex: "<span> <s>x</span> </s>" --> <span> <s>x</s*></span><s*> </s> 876 // 3. ws-only children of a node with auto-inserted start tag 877 // Ex: "{|\n|-\n <!--foo--> \n|}" 878 $nodeParent = $node->parentNode; 879 // phpcs:ignore Generic.Files.LineLength.TooLong 880 '@phan-var Element|DocumentFragment $nodeParent'; // @var Element|DocumentFragment $nodeParent 881 882 if ( 883 $nodeParent !== $prevNode && 884 $nodeParent instanceof Element && 885 ( DOMDataUtils::getDataParsoid( $nodeParent )->dsr->openWidth ?? null ) === 0 886 ) { 887 $sepLen = self::precedingSeparatorTextLen( $node ); 888 if ( $sepLen !== null ) { 889 $dsrB = DOMDataUtils::getDataParsoid( $nodeParent )->dsr; 890 if ( is_int( $dsrB->start ) && $sepLen > 0 ) { 891 $dsrB = clone $dsrB; 892 $dsrB->start += $sepLen; 893 } 894 } 895 } 896 } 897 898 // FIXME: Maybe we shouldn't set dsr in the dsr pass if both aren't valid? 899 if ( Utils::isValidDSR( $dsrA ) && Utils::isValidDSR( $dsrB ) ) { 900 // Figure out containment relationship 901 if ( $dsrA->start <= $dsrB->start ) { 902 if ( $dsrB->end <= $dsrA->end ) { 903 if ( $dsrA->start === $dsrB->start && $dsrA->end === $dsrB->end ) { 904 // Both have the same dsr range, so there can't be any 905 // separators between them 906 $sep = ''; 907 } elseif ( ( $dsrA->openWidth ?? null ) !== null ) { 908 // B in A, from parent to child 909 $sep = $state->getOrigSrc( $dsrA->innerStart(), $dsrB->start ); 910 } 911 } elseif ( $dsrA->end <= $dsrB->start ) { 912 // B following A (siblingish) 913 $sep = $state->getOrigSrc( $dsrA->end, $dsrB->start ); 914 } elseif ( ( $dsrB->closeWidth ?? null ) !== null ) { 915 // A in B, from child to parent 916 $sep = $state->getOrigSrc( $dsrA->end, $dsrB->innerEnd() ); 917 } 918 } elseif ( $dsrA->end <= $dsrB->end ) { 919 if ( ( $dsrB->closeWidth ?? 
null ) !== null ) { 920 // A in B, from child to parent 921 $sep = $state->getOrigSrc( $dsrA->end, $dsrB->innerEnd() ); 922 } 923 } else { 924 $this->env->log( 'info/html2wt', 'dsr backwards: should not happen!' ); 925 } 926 927 // Reset if $sep is invalid 928 if ( $sep && !WTSUtils::isValidSep( $sep ) ) { 929 $sep = null; 930 } 931 } 932 } elseif ( $origSepNeeded && !DiffUtils::hasDiffMarkers( $prevNode, $this->env ) ) { 933 // Given the following conditions: 934 // - $prevNode has no diff markers. (checked above) 935 // - $prevNode's next non-sep sibling ($next) was inserted. 936 // - $next is an ancestor of $node. 937 // - all of those ancestor nodes from $node->$next have zero-width 938 // wikitext (otherwise, the separator isn't usable) 939 // Try to extract a separator from original source that existed 940 // between $prevNode and its original next sibling or its parent 941 // (if $prevNode was the last non-sep child). 942 // 943 // This minimizes dirty-diffs to that separator text from 944 // the insertion of $next after $prevNode. 
945 $next = DOMUtils::nextNonSepSibling( $prevNode ); 946 $origSepUsable = $next && DiffUtils::hasInsertedDiffMark( $next, $this->env ); 947 948 // Check that $next is an ancestor of $node and all nodes 949 // on that path have zero-width wikitext 950 if ( $origSepUsable && $node !== $next ) { 951 $n = $node->parentNode; 952 while ( $n && $next !== $n ) { 953 if ( !WTUtils::isZeroWidthWikitextElt( $n ) ) { 954 $origSepUsable = false; 955 break; 956 } 957 $n = $n->parentNode; 958 } 959 $origSepUsable = $origSepUsable && $n !== null; 960 } 961 962 // Extract separator from original source if possible 963 if ( $origSepUsable ) { 964 $origNext = DOMUtils::nextNonSepSibling( $next ); 965 if ( !$origNext ) { // $prevNode was last non-sep child of its parent 966 // We could work harder for text/comments and extrapolate, but skipping that here 967 // FIXME: If we had a generic DSR extrapolation utility, that would be useful 968 $o1 = $prevNode instanceof Element ? 969 DOMDataUtils::getDataParsoid( $prevNode )->dsr->end ?? null : null; 970 if ( $o1 !== null ) { 971 $dsr2 = DOMDataUtils::getDataParsoid( $prevNode->parentNode )->dsr ?? null; 972 $o2 = $dsr2 ? $dsr2->innerEnd() : null; 973 $sep = $o2 !== null ? $state->getOrigSrc( $o1, $o2 ) : null; 974 } 975 } elseif ( !DiffUtils::hasDiffMarkers( $origNext, $this->env ) ) { 976 // We could work harder for text/comments and extrapolate, but skipping that here 977 // FIXME: If we had a generic DSR extrapolation utility, that would be useful 978 $o1 = $prevNode instanceof Element ? 979 DOMDataUtils::getDataParsoid( $prevNode )->dsr->end ?? null : null; 980 if ( $o1 !== null ) { 981 $o2 = $origNext instanceof Element ? 982 DOMDataUtils::getDataParsoid( $origNext )->dsr->start ?? null : null; 983 $sep = $o2 !== null ? 
$state->getOrigSrc( $o1, $o2 ) : null; 984 } 985 } 986 987 if ( $sep !== null ) { 988 // Since this is an inserted node, we might have to augment this 989 // with newline constraints and so, we just set this recovered sep 990 // to the buffered sep in state->sep->src 991 $state->sep->src = $sep; 992 $sep = null; 993 } 994 } 995 } 996 997 // If all efforts failed, use special-purpose heuristics to recover 998 // trimmed leading / trailing whitespace from lists, headings, table-cells 999 if ( $sep === null ) { 1000 if ( $sepType === 'parent-child' ) { 1001 $sep = $this->recoverTrimmedWhitespace( $node, true ); 1002 if ( $sep !== null ) { 1003 $state->sep->src = $sep . $state->sep->src; 1004 } 1005 } elseif ( $sepType === 'child-parent' ) { 1006 $sep = $this->recoverTrimmedWhitespace( $node, false ); 1007 if ( $sep !== null ) { 1008 $state->sep->src .= $sep; 1009 } 1010 } else { 1011 $sep = null; 1012 } 1013 } 1014 1015 $this->env->log( 1016 'debug/wts/sep', 1017 static function () use ( $prevNode, $origNode, $sep, $state ) { 1018 return 'maybe-sep | ' . 1019 'prev:' . ( $prevNode ? DOMCompat::nodeName( $prevNode ) : '--none--' ) . 1020 ', node:' . DOMCompat::nodeName( $origNode ) . 1021 ', sep: ' . PHPUtils::jsonEncode( $sep ) . 1022 ', state.sep.src: ' . PHPUtils::jsonEncode( $state->sep->src ?? null ); 1023 } 1024 ); 1025 1026 // If the separator is being emitted before a node that emits sol-transparent WT, 1027 // go through makeSeparator to verify indent-pre constraints are met. 1028 $sepConstraints = $state->sep->constraints ?? [ 'max' => 0 ]; 1029 if ( $sep === null || ( $state->sep->src && $state->sep->src !== $sep ) ) { 1030 if ( !empty( $state->sep->constraints ) || !empty( $state->sep->src ) ) { 1031 // TODO: set modified flag if start or end node (but not both) are 1032 // modified / new so that the selser can use the separator 1033 $sep = $this->makeSeparator( $node, $state->sep->src ?? 
'', $sepConstraints ); 1034 } else { 1035 $sep = null; 1036 } 1037 } 1038 1039 if ( $sep !== null ) { 1040 $sep = self::makeSepIndentPreSafe( $sep, $sepConstraints ); 1041 } 1042 return $sep; 1043 } 1044} 1045