<?php
declare( strict_types = 1 );

namespace Wikimedia\Parsoid\Html2Wt;

use Composer\Semver\Semver;
use DOMDocumentFragment;
use DOMElement;
use DOMNode;
use stdClass;
use Wikimedia\Assert\Assert;
use Wikimedia\Parsoid\Config\Env;
use Wikimedia\Parsoid\Core\SelserData;
use Wikimedia\Parsoid\Ext\ParsoidExtensionAPI;
use Wikimedia\Parsoid\Html2Wt\ConstrainedText\ConstrainedText;
use Wikimedia\Parsoid\Utils\DOMDataUtils;
use Wikimedia\Parsoid\Utils\DOMUtils;
use Wikimedia\Parsoid\Utils\PHPUtils;
use Wikimedia\Parsoid\Utils\Utils;
use Wikimedia\Parsoid\Utils\WTUtils;

/**
 * State object for the wikitext serializers.
 */
class SerializerState {

	/**
	 * Per-instance cache for {@link solWikitextRegexp()}.
	 *
	 * The regexp is derived from this state's own site config, so it is
	 * cached on the instance rather than in a function-level static (which
	 * would be shared by every SerializerState in the process).
	 * @var string|null
	 */
	private $solWikitextRegexpCache = null;

	/**
	 * Per-instance cache for {@link solRegexp()}.
	 * @var string|null
	 */
	private $solRegexpCache = null;

	/**
	 * Regexp for checking whether what we have consumed is wikimarkup that has
	 * special meaning at the beginning of the line, and is indeed at the
	 * beginning of the line (modulo comments and other ignored elements).
	 *
	 * Capture group 1 is the sol-transparent prefix, capture group 2 is the
	 * rest of the text starting at the sol-sensitive character.
	 *
	 * @return string
	 */
	private function solWikitextRegexp(): string {
		if ( $this->solWikitextRegexpCache === null ) {
			$sol = PHPUtils::reStrip(
				$this->env->getSiteConfig()->solTransparentWikitextNoWsRegexp(),
				'@'
			);
			$this->solWikitextRegexpCache = '@' .
				'^((?:' . $sol . '|' .
				# SSS FIXME: What about onlyinclude and noinclude?
				'<includeonly>.*?</includeonly>' .
				')*)' .
				'([\ \*#:;{\|!=].*)$' .
				'@D';
		}
		return $this->solWikitextRegexpCache;
	}

	/**
	 * Regexp for checking whether we are at the start of the line (modulo comments and
	 * other ignored elements).
	 *
	 * @return string
	 */
	private function solRegexp(): string {
		if ( $this->solRegexpCache === null ) {
			$sol = PHPUtils::reStrip(
				$this->env->getSiteConfig()->solTransparentWikitextNoWsRegexp(),
				'@'
			);
			$this->solRegexpCache = '@' .
				'(^|\\n)' .
				'(' .
				# SSS FIXME: What about onlyinclude and noinclude?
				'<includeonly>.*?</includeonly>' .
				'|' . $sol .
				')*$' .
				'@D';
		}
		return $this->solRegexpCache;
	}

	/**
	 * Separator information:
	 * - constraints (array<array|int>|null): min/max number of newlines
	 * - src (string|null): collected separator text from DOM text/comment nodes
	 * - lastSourceNode (?DOMNode): Seems to be bookkeeping to make sure we don't reuse
	 *     original separators when `emitChunk` is called
	 *     consecutively on the same node.  However, it also
	 *     differs from `state.prevNode` in that it only gets
	 *     updated when a node calls `emitChunk` so that nodes
	 *     serializing `justChildren` don't mix up `buildSep`.
	 * PORT-FIXME: could use a dedicated class
	 * @var stdClass
	 */
	public $sep;

	/**
	 * Is the serializer at the start of a new wikitext line?
	 * @var bool
	 */
	public $onSOL = true;

	/**
	 * True when wts kicks off, false after the first char has been output
	 * SSS FIXME: Can this be done away with in some way?
	 * @var bool
	 */
	public $atStartOfOutput = true;

	/**
	 * Is the serializer currently handling link content (children of `<a>`)?
	 * @var bool
	 */
	public $inLink = false;

	/**
	 * Is the serializer currently handling caption content?
	 * @var bool
	 */
	public $inCaption = false;

	/**
	 * Is the serializer currently handling an indent-pre tag?
	 * @var bool
	 */
	public $inIndentPre = false;

	/**
	 * Is the serializer currently handling a html-pre tag?
	 * @var bool
	 */
	public $inHTMLPre = false;

	/**
	 * Is the serializer currently handling a tag that the PHP parser
	 * treats as a block tag?
	 * @var bool
	 */
	public $inPHPBlock = false;

	/**
	 * Is the serializer being invoked recursively to serialize a
	 * template-generated attribute (via `WSP.getAttributeValue`'s
	 * template handling).  If so, we should suppress some
	 * serialization escapes, like autolink protection, since
	 * these are not valid for attribute values.
	 * @var bool
	 */
	public $inAttribute = false;

	/**
	 * Is the serializer currently processing a subtree that has been
	 * modified compared to original content (ex: via VE / CX)?
	 *
	 * @var bool
	 */
	public $inModifiedContent;

	/**
	 * Did we introduce nowikis for indent-pre protection?
	 * If yes, we might run a post-pass to strip useless ones.
	 * @var bool
	 */
	public $hasIndentPreNowikis = false;

	/**
	 * Did we introduce nowikis to preserve quote semantics?
	 * If yes, we might run a post-pass to strip useless ones.
	 * @var bool
	 */
	public $hasQuoteNowikis = false;

	/**
	 * Did we introduce `<nowiki />`s?
	 * If yes, we do a postpass to remove unnecessary trailing ones.
	 * @var bool
	 */
	public $hasSelfClosingNowikis = false;

	/**
	 * Did we introduce nowikis around `=.*=` text?
	 * If yes, we do a postpass to remove unnecessary escapes.
	 * @var bool
	 */
	public $hasHeadingEscapes = false;

	/**
	 * Records the nesting level of wikitext tables
	 * @var int
	 */
	public $wikiTableNesting = 0;

	/**
	 * Stack of wikitext escaping handlers -- these handlers are responsible
	 * for smart escaping when the surrounding wikitext context is known.
	 * @var callable[] See {@link serializeChildren()}
	 */
	public $wteHandlerStack = [];

	/**
	 * This array is used by the wikitext escaping algorithm -- represents
	 * a "single line" of output wikitext as represented by a block node in
	 * the DOM.
	 * - firstNode (?DOMNode): first DOM node processed on this line
	 * - text (string): output so far from all nodes on the current line
	 * - chunks (ConstrainedText[]): list of chunks comprising the current line
	 * @var stdClass
	 * XXX: replace with output buffering per line
	 * PORT-FIXME: could use a dedicated class
	 */
	public $currLine;

	/**
	 * Stack used to enforce single-line context
	 * @var SingleLineContext
	 */
	public $singleLineContext;

	/**
	 * Text to be emitted at the start of file, for redirects
	 * @var string|null
	 */
	public $redirectText = null;

	/** @var WikitextSerializer */
	public $serializer;

	/** @var ParsoidExtensionAPI */
	public $extApi;

	/** @var string The serialized output */
	public $out = '';

	/**
	 * Whether to use heuristics to determine if a list item, heading, table cell, etc.
	 * should have whitespace inserted after the "*#=|!" wikitext chars? This is normally
	 * true by default, but not so if HTML content version is older than 1.7.0.
	 * In practice, we are now at version 2.1, but Flow stores HTML, so till Flow migrates
	 * all its content over to a later version, we need a boolean flag.
	 * @var bool
	 */
	public $useWhitespaceHeuristics;

	/**
	 * Are we in selective serialization mode?
	 * @see SelectiveSerializer
	 * @var bool
	 */
	public $selserMode;

	/** @var SelserData */
	private $selserData;

	/**
	 * If in selser mode, while processing a node, do we know if
	 * its previous node has not been modified in an edit?
	 * @var bool
	 */
	public $prevNodeUnmodified;

	/**
	 * If in selser mode, while processing a node, do we know if
	 * it has not been modified in an edit?
	 * @var bool
	 */
	public $currNodeUnmodified;

	/**
	 * Should we run the wikitext escaping code on the wikitext chunk
	 * that will be emitted?
	 *
	 * This is a one-shot flag: it defaults to false and {@link emitChunk()}
	 * resets it to false after escaping a chunk, so it has to be set again
	 * for every chunk that needs escaping.
	 * @var bool
	 */
	public $escapeText = false;

	/**
	 * Used as fast patch for special protected characters in WikitextEscapeHandlers and
	 * comes from LanguageVariantHandler
	 * @var string|null
	 */
	public $protect;

	/** @var Separators */
	public $separators;

	/** @var Env */
	private $env;

	/**
	 * The node most recently passed to {@link updateModificationFlags()}.
	 * May be any DOM node (e.g. <body>, an element or a text node).
	 * @var DOMNode|null
	 */
	private $prevNode;

	/**
	 * Log prefix to use in trace output
	 * @var string
	 */
	private $logPrefix = 'OUT:';

	/**
	 * Whether the input content version satisfies ">=2.1.1" (computed in the
	 * constructor).
	 * NOTE(review): presumably this is the version from which DSR offsets
	 * account for trimmed whitespace -- confirm against Separators.
	 * @var bool
	 */
	public $haveTrimmedWsDSR = false;

	/**
	 * @param WikitextSerializer $serializer
	 * @param array $options Every entry that is not an Env instance is cloned
	 *   and assigned to the property of the same name.
	 */
	public function __construct( WikitextSerializer $serializer, array $options = [] ) {
		$this->env = $serializer->env;
		$this->serializer = $serializer;
		$this->extApi = new ParsoidExtensionAPI( $this->env, [ 'html2wt' => [ 'state' => $this ] ] );
		foreach ( $options as $name => $option ) {
			// PORT-FIXME validate
			if ( !( $option instanceof Env ) ) {
				$this->$name = Utils::clone( $option );
			}
		}
		$this->resetCurrLine( null );
		$this->singleLineContext = new SingleLineContext();
		$this->resetSep();
		$this->haveTrimmedWsDSR = Semver::satisfies( $this->env->getInputContentVersion(), '>=2.1.1' );
		$this->separators = new Separators( $this->env, $this );
	}

	/**
	 * @note Porting note: this replaces direct access
	 * @return Env
	 */
	public function getEnv(): Env {
		return $this->env;
	}

	/**
	 * Initialize a few boolean flags based on serialization mode.
	 * FIXME: Ideally, this should be private. Requires shuffling around
	 * where SerializerState is constructed so that $selserMode is known
	 * at the time of construction.
	 * @private for use by WikitextSerializer only
	 * @param bool $selserMode Are we running selective serialization?
	 */
	public function initMode( bool $selserMode ): void {
		$this->useWhitespaceHeuristics =
			Semver::satisfies( $this->env->getInputContentVersion(), '>=1.7.0' );
		$this->selserMode = $selserMode;
	}

	/**
	 * Appends the separator source and updates the SOL state if necessary.
	 * @param string $src
	 * @param DOMNode $node
	 */
	public function appendSep( string $src, DOMNode $node ): void {
		// Use ?? (not ?:) so that only a missing source is defaulted; a
		// falsy-but-present string such as '0' must not be discarded.
		$this->sep->src = ( $this->sep->src ?? '' ) . $src;
		$this->sepIntroducedSOL( $src, $node );
	}

	/**
	 * Cycle the state after processing a node.
	 * @param DOMNode $node
	 */
	public function updateSep( DOMNode $node ): void {
		$this->sep->lastSourceNode = $node;
	}

	/**
	 * Reset the separator state to an empty separator with no constraints,
	 * no collected source and no last source node.
	 */
	private function resetSep(): void {
		$this->sep = PHPUtils::arrayToObject( [
			'constraints' => null,
			'src' => null,
			'lastSourceNode' => null,
		] );
	}

	/**
	 * Reset the current line state.
	 * @param ?DOMNode $node First node of the new line
	 */
	private function resetCurrLine( ?DOMNode $node ): void {
		$this->currLine = (object)[
			'text' => '',
			'chunks' => [],
			'firstNode' => $node
		];
	}

	/**
	 * Process and emit a line of ConstrainedText chunks, adjusting chunk boundaries as necessary.
	 * (Start of line and end of line are always safe for ConstrainedText chunks, so we don't need
	 * to buffer more than the last line.)
	 */
	private function flushLine(): void {
		$this->out .= ConstrainedText::escapeLine( $this->currLine->chunks );
		$this->currLine->chunks = [];
	}

	/**
	 * Extracts a subset of the page source bound by the supplied indices.
	 * @param int $start Start offset, in bytes
	 * @param int $end End offset, in bytes
	 * @return string|null The source substring, or null if the range is
	 *   inverted or starts beyond the end of the original source.
	 */
	public function getOrigSrc( int $start, int $end ): ?string {
		Assert::invariant( $this->selserMode, 'SerializerState::$selserMode must be set' );
		if (
			$start <= $end &&
			// FIXME: Having a $start greater than the source length is
			// probably a canary for corruption.  Maybe we should be throwing
			// here instead.  See T240053
			$start <= strlen( $this->selserData->oldText )
		) {
			return substr( $this->selserData->oldText, $start, $end - $start );
		} else {
			return null;
		}
	}

	/**
	 * Shift the "current node" modification flag into the "previous node"
	 * slot, clear the current flag, and record $node as the previous node.
	 * @param DOMNode $node
	 */
	public function updateModificationFlags( DOMNode $node ): void {
		$this->prevNodeUnmodified = $this->currNodeUnmodified;
		$this->currNodeUnmodified = false;
		$this->prevNode = $node;
	}

	/**
	 * Separators put us in SOL state.
	 * @param string $sep
	 * @param DOMNode $node
	 */
	private function sepIntroducedSOL( string $sep, DOMNode $node ): void {
		// Don't get tripped by newlines in comments!  Be wary of nowikis added
		// by makeSepIndentPreSafe on the last line.
		$nonCommentSep = preg_replace( Utils::COMMENT_REGEXP, '', $sep );
		if ( substr( $nonCommentSep, -1 ) === "\n" ) {
			// Since we are stashing away newlines for emitting
			// before the next element, we are in SOL state wrt
			// the content of that next element.
			//
			// FIXME: The only serious caveat is if all these newlines
			// will get stripped out in the context of any parent node
			// that suppress newlines (ex: <li> nodes that are forcibly
			// converted to non-html wikitext representation -- newlines
			// will get suppressed in those context). We currently don't
			// handle arbitrary HTML which cause these headaches. And,
			// in any case, we might decide to emit such HTML as native
			// HTML to avoid these problems. To be figured out later when
			// it is a real issue.
			$this->onSOL = true;
		}

		// Any newline outside a comment ends the buffered line.
		if ( strpos( $nonCommentSep, "\n" ) !== false ) {
			// process escapes in our full line
			$this->flushLine();
			$this->resetCurrLine( $node );
		}
	}

	/**
	 * Accumulates chunks on the current line.
	 * @param ConstrainedText $chunk
	 * @param string $logPrefix
	 */
	private function pushToCurrLine( ConstrainedText $chunk, string $logPrefix ): void {
		// Emitting text that has not been escaped
		$this->currLine->text .= $chunk->text;

		$this->currLine->chunks[] = $chunk;

		$this->serializer->trace( '--->', $logPrefix, function () use ( $chunk ) {
			return PHPUtils::jsonEncode( $chunk->text );
		} );
	}

	/**
	 * Pushes the separator to the current line and resets the separator state.
	 * @param string $sep
	 * @param DOMNode $node
	 * @param string $debugPrefix
	 */
	private function emitSep( string $sep, DOMNode $node, string $debugPrefix ): void {
		$sep = ConstrainedText::cast( $sep, $node );

		// Replace newlines if we're in a single-line context
		if ( $this->singleLineContext->enforced() ) {
			$sep->text = str_replace( "\n", ' ', $sep->text );
		}

		$this->pushToCurrLine( $sep, $debugPrefix );
		$this->sepIntroducedSOL( $sep->text, $node );

		// Reset separator state
		$this->resetSep();
		$this->updateSep( $node );
	}

	/**
	 * Determines if we can use the original separator for this node or if we
	 * need to build one based on its constraints, and then emits it.
	 *
	 * @param DOMNode $node
	 */
	private function emitSepForNode( DOMNode $node ): void {
		/* When block nodes are deleted, the deletion affects whether unmodified
		 * newline separators between a pair of unmodified P tags can be reused.
		 *
		 * Example:
		 * ```
		 * Original WT  : "<div>x</div>foo\nbar"
		 * Original HTML: "<div>x</div><p>foo</p>\n<p>bar</p>"
		 * Edited HTML  : "<p>foo</p>\n<p>bar</p>"
		 * Annotated DOM: "<mw:DiffMarker is-block><p>foo</p>\n<p>bar</p>"
		 * Expected WT  : "foo\n\nbar"
		 * ```
		 *
		 * Note the additional newline between "foo" and "bar" even though originally,
		 * there was just a single newline.
		 *
		 * So, even though the two P tags and the separator between them is
		 * unmodified, it is insufficient to rely on just that. We have to look at
		 * what has happened on the two wikitext lines onto which the two P tags
		 * will get serialized.
		 *
		 * Now, if you check the code for `nextToDeletedBlockNodeInWT`, that code is
		 * not really looking at ALL the nodes before/after the nodes that could
		 * serialize onto the wikitext lines. It is looking at the immediately
		 * adjacent nodes, i.e. it is not necessary to look if a block-tag was
		 * deleted 2 or 5 siblings away. If we had to actually examine all of those,
		 * nodes, this would get very complex, and it would be much simpler to just
		 * discard the original separators => potentially lots of dirty diffs.
		 *
		 * To understand why it is sufficient (for correctness) to examine just
		 * the immediately adjacent nodes, let us look at an additional example.
		 * ```
		 * Original WT  : "a<div>b</div>c<div>d</div>e\nf"
		 * Original HTML: "<p>a</p><div>b</div><p>c</p><div>d</div><p>e</p>\n<p>f</p>"
		 * ```
		 * Note how `<block>` tags and `<p>` tags interleave in the HTML. This would be
		 * the case always no matter how much inline content showed up between the
		 * block tags in wikitext. If the b-`<div>` was deleted, we don't care
		 * about it, since we still have the d-`<div>` before the P tag that preserves
		 * the correctness of the single `"\n"` separator. If the d-`<div>` was deleted,
		 * we conservatively ignore the original separator and let normal P-P constraints
		 * take care of it. At worst, we might generate a dirty diff in this scenario. */
		$again = ( $node === $this->sep->lastSourceNode );
		$origSepUsable = !$again &&
			(
				// first-content-node of <body> ($this->prevNode)
				(
					DOMUtils::isBody( $this->prevNode ) &&
					$node->parentNode === $this->prevNode
				)
				||
				// unmodified sibling node of $this->prevNode
				(
					$this->prevNode && $this->prevNodeUnmodified &&
					$node->parentNode === $this->prevNode->parentNode &&
					!WTSUtils::nextToDeletedBlockNodeInWT( $this->prevNode, true )
				)
			) &&
			$this->currNodeUnmodified && !WTSUtils::nextToDeletedBlockNodeInWT( $node, false );

		$origSep = null;
		if ( $origSepUsable ) {
			if ( DOMUtils::isElt( $this->prevNode ) && DOMUtils::isElt( $node ) ) {
				'@phan-var DOMElement $node'; /** @var DOMElement $node */
				$origSep = $this->getOrigSrc(
					// <body> won't have DSR in body_only scenarios
					( DOMUtils::isBody( $this->prevNode ) ?
						0 : DOMDataUtils::getDataParsoid( $this->prevNode )->dsr->end ),
					DOMDataUtils::getDataParsoid( $node )->dsr->start
				);
			} elseif ( $this->sep->src && WTSUtils::isValidSep( $this->sep->src ) ) {
				// We don't know where '$this->sep->src' comes from. So, reuse it
				// only if it is a valid separator string.
				$origSep = $this->sep->src;
			}
		}

		if ( $origSep !== null ) {
			$this->emitSep( $origSep, $node, 'ORIG-SEP:' );
		} else {
			$sep = $this->separators->buildSep( $node );
			$this->emitSep( $sep ?: '', $node, 'SEP:' );
		}
	}

	/**
	 * Recovers any trimmed whitespace for $node, traces it and returns it.
	 * @param DOMNode $node
	 * @param bool $leading
	 *   if true, trimmed leading whitespace is recovered
	 *   if false, trimmed trailing whitespace is recovered
	 * @return string|null
	 */
	public function recoverTrimmedWhitespace( DOMNode $node, bool $leading ): ?string {
		$sep = $this->separators->recoverTrimmedWhitespace( $node, $leading );
		$this->serializer->trace( '--->', "TRIMMED-SEP:", function () use ( $sep ) {
			return PHPUtils::jsonEncode( $sep );
		} );
		return $sep;
	}

	/**
	 * Pushes the chunk to the current line.
	 * @param ConstrainedText|string $res
	 * @param DOMNode $node
	 */
	public function emitChunk( $res, DOMNode $node ): void {
		$res = ConstrainedText::cast( $res, $node );

		// Replace newlines if we're in a single-line context
		if ( $this->singleLineContext->enforced() ) {
			$res->text = str_replace( "\n", ' ', $res->text );
		}

		// Emit separator first
		if ( $res->noSep ) {
			/* skip separators for internal tokens from SelSer */
			if ( $this->onSOL ) {
				// process escapes in our full line
				$this->flushLine();
				$this->resetCurrLine( $node );
			}
		} else {
			$this->emitSepForNode( $node );
		}

		// Escape 'res' if necessary
		if ( $this->escapeText ) {
			$res = new ConstrainedText( [
				'text' => $this->serializer->escapeWikiText( $this, $res->text, [
					'node' => $node,
					'isLastChild' => DOMUtils::nextNonDeletedSibling( $node ) === null,
				] ),
				'prefix' => $res->prefix,
				'suffix' => $res->suffix,
				'node' => $res->node,
			] );
			// One-shot flag: must be re-armed for the next chunk that needs escaping.
			$this->escapeText = false;
		} else {
			// If 'res' is coming from selser and the current node is a paragraph tag,
			// check if 'res' might need some leading chars nowiki-escaped before being output.
			// Because of block-tag p-wrapping behavior, sol-sensitive characters that used to
			// be in non-sol positions, but yet wrapped in p-tags, could end up in sol-position
			// if those block tags get deleted during edits.
			//
			// Ex: a<div>foo</div>*b
			// -- wt2html --> <p>a</p><div>foo</div><p>*b</p>
			// --   EDIT  --> <p>a</p><p>*b</p>
			// -- html2wt --> a\n\n<nowiki>*</nowiki>b
			//
			// In this scenario, the <p>a</p> and <p>*b</p>
			// will be marked unmodified and will be processed below.
			if ( $this->selserMode
				&& $this->onSOL
				&& $this->currNodeUnmodified
				// 'node' came from original Parsoid HTML unmodified. So, if its content
				// needs nowiki-escaping, we know that the reason it didn't parse into
				// lists/headings/whatever is because it didn't occur at the start of the
				// line => it had a block-tag in the original wikitext. So if the previous
				// node was also unmodified (and since it also came from original Parsoid
				// HTML), we can safely infer that it couldn't have been an inline node or
				// a P-tag (if it were, the p-wrapping code would have swallowed that content
				// into 'node'). So, it would have to be some sort of block tag => this.onSOL
				// couldn't have been true (because we could have serialized 'node' on the
				// same line as the block tag) => we can save some effort by eliminating
				// scenarios where 'this.prevNodeUnmodified' is true.
				&& !$this->prevNodeUnmodified
				&& $node->nodeName === 'p' && !WTUtils::isLiteralHTMLNode( $node )
			) {
				$pChild = DOMUtils::firstNonSepChild( $node );
				// If a text node, we have to make sure that the text doesn't
				// get reparsed as non-text in the wt2html pipeline.
				if ( $pChild && DOMUtils::isText( $pChild ) ) {
					$match = $res->matches( $this->solWikitextRegexp() );
					if ( $match && isset( $match[2] ) ) {
						if ( preg_match( '/^([\*#:;]|{\||.*=$)/D', $match[2] )
							// ! and | chars are harmless outside tables
							|| ( preg_match( '/^[\|!]/', $match[2] ) && $this->wikiTableNesting > 0 )
							// indent-pres are suppressed inside <blockquote>
							|| ( preg_match( '/^ [^\s]/', $match[2] )
								&& !DOMUtils::hasNameOrHasAncestorOfName( $node, 'blockquote' ) )
						) {
							// Wrap only the first (sol-sensitive) character in <nowiki>.
							$res = ConstrainedText::cast( ( $match[1] ?: '' )
								. '<nowiki>' . substr( $match[2], 0, 1 ) . '</nowiki>'
								. substr( $match[2], 1 ), $node );
						}
					}
				}
			}
		}

		// Output res
		$this->pushToCurrLine( $res, $this->logPrefix );

		// Update sol flag. Test for newlines followed by optional includeonly or comments
		if ( !$res->matches( $this->solRegexp() ) ) {
			$this->onSOL = false;
		}

		// We've emitted something so we're no longer at SOO.
		$this->atStartOfOutput = false;
	}

	/**
	 * Serialize the children of a DOM node, sharing the global serializer state.
	 * Typically called by a DOM-based handler to continue handling its children.
	 * @param DOMElement|DOMDocumentFragment $node
	 * @param ?callable $wtEscaper ( $state, $text, $opts )
	 *   PORT-FIXME document better; should this be done via WikitextEscapeHandlers somehow?
	 * @param ?DOMNode $firstChild Child to start from; defaults to $node's first child
	 */
	public function serializeChildren(
		DOMNode $node, ?callable $wtEscaper = null, ?DOMNode $firstChild = null
	): void {
		// SSS FIXME: Unsure if this is the right thing always
		if ( $wtEscaper ) {
			$this->wteHandlerStack[] = $wtEscaper;
		}

		$child = $firstChild ?: $node->firstChild;
		while ( $child !== null ) {
			// We always get the next child to process
			$child = $this->serializer->serializeNode( $child );
		}

		if ( $wtEscaper ) {
			array_pop( $this->wteHandlerStack );
		}

		// If we serialized children explicitly,
		// we were obviously processing a modified node.
		$this->currNodeUnmodified = false;
	}

	/**
	 * Abstracts some steps taken in `serializeChildrenToString` and `serializeDOM`
	 *
	 * @param DOMElement|DOMDocumentFragment $node
	 * @param ?callable $wtEscaper See {@link serializeChildren()}
	 * @internal For use by WikitextSerializer only
	 */
	public function kickOffSerialize(
		DOMNode $node, ?callable $wtEscaper = null
	): void {
		$this->updateSep( $node );
		$this->currNodeUnmodified = false;
		$this->updateModificationFlags( $node );
		$this->resetCurrLine( $node->firstChild );
		$this->serializeChildren( $node, $wtEscaper );
		// Emit child-parent seps.
		$this->emitSepForNode( $node );
		// We've reached EOF, flush the remaining buffered text.
		$this->flushLine();
	}

	/**
	 * Serialize children to a string
	 *
	 * FIXME(arlorla): Shouldn't affect the separator state, but accidents
	 * have been known to happen. T109793 suggests using its own wts / state.
	 *
	 * @param DOMElement|DOMDocumentFragment $node
	 * @param ?callable $wtEscaper See {@link serializeChildren()}
	 * @param string $inState Name of the boolean context flag to raise while
	 *   serializing (one of inLink, inCaption, inIndentPre, inHTMLPre,
	 *   inPHPBlock, inAttribute)
	 * @return string
	 */
	private function serializeChildrenToString(
		DOMNode $node, ?callable $wtEscaper, string $inState
	): string {
		$states = [ 'inLink', 'inCaption', 'inIndentPre', 'inHTMLPre', 'inPHPBlock', 'inAttribute' ];
		Assert::parameter( in_array( $inState, $states, true ), '$inState', 'Must be one of: '
			. implode( ', ', $states ) );
		// FIXME: Make sure that the separators emitted here conform to the
		// syntactic constraints of syntactic context.
		$oldSep = $this->sep;
		$oldSOL = $this->onSOL;
		$oldOut = $this->out;
		$oldStart = $this->atStartOfOutput;
		$oldCurrLine = $this->currLine;
		$oldLogPrefix = $this->logPrefix;
		// Remember the flag's previous value: this method can be re-entered
		// with the same $inState (nested content), and the outer context
		// must still see its flag set once the inner call returns.
		$oldInState = $this->$inState;
		// Modification flags
		$oldPrevNodeUnmodified = $this->prevNodeUnmodified;
		$oldCurrNodeUnmodified = $this->currNodeUnmodified;
		$oldPrevNode = $this->prevNode;

		$this->out = '';
		$this->logPrefix = 'OUT(C):';
		$this->resetSep();
		$this->onSOL = false;
		$this->atStartOfOutput = false;
		$this->$inState = true;

		$this->singleLineContext->disable();
		$this->kickOffSerialize( $node, $wtEscaper );
		$this->singleLineContext->pop();

		// restore the state
		$bits = $this->out;
		$this->out = $oldOut;
		$this->$inState = $oldInState;
		$this->sep = $oldSep;
		$this->onSOL = $oldSOL;
		$this->atStartOfOutput = $oldStart;
		$this->currLine = $oldCurrLine;
		$this->logPrefix = $oldLogPrefix;
		// Modification flags
		$this->prevNodeUnmodified = $oldPrevNodeUnmodified;
		$this->currNodeUnmodified = $oldCurrNodeUnmodified;
		$this->prevNode = $oldPrevNode;
		return $bits;
	}

	/**
	 * Serialize children of a link to a string
	 * @param DOMElement|DOMDocumentFragment $node
	 * @param ?callable $wtEscaper See {@link serializeChildren()}
	 * @return string
	 */
	public function serializeLinkChildrenToString(
		DOMNode $node, ?callable $wtEscaper = null
	): string {
		return $this->serializeChildrenToString( $node, $wtEscaper, 'inLink' );
	}

	/**
	 * Serialize children of a caption to a string
	 * @param DOMElement|DOMDocumentFragment $node
	 * @param ?callable $wtEscaper See {@link serializeChildren()}
	 * @return string
	 */
	public function serializeCaptionChildrenToString(
		DOMNode $node, ?callable $wtEscaper = null
	): string {
		return $this->serializeChildrenToString( $node, $wtEscaper, 'inCaption' );
	}

	/**
	 * Serialize children of an indent-pre to a string
	 * @param DOMElement|DOMDocumentFragment $node
	 * @param ?callable $wtEscaper See {@link serializeChildren()}
	 * @return string
	 */
	public function serializeIndentPreChildrenToString(
		DOMNode $node, ?callable $wtEscaper = null
	): string {
		return $this->serializeChildrenToString( $node, $wtEscaper, 'inIndentPre' );
	}

}