<?php
declare( strict_types = 1 );

namespace Wikimedia\Parsoid\Utils;

use DOMComment;
use DOMElement;
use DOMNode;
use stdClass;
use Wikimedia\Parsoid\Config\Env;
use Wikimedia\Parsoid\Config\WikitextConstants as Consts;
use Wikimedia\Parsoid\Ext\ExtensionTagHandler;
use Wikimedia\Parsoid\Tokens\CommentTk;
use Wikimedia\Parsoid\Wt2Html\Frame;

/**
 * These utilities pertain to extracting / modifying wikitext information from the DOM.
 */
class WTUtils {
	/**
	 * Regexp matching a typeof token that marks the first wrapper node of
	 * encapsulated (transclusion / param / language variant / extension) content.
	 */
	private const FIRST_ENCAP_REGEXP =
		'#(?:^|\s)(mw:(?:Transclusion|Param|LanguageVariant|Extension(/[^\s]+)))(?=$|\s)#D';

	/**
	 * Regexp for checking marker metas typeofs representing
	 * transclusion markup or template param markup.
	 */
	private const TPL_META_TYPE_REGEXP = '#^mw:(?:Transclusion|Param)(?:/End)?$#D';

	/**
	 * Check whether a node's data-parsoid object includes
	 * an indicator that the original wikitext was a literal
	 * HTML element (like table or p)
	 *
	 * @param stdClass $dp
	 * @return bool
	 */
	public static function hasLiteralHTMLMarker( stdClass $dp ): bool {
		return isset( $dp->stx ) && $dp->stx === 'html';
	}

	/**
	 * Run a node through {@link #hasLiteralHTMLMarker}.
	 * Non-element nodes (and null) are never literal HTML nodes.
	 *
	 * @param DOMNode|null $node
	 * @return bool
	 */
	public static function isLiteralHTMLNode( ?DOMNode $node ): bool {
		return ( $node &&
			$node instanceof DOMElement &&
			self::hasLiteralHTMLMarker( DOMDataUtils::getDataParsoid( $node ) ) );
	}

	/**
	 * Is `$node` one of the zero-width wikitext tags (per
	 * Consts::$ZeroWidthWikitextTags) that was not written as literal HTML?
	 *
	 * @param DOMNode $node
	 * @return bool
	 */
	public static function isZeroWidthWikitextElt( DOMNode $node ): bool {
		return isset( Consts::$ZeroWidthWikitextTags[$node->nodeName] ) &&
			!self::isLiteralHTMLNode( $node );
	}

	/**
	 * Is `$node` a block node that is also visible in wikitext?
	 * An example of an invisible block node is a `<p>`-tag that
	 * Parsoid generated, or a `<ul>`, `<ol>` tag.
	 *
	 * @param DOMNode $node
	 * @return bool
	 */
	public static function isBlockNodeWithVisibleWT( DOMNode $node ): bool {
		return DOMUtils::isBlockNode( $node ) && !self::isZeroWidthWikitextElt( $node );
	}

	/**
	 * Helper functions to detect when an A-$node uses [[..]]/[..]/... style
	 * syntax (for wikilinks, ext links, url links). rel-type is not sufficient
	 * anymore since mw:ExtLink is used for all the three link syntaxes.
	 *
	 * @param DOMElement $node
	 * @param stdClass|null $dp data-parsoid of $node if the caller already has it;
	 *   fetched from $node when null.
	 * @return bool
	 */
	public static function usesWikiLinkSyntax( DOMElement $node, ?stdClass $dp ): bool {
		// FIXME: Optimization from ComputeDSR to avoid refetching this property
		// Is it worth the unnecessary code here?
		if ( !$dp ) {
			$dp = DOMDataUtils::getDataParsoid( $node );
		}

		// SSS FIXME: This requires to be made more robust
		// for when dp->stx value is not present
		return $node->getAttribute( "rel" ) === "mw:WikiLink" ||
			( isset( $dp->stx ) && $dp->stx !== "url" && $dp->stx !== "magiclink" );
	}

	/**
	 * Helper function to detect when an A-node uses ext-link syntax.
	 * rel attribute is not sufficient anymore since mw:ExtLink is used for
	 * multiple link types
	 *
	 * @param DOMElement $node
	 * @param stdClass|null $dp data-parsoid of $node if the caller already has it;
	 *   fetched from $node when null.
	 * @return bool
	 */
	public static function usesExtLinkSyntax( DOMElement $node, ?stdClass $dp ): bool {
		// FIXME: Optimization from ComputeDSR to avoid refetching this property
		// Is it worth the unnecessary code here?
		if ( !$dp ) {
			$dp = DOMDataUtils::getDataParsoid( $node );
		}

		// SSS FIXME: This requires to be made more robust
		// for when $dp->stx value is not present
		return $node->getAttribute( "rel" ) === "mw:ExtLink" &&
			( !isset( $dp->stx ) || ( $dp->stx !== "url" && $dp->stx !== "magiclink" ) );
	}

	/**
	 * Helper function to detect when an A-node uses url-link syntax.
	 * rel attribute is not sufficient anymore since mw:ExtLink is used for
	 * multiple link types
	 *
	 * @param DOMElement $node
	 * @param stdClass|null $dp data-parsoid of $node if the caller already has it;
	 *   fetched from $node when null.
	 * @return bool
	 */
	public static function usesURLLinkSyntax( DOMElement $node, ?stdClass $dp = null ): bool {
		// FIXME: Optimization from ComputeDSR to avoid refetching this property
		// Is it worth the unnecessary code here?
		if ( !$dp ) {
			$dp = DOMDataUtils::getDataParsoid( $node );
		}

		// SSS FIXME: This requires to be made more robust
		// for when $dp->stx value is not present
		return $node->getAttribute( "rel" ) === "mw:ExtLink" &&
			isset( $dp->stx ) && $dp->stx === "url";
	}

	/**
	 * Helper function to detect when an A-node uses magic-link syntax.
	 * rel attribute is not sufficient anymore since mw:ExtLink is used for
	 * multiple link types
	 *
	 * @param DOMElement $node
	 * @param stdClass|null $dp data-parsoid of $node if the caller already has it;
	 *   fetched from $node when null.
	 * @return bool
	 */
	public static function usesMagicLinkSyntax( DOMElement $node, ?stdClass $dp = null ): bool {
		if ( !$dp ) {
			$dp = DOMDataUtils::getDataParsoid( $node );
		}

		// SSS FIXME: This requires to be made more robust
		// for when $dp->stx value is not present
		return $node->getAttribute( "rel" ) === "mw:ExtLink" &&
			isset( $dp->stx ) && $dp->stx === "magiclink";
	}

	/**
	 * Check whether a node's typeof indicates that it is a template expansion.
	 *
	 * @param DOMElement $node
	 * @return ?string The matched type, or null if no match.
	 */
	public static function matchTplType( DOMElement $node ): ?string {
		return DOMUtils::matchTypeOf( $node, self::TPL_META_TYPE_REGEXP );
	}

	/**
	 * Check whether a typeof indicates that it signifies an
	 * expanded attribute.
	 *
	 * @param DOMElement $node
	 * @return bool
	 */
	public static function hasExpandedAttrsType( DOMElement $node ): bool {
		return DOMUtils::matchTypeOf( $node, '/^mw:ExpandedAttrs(\/[^\s]+)*$/' ) !== null;
	}

	/**
	 * Check whether a node is a meta tag that signifies a template expansion.
	 *
	 * @param DOMNode $node
	 * @return bool
	 */
	public static function isTplMarkerMeta( DOMNode $node ): bool {
		return DOMUtils::matchNameAndTypeOf( $node, 'meta', self::TPL_META_TYPE_REGEXP ) !== null;
	}

	/**
	 * Check whether a node is a meta signifying the start of a template expansion.
	 *
	 * @param DOMNode $node
	 * @return bool
	 */
	public static function isTplStartMarkerMeta( DOMNode $node ): bool {
		$t = DOMUtils::matchNameAndTypeOf( $node, 'meta', self::TPL_META_TYPE_REGEXP );
		return $t !== null && !preg_match( '#/End$#D', $t );
	}

	/**
	 * Check whether a node is a meta signifying the end of a template
	 * expansion.
	 *
	 * @param DOMNode $node
	 * @return bool
	 */
	public static function isTplEndMarkerMeta( DOMNode $node ): bool {
		$t = DOMUtils::matchNameAndTypeOf( $node, 'meta', self::TPL_META_TYPE_REGEXP );
		return $t !== null && preg_match( '#/End$#D', $t );
	}

	/**
	 * Find the first wrapper element of encapsulated content.
	 *
	 * Walks backwards over preceding (non-deleted) element siblings that share
	 * $node's about-id and returns the earliest one, provided it carries an
	 * encapsulation typeof; otherwise returns null.
	 *
	 * @param DOMNode $node
	 * @return DOMElement|null
	 */
	public static function findFirstEncapsulationWrapperNode( DOMNode $node ): ?DOMElement {
		if ( !self::hasParsoidAboutId( $node ) ) {
			return null;
		}
		/** @var DOMElement $node */
		DOMUtils::assertElt( $node );

		$about = $node->getAttribute( 'about' );
		$prev = $node;
		do {
			$node = $prev;
			$prev = DOMUtils::previousNonDeletedSibling( $node );
		} while (
			$prev &&
			$prev instanceof DOMElement &&
			$prev->getAttribute( 'about' ) === $about
		);
		$elt = self::isFirstEncapsulationWrapperNode( $node ) ? $node : null;
		'@phan-var ?DOMElement $elt'; // @var ?DOMElement $elt
		return $elt;
	}

	/**
	 * This tests whether a DOM $node is a new $node added during an edit session
	 * or an existing $node from parsed wikitext.
	 *
	 * As written, this function can only be used on non-template/extension content
	 * or on the top-level $nodes of template/extension content. This test will
	 * return the wrong results on non-top-level $nodes of template/extension content.
	 *
	 * @param DOMNode $node
	 * @return bool
	 */
	public static function isNewElt( DOMNode $node ): bool {
		// We cannot determine newness on text/comment $nodes.
		if ( !( $node instanceof DOMElement ) ) {
			return false;
		}

		// For template/extension content, newness should be
		// checked on the encapsulation wrapper $node.
		$node = self::findFirstEncapsulationWrapperNode( $node ) ?? $node;
		$dp = DOMDataUtils::getDataParsoid( $node );
		return !empty( $dp->tmp->isNew );
	}

	/**
	 * Check whether a pre is caused by indentation in the original wikitext.
	 * @param DOMNode $node
	 * @return bool
	 */
	public static function isIndentPre( DOMNode $node ): bool {
		return $node->nodeName === "pre" && !self::isLiteralHTMLNode( $node );
	}

	/**
	 * Is $node a `figure-inline` element with an image/video/audio typeof?
	 *
	 * @param DOMNode $node
	 * @return bool
	 */
	public static function isInlineMedia( DOMNode $node ): bool {
		return DOMUtils::matchNameAndTypeOf(
			$node, 'figure-inline', '#^mw:(Image|Video|Audio)($|/)#D'
		) !== null;
	}

	/**
	 * Does $node (of any element name) carry an image/video/audio typeof?
	 *
	 * @param DOMNode $node
	 * @return bool
	 */
	public static function isGeneratedFigure( DOMNode $node ): bool {
		return DOMUtils::matchTypeOf( $node, '#^mw:(Image|Video|Audio)($|/)#' ) !== null;
	}

	/**
	 * Find how much offset is necessary for the DSR of an
	 * indent-originated pre tag.
	 *
	 * @param DOMNode $textNode
	 * @return int Number of newlines in the text that contribute an indent
	 *   character; 0 when the node is detached or its parent is not an indent-pre.
	 */
	public static function indentPreDSRCorrection( DOMNode $textNode ): int {
		// NOTE: This assumes a text-node and doesn't check that it is one.
		//
		// FIXME: Doesn't handle text nodes that are not direct children of the pre
		$parent = $textNode->parentNode;
		// A detached node has no parent; isIndentPre() does not accept null.
		if ( !$parent || !self::isIndentPre( $parent ) ) {
			return 0;
		}

		// We dont want the trailing newline of the last child of the pre
		// to contribute a pre-correction since it doesn't add new content
		// in the pre-node after the text
		$pattern = $parent->lastChild === $textNode ? '/\n./' : '/\n/';
		return (int)preg_match_all( $pattern, $textNode->nodeValue ?? '' );
	}

	/**
	 * Check if $node is an ELEMENT $node belongs to a template/extension.
	 *
	 * NOTE: Use with caution. This technique works reliably for the
	 * root level elements of tpl-content DOM subtrees since only they
	 * are guaranteed to be marked and nested content might not
	 * necessarily be marked.
	 *
	 * @param DOMNode $node
	 * @return bool
	 */
	public static function hasParsoidAboutId( DOMNode $node ): bool {
		if (
			$node instanceof DOMElement &&
			$node->hasAttribute( 'about' )
		) {
			$about = $node->getAttribute( 'about' );
			// SSS FIXME: Verify that our DOM spec clarifies this
			// expectation on about-ids and that our clients respect this.
			return $about && Utils::isParsoidObjectId( $about );
		} else {
			return false;
		}
	}

	/**
	 * Does $node represent a redirect link?
	 *
	 * @param DOMNode $node
	 * @return bool
	 */
	public static function isRedirectLink( DOMNode $node ): bool {
		return $node->nodeName === 'link' &&
			DOMUtils::assertElt( $node ) &&
			preg_match( '#\bmw:PageProp/redirect\b#', $node->getAttribute( 'rel' ) );
	}

	/**
	 * Does $node represent a category link?
	 *
	 * @param DOMNode|null $node
	 * @return bool
	 */
	public static function isCategoryLink( ?DOMNode $node ): bool {
		return $node instanceof DOMElement &&
			$node->nodeName === 'link' &&
			preg_match( '#\bmw:PageProp/Category\b#', $node->getAttribute( 'rel' ) );
	}

	/**
	 * Does $node represent a link that is sol-transparent?
	 *
	 * @param DOMNode $node
	 * @return bool
	 */
	public static function isSolTransparentLink( DOMNode $node ): bool {
		return $node->nodeName === 'link' &&
			DOMUtils::assertElt( $node ) &&
			preg_match( TokenUtils::SOL_TRANSPARENT_LINK_REGEX, $node->getAttribute( 'rel' ) );
	}

	/**
	 * Check if '$node' emits wikitext that is sol-transparent in wikitext form.
	 * This is a test for wikitext that doesn't introduce line breaks.
	 *
	 * Comment, whitespace text $nodes, category links, redirect links, behavior
	 * switches, and include directives currently satisfy this definition.
	 *
	 * This should come close to matching TokenUtils.isSolTransparent()
	 *
	 * @param DOMNode $node
	 * @return bool
	 */
	public static function emitsSolTransparentSingleLineWT( DOMNode $node ): bool {
		if ( DOMUtils::isText( $node ) ) {
			// NB: We differ here to meet the nl condition.
			return (bool)preg_match( '/^[ \t]*$/D', $node->nodeValue );
		} elseif ( self::isRenderingTransparentNode( $node ) ) {
			// NB: The only metas in a DOM should be for behavior switches and
			// include directives, other than explicit HTML meta tags. This
			// differs from our counterpart in Util where ref meta tokens
			// haven't been expanded to spans yet.
			return true;
		} else {
			return false;
		}
	}

	/**
	 * This is the span added to headings to add fallback ids for when legacy
	 * and HTML5 ids don't match up. This prevents broken links to legacy ids.
	 *
	 * @param DOMNode $node
	 * @return bool
	 */
	public static function isFallbackIdSpan( DOMNode $node ): bool {
		return DOMUtils::hasNameAndTypeOf( $node, 'span', 'mw:FallbackId' );
	}

	/**
	 * These are primarily 'metadata'-like $nodes that don't show up in output rendering.
	 * - In Parsoid output, they are represented by link/meta tags.
	 * - In the PHP parser, they are completely stripped from the input early on.
	 * Because of this property, these rendering-transparent $nodes are also
	 * SOL-transparent for the purposes of parsing behavior.
	 *
	 * @param DOMNode $node
	 * @return bool
	 */
	public static function isRenderingTransparentNode( DOMNode $node ): bool {
		// FIXME: Can we change this entire thing to
		// DOMUtils::isComment($node) ||
		// DOMUtils::getDataParsoid($node).stx !== 'html' &&
		// ($node->nodeName === 'meta' || $node->nodeName === 'link')
		//
		return DOMUtils::isComment( $node ) ||
			self::isSolTransparentLink( $node ) || (
				// Catch-all for everything else.
				$node->nodeName === 'meta' &&
				DOMUtils::assertElt( $node ) &&
				(
					// (Start|End)Tag metas clone data-parsoid from the tokens
					// they're shadowing, which trips up on the stx check.
					// TODO: Maybe that data should be nested in a property?
					DOMUtils::matchTypeOf( $node, '/^mw:(StartTag|EndTag)$/' ) !== null ||
					!isset( DOMDataUtils::getDataParsoid( $node )->stx ) ||
					DOMDataUtils::getDataParsoid( $node )->stx !== 'html'
				)
			) || self::isFallbackIdSpan( $node );
	}

	/**
	 * Is $node nested inside a table tag that uses HTML instead of native
	 * wikitext?
	 *
	 * @param DOMNode $node
	 * @return bool
	 */
	public static function inHTMLTableTag( DOMNode $node ): bool {
		$p = $node->parentNode;
		while ( DOMUtils::isTableTag( $p ) ) {
			if ( self::isLiteralHTMLNode( $p ) ) {
				return true;
			} elseif ( $p->nodeName === 'table' ) {
				// Don't cross <table> boundaries
				return false;
			}
			$p = $p->parentNode;
		}

		return false;
	}

	/**
	 * Is $node the first wrapper element of encapsulated content?
	 *
	 * @param DOMNode $node
	 * @return bool
	 */
	public static function isFirstEncapsulationWrapperNode( DOMNode $node ): bool {
		return DOMUtils::matchTypeOf( $node, self::FIRST_ENCAP_REGEXP ) !== null;
	}

	/**
	 * Is $node an encapsulation wrapper elt?
	 *
	 * All root-level $nodes of generated content are considered
	 * encapsulation wrappers and share an about-id.
	 *
	 * @param DOMNode $node
	 * @return bool
	 */
	public static function isEncapsulationWrapper( DOMNode $node ): bool {
		// True if it has an encapsulation type or while walking backwards
		// over elts with identical about ids, we run into a $node with an
		// encapsulation type.
		if ( !( $node instanceof DOMElement ) ) {
			return false;
		}
		return self::findFirstEncapsulationWrapperNode( $node ) !== null;
	}

	/**
	 * Is $node a DOMFragment wrapper?
	 *
	 * @param DOMNode $node
	 * @return bool
	 */
	public static function isDOMFragmentWrapper( DOMNode $node ): bool {
		// See TokenUtils::hasDOMFragmentType
		return DOMUtils::matchTypeOf( $node, '#^mw:DOMFragment(/sealed/\w+)?$#D' ) !== null;
	}

	/**
	 * Is $node a sealed DOMFragment of a specific type?
	 *
	 * @param DOMNode $node
	 * @param string $type
	 * @return bool
	 */
	public static function isSealedFragmentOfType( DOMNode $node, string $type ): bool {
		return DOMUtils::hasTypeOf( $node, "mw:DOMFragment/sealed/$type" );
	}

	/**
	 * Is $node a Parsoid-generated <section> tag?
	 *
	 * @param DOMNode $node
	 * @return bool
	 */
	public static function isParsoidSectionTag( DOMNode $node ): bool {
		return $node->nodeName === 'section' &&
			DOMUtils::assertElt( $node ) &&
			$node->hasAttribute( 'data-mw-section-id' );
	}

	/**
	 * Is the $node from extension content?
	 *
	 * Checks the ancestors of $node (not $node itself) for a
	 * "mw:Extension/$extType" typeof.
	 *
	 * @param DOMNode $node
	 * @param string $extType
	 * @return bool
	 */
	public static function fromExtensionContent( DOMNode $node, string $extType ): bool {
		$parentNode = $node->parentNode;
		while ( $parentNode && !DOMUtils::atTheTop( $parentNode ) ) {
			if ( DOMUtils::hasTypeOf( $parentNode, "mw:Extension/$extType" ) ) {
				return true;
			}
			$parentNode = $parentNode->parentNode;
		}
		return false;
	}

	/**
	 * Compute, when possible, the wikitext source for a $node in
	 * an environment env. Returns null if the source cannot be
	 * extracted.
	 * @param Frame $frame
	 * @param DOMElement $node
	 * @return string|null
	 */
	public static function getWTSource( Frame $frame, DOMElement $node ): ?string {
		$dp = DOMDataUtils::getDataParsoid( $node );
		$dsr = $dp->dsr ?? null;
		// FIXME: We could probably change the null return to ''
		// Just need to verify that code that uses this won't break
		return Utils::isValidDSR( $dsr ) ?
			$dsr->substr( $frame->getSrcText() ) : null;
	}

	/**
	 * Gets all siblings that follow '$node' that have an 'about' as
	 * their about id.
	 *
	 * This is used to fetch transclusion/extension content by using
	 * the about-id as the key. This works because
	 * transclusion/extension content is a forest of dom-trees formed
	 * by adjacent dom-nodes. This is the contract that template
	 * encapsulation, dom-reuse, and VE code all have to abide by.
	 *
	 * The only exception to this adjacency rule is IEW nodes in
	 * fosterable positions (in tables) which are not span-wrapped to
	 * prevent them from getting fostered out.
	 *
	 * @param DOMNode $node
	 * @param string $about
	 * @return DOMNode[]
	 */
	public static function getAboutSiblings( DOMNode $node, string $about ): array {
		$nodes = [ $node ];

		if ( !$about ) {
			return $nodes;
		}

		$node = $node->nextSibling;
		while ( $node && (
			( $node instanceof DOMElement && $node->getAttribute( 'about' ) === $about ) ||
			(
				DOMUtils::isFosterablePosition( $node ) &&
				!DOMUtils::isElt( $node ) &&
				DOMUtils::isIEW( $node )
			)
		) ) {
			$nodes[] = $node;
			$node = $node->nextSibling;
		}

		// Remove already consumed trailing IEW, if any
		while ( count( $nodes ) > 0 && DOMUtils::isIEW( $nodes[count( $nodes ) - 1] ) ) {
			array_pop( $nodes );
		}

		return $nodes;
	}

	/**
	 * This function is only intended to be used on encapsulated $nodes
	 * (Template/Extension/Param content).
	 *
	 * Given a '$node' that has an about-id, it is assumed that it is generated
	 * by templates or extensions. This function skips over all
	 * following content nodes and returns the first non-template node
	 * that follows it.
	 *
	 * @param DOMNode $node
	 * @return DOMNode|null
	 */
	public static function skipOverEncapsulatedContent( DOMNode $node ): ?DOMNode {
		if ( $node instanceof DOMElement && $node->hasAttribute( 'about' ) ) {
			$about = $node->getAttribute( 'about' );
			// Guaranteed not to be empty. It will at least include $node.
			$aboutSiblings = self::getAboutSiblings( $node, $about );
			return end( $aboutSiblings )->nextSibling;
		} else {
			return $node->nextSibling;
		}
	}

	/**
	 * Comment encoding/decoding.
	 *
	 * * Some relevant phab tickets: T94055, T70146, T60184, T95039
	 *
	 * The wikitext comment rule is very simple: <!-- starts a comment,
	 * and --> ends a comment. This means we can have almost anything as the
	 * contents of a comment (except the string "-->", but see below), including
	 * several things that are not valid in HTML5 comments:
	 *
	 * * For one, the html5 comment parsing algorithm [0] leniently accepts
	 * --!> as a closing comment tag, which differs from the php+tidy combo.
	 *
	 * * If the comment's data matches /^-?>/, html5 will end the comment.
	 * For example, <!-->stuff<--> breaks up as
	 * <!--> (the comment) followed by, stuff<--> (as text).
	 *
	 * * Finally, comment data shouldn't contain two consecutive hyphen-minus
	 * characters (--), nor end in a hyphen-minus character (/-$/) as defined
	 * in the spec [1].
	 *
	 * We work around all these problems by using HTML entity encoding inside
	 * the comment body. The characters -, >, and & must be encoded in order
	 * to prevent premature termination of the comment by one of the cases
	 * above. Encoding other characters is optional; all entities will be
	 * decoded during wikitext serialization.
	 *
	 * In order to allow *arbitrary* content inside a wikitext comment,
	 * including the forbidden string "-->" we also do some minimal entity
	 * decoding on the wikitext. We are also limited by our inability
	 * to encode DSR attributes on the comment $node, so our wikitext entity
	 * decoding must be 1-to-1: that is, there must be a unique "decoded"
	 * string for every wikitext sequence, and for every decoded string there
	 * must be a unique wikitext which creates it.
	 *
	 * The basic idea here is to replace every string ab*c with the string with
	 * one more b in it. This creates a string with no instance of "ac",
	 * so you can use 'ac' to encode one more code point. In this case
	 * a is "--&", "b" is "amp;", and "c" is "gt;" and we use ac to
	 * encode "-->" (which is otherwise unspeakable in wikitext).
	 *
	 * Note that any user content which does not match the regular
	 * expression /--(>|&(amp;)*gt;)/ is unchanged in its wikitext
	 * representation, as shown in the first two examples below.
	 *
	 * In the "HTML5 DOM" column, {c} stands for the HTML entity that
	 * Utils::entityEncodeAll emits for the character c.
	 *
	 * User-authored comment text    Wikitext          HTML5 DOM
	 * --------------------------    --------------    ----------------------
	 * & - >                         & - >             {&} {-} {>}
	 * Use &gt; here                 Use &gt; here     Use {&}gt; here
	 * -->                           --&gt;            {-}{-}{>}
	 * --&gt;                        --&amp;gt;        {-}{-}{&}gt;
	 * --&amp;gt;                    --&amp;amp;gt;    {-}{-}{&}amp;gt;
	 *
	 * [0] http://www.w3.org/TR/html5/syntax.html#comment-start-state
	 * [1] http://www.w3.org/TR/html5/syntax.html#comments
	 *
	 * Map a wikitext-escaped comment to an HTML DOM-escaped comment.
	 *
	 * @param string $comment Wikitext-escaped comment.
	 * @return string DOM-escaped comment.
	 */
	public static function encodeComment( string $comment ): string {
		// Undo wikitext escaping to obtain "true value" of comment.
		$trueValue = preg_replace_callback( '/--&(amp;)*gt;/', function ( $m ) {
			return Utils::decodeWtEntities( $m[0] );
		}, $comment );

		// Now encode '-', '>' and '&' in the "true value" as HTML entities,
		// so that they can be safely embedded in an HTML comment.
		// This part doesn't have to map strings 1-to-1.
		// WARNING(T279451): This is actually the part which protects the
		// "-type" key in self::fosterCommentData
		return preg_replace_callback( '/[->&]/', function ( $m ) {
			return Utils::entityEncodeAll( $m[0] );
		}, $trueValue );
	}

	/**
	 * Map an HTML DOM-escaped comment to a wikitext-escaped comment.
	 * @param string $comment DOM-escaped comment.
	 * @return string Wikitext-escaped comment.
	 */
	public static function decodeComment( string $comment ): string {
		// Undo HTML entity escaping to obtain "true value" of comment.
		$trueValue = Utils::decodeWtEntities( $comment );

		// ok, now encode this "true value" of the comment in such a way
		// that the string "-->" never shows up. (See above.)
		return preg_replace_callback( '/--(&(amp;)*gt;|>)/', function ( $m ) {
			$s = $m[0];
			// "-->" is written as "--&gt;"; anything already of the form
			// "--&(amp;)*gt;" gains one extra "amp;" so the mapping stays 1-to-1.
			return $s === '-->' ? '--&gt;' : '--&amp;' . substr( $s, 3 );
		}, $trueValue );
	}

	/**
	 * Utility function: we often need to know the wikitext DSR length for
	 * an HTML DOM comment value.
	 *
	 * @param DOMComment|CommentTk|string $node A comment node containing a DOM-escaped comment.
	 * @return int The wikitext length in UTF-8 bytes necessary to encode this
	 * comment, including 7 characters for the `<!--` and `-->` delimiters.
	 */
	public static function decodedCommentLength( $node ): int {
		// Add 7 for the "<!--" and "-->" delimiters in wikitext.
		if ( $node instanceof DOMComment ) {
			$value = $node->nodeValue;
		} elseif ( $node instanceof CommentTk ) {
			$value = $node->value;
		} else {
			$value = $node;
		}
		return strlen( self::decodeComment( $value ) ) + 7;
	}

	/**
	 * Escape `<nowiki>` tags.
	 *
	 * Opening, closing and self-closing nowiki tags (matched case-insensitively)
	 * have their angle brackets replaced by `&lt;` / `&gt;`; all other text is
	 * left untouched.
	 *
	 * @param string $text
	 * @return string
	 */
	public static function escapeNowikiTags( string $text ): string {
		return preg_replace( '#<(/?nowiki\s*/?\s*)>#i', '&lt;$1&gt;', $text );
	}

	/**
	 * Conditional encoding is because, while treebuilding, the value goes
	 * directly from token to dom node without the comment itself being
	 * stringified and parsed where the comment encoding would be necessary.
	 *
	 * @param string $typeOf
	 * @param array $attrs
	 * @param bool $encode
	 * @return string
	 */
	public static function fosterCommentData( string $typeOf, array $attrs, bool $encode ): string {
		$str = PHPUtils::jsonEncode( [
			// WARNING(T279451): The choice of "-type" as the key is because
			// "-" will be encoded with self::encodeComment when comments come
			// from source wikitext (see the grammar), so we can be sure when
			// reinserting that the comments are internal to Parsoid
			'-type' => $typeOf,
			'attrs' => $attrs
		] );
		if ( $encode ) {
			$str = self::encodeComment( $str );
		}
		return $str;
	}

	/**
	 * Convert a comment holding serialized meta-tag data (as produced by
	 * fosterCommentData) back into a `<meta>` element, replacing the comment
	 * in the DOM.
	 *
	 * @param Env $env
	 * @param DOMNode $node
	 * @param bool $decode Retained for signature compatibility only. The comment
	 *   value is never entity-decoded (see T279451#6981267), whatever is passed.
	 * @return DOMNode|null The new meta element, or null if $node was left alone.
	 */
	public static function reinsertFosterableContent( Env $env, DOMNode $node, bool $decode ):
		?DOMNode {
		if ( !DOMUtils::isComment( $node ) || !preg_match( '/^\{.+\}$/D', $node->nodeValue ) ) {
			return null;
		}

		// XXX(T279451#6981267): Decoding is deliberately never performed here,
		// even though all production uses should already be passing in `false`
		//
		// Convert serialized meta tags back from comments.
		// We use this trick because comments won't be fostered,
		// providing more accurate information about where tags are expected
		// to be found.
		$data = json_decode( $node->nodeValue );
		if ( !is_object( $data ) ) {
			// not a valid json attribute, do nothing
			return null;
		}

		$type = $data->{'-type'} ?? '';
		if ( !is_string( $type ) || !preg_match( '/^mw:/', $type ) ) {
			return null;
		}

		$meta = $node->ownerDocument->createElement( 'meta' );
		foreach ( $data->attrs ?? [] as $attr ) {
			try {
				$meta->setAttribute( ...$attr );
			} catch ( \Exception $e ) {
				$env->log( 'warn', 'prepareDOM: Dropped invalid attribute',
					PHPUtils::jsonEncode( $attr )
				);
			}
		}
		$node->parentNode->replaceChild( $meta, $node );
		return $meta;
	}

	/**
	 * Look up the native extension tag handler for a node whose typeof is
	 * "mw:Extension/<name>".
	 *
	 * @param Env $env
	 * @param DOMNode $node
	 * @return ?ExtensionTagHandler Null when $node has no extension typeof or
	 *   the site config has no implementation for the tag.
	 */
	public static function getNativeExt( Env $env, DOMNode $node ): ?ExtensionTagHandler {
		$match = DOMUtils::matchTypeOf( $node, '/^mw:Extension\/(.+?)$/' );
		if ( $match === null ) {
			return null;
		}
		// Explicit null check above (rather than truthiness) so that a tag
		// name such as "0" is still looked up.
		$matchingTag = substr( $match, strlen( 'mw:Extension/' ) );
		return $env->getSiteConfig()->getExtTagImpl( $matchingTag );
	}
}