<?php
declare( strict_types = 1 );

namespace Wikimedia\Parsoid\Config;

use DOMDocument;
use DOMElement;
use DOMNode;
use Wikimedia\Parsoid\Core\ContentModelHandler;
use Wikimedia\Parsoid\Core\ResourceLimitExceededException;
use Wikimedia\Parsoid\Logger\ParsoidLogger;
use Wikimedia\Parsoid\Parsoid;
use Wikimedia\Parsoid\Tokens\Token;
use Wikimedia\Parsoid\Utils\DataBag;
use Wikimedia\Parsoid\Utils\DOMCompat;
use Wikimedia\Parsoid\Utils\DOMUtils;
use Wikimedia\Parsoid\Utils\Title;
use Wikimedia\Parsoid\Utils\TitleException;
use Wikimedia\Parsoid\Utils\TitleNamespace;
use Wikimedia\Parsoid\Utils\TokenUtils;
use Wikimedia\Parsoid\Utils\Utils;
use Wikimedia\Parsoid\Wt2Html\Frame;
use Wikimedia\Parsoid\Wt2Html\PageConfigFrame;
use Wikimedia\Parsoid\Wt2Html\ParserPipelineFactory;
use Wikimedia\Parsoid\Wt2Html\TT\Sanitizer;

// phpcs:disable MediaWiki.Commenting.FunctionComment.MissingDocumentationPublic

/**
 * Environment/Envelope class for Parsoid
 *
 * Carries around the SiteConfig and PageConfig during an operation
 * and provides certain other services.
 */
class Env {

	/** @var SiteConfig */
	private $siteConfig;

	/** @var PageConfig */
	private $pageConfig;

	/** @var DataAccess */
	private $dataAccess;

	/**
	 * The top-level frame for this conversion. This largely wraps the
	 * PageConfig.
	 *
	 * XXX In the future, perhaps replace PageConfig with the Frame, and
	 * add $this->currentFrame (relocated from TokenTransformManager) if/when
	 * we've removed async parsing.
	 *
	 * @var Frame
	 */
	public $topFrame;

	/**
	 * @var bool Are data accesses disabled?
	 *
	 * FIXME: This can probably be moved to a NoDataAccess instance, rather than
	 * being an explicit mode of Parsoid. See T229469
	 */
	private $noDataAccess;

	/**
	 * @var bool Are we using native template expansion?
	 *
	 * Parsoid implements native template expansion, which is currently
	 * only used during parser tests; in production, template expansion
	 * is done via MediaWiki's legacy preprocessor.
	 *
	 * FIXME: Hopefully this distinction can be removed when we're entirely
	 * in PHP land.
	 */
	private $nativeTemplateExpansion;

	/** @phan-var array<string,int> */
	private $wt2htmlUsage = [];

	/** @phan-var array<string,int> */
	private $html2wtUsage = [];

	/** @var DOMDocument[] */
	private $liveDocs = [];

	/** @var bool */
	private $wrapSections = true;

	/** @var string */
	private $requestOffsetType = 'byte';

	/** @var string */
	private $currentOffsetType = 'byte';

	/** @var array<string,mixed> */
	private $behaviorSwitches = [];

	/**
	 * Maps fragment id to the fragment forest (array of DOMNodes).
	 * @var array<string,DOMNode[]>
	 */
	private $fragmentMap = [];

	/**
	 * @var int used to generate fragment ids as needed during parse
	 */
	private $fid = 1;

	/** @var int used to generate uids as needed during this parse */
	private $uid = 1;

	/** @var array[] Lints recorded */
	private $lints = [];

	/** @var bool logLinterData */
	public $logLinterData = false;

	/** @var bool[] */
	private $traceFlags;

	/** @var bool[] */
	private $dumpFlags;

	/** @var bool[] */
	private $debugFlags;

	/** @var ParsoidLogger */
	private $parsoidLogger;

	/** @var float */
	public $startTime;

	/** @var bool */
	private $scrubWikitext = false;

	/**
	 * The default content version that Parsoid assumes it's serializing or
	 * updating in the pb2pb endpoints
	 *
	 * @var string
	 */
	private $inputContentVersion;

	/**
	 * The default content version that Parsoid will generate.
	 *
	 * @var string
	 */
	private $outputContentVersion;

	/**
	 * If non-null, the language variant used for Parsoid HTML;
	 * we convert to this if wt2html, or from this if html2wt.
	 * @var string|null
	 */
	private $htmlVariantLanguage;

	/**
	 * If non-null, the language variant to be used for wikitext.
	 * If null, heuristics will be used to identify the original wikitext variant
	 * in wt2html mode, and in html2wt mode new or edited HTML will be left unconverted.
	 * @var string|null
	 */
	private $wtVariantLanguage;

	/** @var ParserPipelineFactory */
	private $pipelineFactory;

	/**
	 * FIXME Used in DedupeStyles::dedupe()
	 * @var array
	 */
	public $styleTagKeys = [];

	/** @var bool */
	public $pageBundle = false;

	/** @var bool */
	public $discardDataParsoid = false;

	/**
	 * Original DOM (body); unset until setOrigDOM() is called.
	 * @var DOMElement|null
	 */
	private $origDOM;

	/**
	 * DOM diff document; unset until setDOMDiff() is called.
	 * @var DOMDocument|null
	 */
	private $domDiff;

	/**
	 * Page properties (module resources primarily) that need to be output
	 * @var array
	 */
	private $outputProps = [];

	/**
	 * PORT-FIXME: public currently
	 * Cache of wikitext source for a title
	 * @var array
	 */
	public $pageCache = [];

	/**
	 * PORT-FIXME: public currently
	 * HTML Cache of expanded transclusions to support
	 * reusing expansions from HTML of previous revision.
	 * @var array
	 */
	public $transclusionCache = [];

	/**
	 * PORT-FIXME: public currently
	 * HTML Cache of expanded media wikitext to support
	 * reusing expansions from HTML of previous revision.
	 * @var array
	 */
	public $mediaCache = [];

	/**
	 * PORT-FIXME: public currently
	 * HTML Cache of expanded extension tags to support
	 * reusing expansions from HTML of previous revision.
	 * @var array
	 */
	public $extensionCache = [];

	/**
	 * @param SiteConfig $siteConfig
	 * @param PageConfig $pageConfig
	 * @param DataAccess $dataAccess
	 * @param array|null $options
	 *  - wrapSections: (bool) Whether `<section>` wrappers should be added.
	 *  - pageBundle: (bool) Sets ids on nodes and stores data-* attributes in a JSON blob.
	 *  - scrubWikitext: (bool) Indicates emit "clean" wikitext.
	 *  - traceFlags: (array) Flags indicating which components need to be traced
	 *  - dumpFlags: (bool[]) Dump flags
	 *  - debugFlags: (bool[]) Debug flags
	 *  - noDataAccess: boolean
	 *  - nativeTemplateExpansion: boolean
	 *  - discardDataParsoid: boolean
	 *  - offsetType: 'byte' (default), 'ucs2', 'char'
	 *                See `Parsoid\Wt2Html\PP\Processors\ConvertOffsets`.
	 *  - logLinterData: (bool) Should we log linter data if linting is enabled?
	 *  - inputContentVersion: (string) Content version of the input document;
	 *      defaults to Parsoid::defaultHTMLVersion().
	 *  - outputContentVersion: (string) Content version to generate; defaults to
	 *      Parsoid::defaultHTMLVersion() and must be one of Parsoid::AVAILABLE_VERSIONS.
	 *  - htmlVariantLanguage: string|null
	 *      If non-null, the language variant used for Parsoid HTML;
	 *      we convert to this if wt2html, or from this if html2wt.
	 *  - wtVariantLanguage: string|null
	 *      If non-null, the language variant to be used for wikitext.
	 *      If null, heuristics will be used to identify the original
	 *      wikitext variant in wt2html mode, and in html2wt mode new
	 *      or edited HTML will be left unconverted.
	 *  - logLevels: (string[]) Levels to log
	 * @throws \UnexpectedValueException If the requested output content version
	 *   is not listed in Parsoid::AVAILABLE_VERSIONS.
	 */
	public function __construct(
		SiteConfig $siteConfig, PageConfig $pageConfig, DataAccess $dataAccess, ?array $options = null
	) {
		$options = $options ?? [];
		$this->siteConfig = $siteConfig;
		$this->pageConfig = $pageConfig;
		$this->dataAccess = $dataAccess;
		$this->topFrame = new PageConfigFrame( $this, $pageConfig, $siteConfig );
		// These three keep their property defaults unless the option is
		// explicitly provided (wrapSections defaults to true).
		if ( isset( $options['scrubWikitext'] ) ) {
			$this->scrubWikitext = !empty( $options['scrubWikitext'] );
		}
		if ( isset( $options['wrapSections'] ) ) {
			$this->wrapSections = !empty( $options['wrapSections'] );
		}
		if ( isset( $options['pageBundle'] ) ) {
			$this->pageBundle = !empty( $options['pageBundle'] );
		}
		$this->pipelineFactory = new ParserPipelineFactory( $this );
		$defaultContentVersion = Parsoid::defaultHTMLVersion();
		$this->inputContentVersion = $options['inputContentVersion'] ?? $defaultContentVersion;
		// FIXME: We should have a check for the supported input content versions as well.
		// That will require a separate constant.
		$this->outputContentVersion = $options['outputContentVersion'] ?? $defaultContentVersion;
		if ( !in_array( $this->outputContentVersion, Parsoid::AVAILABLE_VERSIONS, true ) ) {
			throw new \UnexpectedValueException(
				$this->outputContentVersion . ' is not an available content version.' );
		}
		$this->htmlVariantLanguage = $options['htmlVariantLanguage'] ?? null;
		$this->wtVariantLanguage = $options['wtVariantLanguage'] ?? null;
		$this->noDataAccess = !empty( $options['noDataAccess'] );
		$this->nativeTemplateExpansion = !empty( $options['nativeTemplateExpansion'] );
		$this->discardDataParsoid = !empty( $options['discardDataParsoid'] );
		$this->requestOffsetType = $options['offsetType'] ?? 'byte';
		$this->logLinterData = !empty( $options['logLinterData'] );
		$this->traceFlags = $options['traceFlags'] ?? [];
		$this->dumpFlags = $options['dumpFlags'] ?? [];
		$this->debugFlags = $options['debugFlags'] ?? [];
		$this->parsoidLogger = new ParsoidLogger( $this->siteConfig->getLogger(), [
			'logLevels' => $options['logLevels'] ?? [ 'fatal', 'error', 'warn', 'info' ],
			'debugFlags' => $this->debugFlags,
			'dumpFlags' => $this->dumpFlags,
			'traceFlags' => $this->traceFlags
		] );
	}

	/**
	 * Were any trace flags supplied?
	 * @return bool
	 */
	public function hasTraceFlags(): bool {
		return !empty( $this->traceFlags );
	}

	/**
	 * Test which trace information to log
	 *
	 * @param string $flag Flag name.
	 * @return bool
	 */
	public function hasTraceFlag( string $flag ): bool {
		return isset( $this->traceFlags[$flag] );
	}

	/**
	 * Were any dump flags supplied?
	 * @return bool
	 */
	public function hasDumpFlags(): bool {
		return !empty( $this->dumpFlags );
	}

	/**
	 * Test which state to dump
	 *
	 * @param string $flag Flag name.
	 * @return bool
	 */
	public function hasDumpFlag( string $flag ): bool {
		return isset( $this->dumpFlags[$flag] );
	}

	/**
	 * Get the site config
	 * @return SiteConfig
	 */
	public function getSiteConfig(): SiteConfig {
		return $this->siteConfig;
	}

	/**
	 * Get the page config
	 * @return PageConfig
	 */
	public function getPageConfig(): PageConfig {
		return $this->pageConfig;
	}

	/**
	 * Get the data access object
	 * @return DataAccess
	 */
	public function getDataAccess(): DataAccess {
		return $this->dataAccess;
	}

	public function noDataAccess(): bool {
		return $this->noDataAccess;
	}

	public function nativeTemplateExpansionEnabled(): bool {
		return $this->nativeTemplateExpansion;
	}

	/**
	 * Get the current uid counter value
	 * @return int
	 */
	public function getUID(): int {
		return $this->uid;
	}

	/**
	 * Get the current fragment id counter value
	 * @return int
	 */
	public function getFID(): int {
		return $this->fid;
	}

	/**
	 * Whether `<section>` wrappers should be added.
	 * @todo Does this actually belong here? Should it be a behavior switch?
	 * @return bool
	 */
	public function getWrapSections(): bool {
		return $this->wrapSections;
	}

	public function getPipelineFactory(): ParserPipelineFactory {
		return $this->pipelineFactory;
	}

	/**
	 * Return the external format of character offsets in source ranges.
	 * Internally we always keep DomSourceRange and SourceRange information
	 * as UTF-8 byte offsets for efficiency (matches the native string
	 * representation), but for external use we can convert these to
	 * other formats when we output wt2html or input for html2wt.
	 *
	 * @see Parsoid\Wt2Html\PP\Processors\ConvertOffsets
	 * @return string 'byte', 'ucs2', or 'char'
	 */
	public function getRequestOffsetType(): string {
		return $this->requestOffsetType;
	}

	/**
	 * Return the current format of character offsets in source ranges.
	 * This allows us to track whether the internal byte offsets have
	 * been converted to the external format (as returned by
	 * `getRequestOffsetType`) yet.
	 *
	 * @see Parsoid\Wt2Html\PP\Processors\ConvertOffsets
	 * @return string 'byte', 'ucs2', or 'char'
	 */
	public function getCurrentOffsetType(): string {
		return $this->currentOffsetType;
	}

	/**
	 * Update the current offset type. Only
	 * Parsoid\Wt2Html\PP\Processors\ConvertOffsets should be doing this.
	 * @param string $offsetType 'byte', 'ucs2', or 'char'
	 */
	public function setCurrentOffsetType( string $offsetType ) {
		$this->currentOffsetType = $offsetType;
	}

	/**
	 * Resolve strings that are page-fragments or subpage references with
	 * respect to the current page name.
	 *
	 * TODO: Handle namespaces relative links like [[User:../../]] correctly, they
	 * shouldn't be treated like links at all.
	 *
	 * @param string $str Page fragment or subpage reference. Not URL encoded.
	 * @param bool $resolveOnly If true, only trim and add the current title to
	 *  lone fragments. TODO: This parameter seems poorly named.
	 * @return string Resolved title
	 */
	public function resolveTitle( string $str, bool $resolveOnly = false ): string {
		$origName = $str;
		$str = trim( $str ); // PORT-FIXME: Care about non-ASCII whitespace?

		$pageConfig = $this->getPageConfig();

		// Resolve lonely fragments (important if the current page is a subpage,
		// otherwise the relative link will be wrong)
		if ( $str !== '' && $str[0] === '#' ) {
			$str = $pageConfig->getTitle() . $str;
		}

		// Default return value
		$titleKey = $str;
		if ( $this->getSiteConfig()->namespaceHasSubpages( $pageConfig->getNs() ) ) {
			// Resolve subpages
			$reNormalize = false;
			if ( preg_match( '!^(?:\.\./)+!', $str, $relUp ) ) {
				$levels = strlen( $relUp[0] ) / 3; // Levels are indicated by '../'.
				$titleBits = explode( '/', $pageConfig->getTitle() );
				if ( count( $titleBits ) <= $levels ) {
					// Too many levels -- invalid relative link
					return $origName;
				}
				$newBits = array_slice( $titleBits, 0, -$levels );
				if ( $str !== $relUp[0] ) {
					// Append whatever follows the leading run of '../'
					$newBits[] = substr( $str, $levels * 3 );
				}
				$str = implode( '/', $newBits );
				$reNormalize = true;
			} elseif ( $str !== '' && $str[0] === '/' ) {
				// Resolve absolute subpage links
				$str = $pageConfig->getTitle() . $str;
				$reNormalize = true;
			}

			if ( $reNormalize && !$resolveOnly ) {
				// Remove final slashes if present.
				// See https://gerrit.wikimedia.org/r/173431
				$str = rtrim( $str, '/' );
				$titleKey = (string)$this->normalizedTitleKey( $str );
			}
		}

		// Strip leading ':'
		if ( $titleKey !== '' && $titleKey[0] === ':' && !$resolveOnly ) {
			$titleKey = substr( $titleKey, 1 );
		}
		return $titleKey;
	}

	/**
	 * Convert a Title to a string
	 * @param Title $title
	 * @param bool $ignoreFragment
	 * @return string
	 */
	private function titleToString( Title $title, bool $ignoreFragment = false ): string {
		$ret = $title->getPrefixedDBKey();
		if ( !$ignoreFragment ) {
			$fragment = $title->getFragment() ?? '';
			if ( $fragment !== '' ) {
				$ret .= '#' . $fragment;
			}
		}
		return $ret;
	}

	/**
	 * Get normalized title key for a title string.
	 *
	 * @param string $str Should be in url-decoded format.
	 * @param bool $noExceptions Return null instead of throwing exceptions.
	 * @param bool $ignoreFragment Ignore the fragment, if any.
	 * @return string|null Normalized title key for a title string (or null for invalid titles).
	 */
	public function normalizedTitleKey(
		string $str, bool $noExceptions = false, bool $ignoreFragment = false
	): ?string {
		$title = $this->makeTitleFromURLDecodedStr( $str, 0, $noExceptions );
		if ( !$title ) {
			return null;
		}
		return $this->titleToString( $title, $ignoreFragment );
	}

	/**
	 * Normalize and resolve the page title
	 * @deprecated Just use $this->getPageConfig()->getTitle() directly
	 * @return string
	 */
	public function normalizeAndResolvePageTitle(): string {
		return $this->getPageConfig()->getTitle();
	}

	/**
	 * Create a Title object
	 * @param string $text URL-decoded text
	 * @param int|TitleNamespace $defaultNs
	 * @param bool $noExceptions
	 * @return Title|null
	 */
	private function makeTitle( string $text, $defaultNs = 0, bool $noExceptions = false ): ?Title {
		try {
			// Fragment-only, subpage ('/...') and relative ('../...') references
			// default to the namespace of the current page.
			if ( preg_match( '!^(?:[#/]|\.\./)!', $text ) ) {
				$defaultNs = $this->getPageConfig()->getNs();
			}
			$text = $this->resolveTitle( $text );
			return Title::newFromText( $text, $this->getSiteConfig(), $defaultNs );
		} catch ( TitleException $e ) {
			if ( $noExceptions ) {
				return null;
			}
			throw $e;
		}
	}

	/**
	 * Create a Title object
	 * @see Title::newFromURL in MediaWiki
	 * @param string $str URL-encoded text
	 * @param int|TitleNamespace $defaultNs
	 * @param bool $noExceptions
	 * @return Title|null
	 */
	public function makeTitleFromText(
		string $str, $defaultNs = 0, bool $noExceptions = false
	): ?Title {
		return $this->makeTitle( Utils::decodeURIComponent( $str ), $defaultNs, $noExceptions );
	}

	/**
	 * Create a Title object
	 * @see Title::newFromText in MediaWiki
	 * @param string $str URL-decoded text
	 * @param int|TitleNamespace $defaultNs
	 * @param bool $noExceptions
	 * @return Title|null
	 */
	public function makeTitleFromURLDecodedStr(
		string $str, $defaultNs = 0, bool $noExceptions = false
	): ?Title {
		return $this->makeTitle( $str, $defaultNs, $noExceptions );
	}

	/**
	 * Make a link to a Title
	 * @param Title $title
	 * @return string
	 */
	public function makeLink( Title $title ): string {
		return Sanitizer::sanitizeTitleURI(
			$this->getSiteConfig()->relativeLinkPrefix() . $this->titleToString( $title ),
			false
		);
	}

	/**
	 * Test if an href attribute value could be a valid link target
	 * @param string|(Token|string)[] $href
	 * @return bool
	 */
	public function isValidLinkTarget( $href ): bool {
		$href = TokenUtils::tokensToString( $href );

		// decode percent-encoding so that we can reliably detect
		// bad page title characters
		$hrefToken = Utils::decodeURIComponent( $href );
		return $this->normalizedTitleKey( $this->resolveTitle( $hrefToken, true ), true ) !== null;
	}

	/**
	 * Generate a new uid
	 * @return int
	 */
	public function generateUID(): int {
		return $this->uid++;
	}

	/**
	 * Generate a new object id
	 * @return string
	 */
	public function newObjectId(): string {
		return "mwt" . $this->generateUID();
	}

	/**
	 * Generate a new about id
	 * @return string
	 */
	public function newAboutId(): string {
		return "#" . $this->newObjectId();
	}

	/**
	 * Store reference to original DOM (body)
	 * @param DOMElement $domBody
	 */
	public function setOrigDOM( DOMElement $domBody ): void {
		$this->origDOM = $domBody;
	}

	/**
	 * Return reference to original DOM (body).
	 *
	 * The return type is non-nullable, so this must only be called after
	 * setOrigDOM() has stored a body.
	 *
	 * @return DOMElement
	 */
	public function getOrigDOM(): DOMElement {
		return $this->origDOM;
	}

	/**
	 * Store reference to DOM diff document
	 * @param DOMDocument $doc
	 */
	public function setDOMDiff( $doc ): void {
		$this->domDiff = $doc;
	}

	/**
	 * Return reference to DOM diff document
	 * @return DOMDocument|null
	 */
	public function getDOMDiff(): ?DOMDocument {
		return $this->domDiff;
	}

	/**
	 * Generate a new fragment id
	 * @return string
	 */
	public function newFragmentId(): string {
		return "mwf" . (string)$this->fid++;
	}

	/**
	 * FIXME: This function could be given a better name to reflect what it does.
	 *
	 * Attaches a DataBag to the document and keeps a reference to the document
	 * on this Env.
	 *
	 * @param DOMDocument $doc
	 * @param DataBag|null $bag Bag to attach; a fresh DataBag is created if null.
	 */
	public function referenceDataObject( DOMDocument $doc, ?DataBag $bag = null ): void {
		// `bag` is a deliberate dynamic property; see DOMDataUtils::getBag()
		// @phan-suppress-next-line PhanUndeclaredProperty dynamic property
		$doc->bag = $bag ?? new DataBag();

		// Prevent GC from collecting the PHP wrapper around the libxml doc
		$this->liveDocs[] = $doc;
	}

	/**
	 * Parse HTML into a new document that is registered with this Env
	 * (see referenceDataObject()).
	 *
	 * @param string $html
	 * @param bool $validateXMLNames
	 * @return DOMDocument
	 */
	public function createDocument(
		string $html = '', bool $validateXMLNames = false
	): DOMDocument {
		$doc = DOMUtils::parseHTML( $html, $validateXMLNames );
		// Cache the head and body.
		DOMCompat::getHead( $doc );
		DOMCompat::getBody( $doc );
		$this->referenceDataObject( $doc );
		return $doc;
	}

	/**
	 * BehaviorSwitchHandler support function that adds a property named by
	 * $variable and sets it to $state
	 *
	 * @deprecated Use setBehaviorSwitch() instead.
	 * @param string $variable
	 * @param mixed $state
	 */
	public function setVariable( string $variable, $state ): void {
		$this->setBehaviorSwitch( $variable, $state );
	}

	/**
	 * Record a behavior switch.
	 *
	 * @todo Does this belong here, or on some equivalent to MediaWiki's ParserOutput?
	 * @param string $switch Switch name
	 * @param mixed $state Relevant state data to record
	 */
	public function setBehaviorSwitch( string $switch, $state ): void {
		$this->behaviorSwitches[$switch] = $state;
	}

	/**
	 * Fetch the state of a previously-recorded behavior switch.
	 *
	 * @todo Does this belong here, or on some equivalent to MediaWiki's ParserOutput?
	 * @param string $switch Switch name
	 * @param mixed|null $default Default value if the switch was never set
	 * @return mixed State data that was previously passed to setBehaviorSwitch(), or $default
	 */
	public function getBehaviorSwitch( string $switch, $default = null ) {
		return $this->behaviorSwitches[$switch] ?? $default;
	}

	/**
	 * @return array<string,DOMNode[]>
	 */
	public function getDOMFragmentMap(): array {
		return $this->fragmentMap;
	}

	/**
	 * Look up the forest stored for a fragment id.
	 *
	 * There is no fallback for an unknown id: the lookup indexes the map
	 * directly, so $id must have been stored with setDOMFragment() first.
	 *
	 * @param string $id Fragment id
	 * @return DOMNode[]
	 */
	public function getDOMFragment( string $id ): array {
		return $this->fragmentMap[$id];
	}

	/**
	 * @param string $id Fragment id
	 * @param DOMNode[] $forest DOM forest (contiguous array of DOM trees)
	 *   to store against the fragment id
	 */
	public function setDOMFragment( string $id, array $forest ): void {
		$this->fragmentMap[$id] = $forest;
	}

	/**
	 * Record a lint
	 * @param string $type Lint type key
	 * @param array $lintData Data for the lint.
	 *  - dsr: (SourceRange)
	 *  - params: (array)
	 *  - templateInfo: (array|null)
	 */
	public function recordLint( string $type, array $lintData ): void {
		// Parsoid-JS tests don't like getting null properties where JS had undefined.
		$lintData = array_filter( $lintData, function ( $v ) {
			return $v !== null;
		} );

		// A lint without a DSR is logged as an error and not recorded.
		if ( empty( $lintData['dsr'] ) ) {
			$this->log( 'error/lint', "Missing DSR; msg=", $lintData );
			return;
		}

		// This will always be recorded as a native 'byte' offset
		$lintData['dsr'] = $lintData['dsr']->jsonSerialize();

		// Ensure a "params" array
		if ( !isset( $lintData['params'] ) ) {
			$lintData['params'] = [];
		}

		$this->lints[] = [ 'type' => $type ] + $lintData;
	}

	/**
	 * Retrieve recorded lints
	 * @return array[]
	 */
	public function getLints(): array {
		return $this->lints;
	}

	/**
	 * Init lints to the passed array.
	 *
	 * FIXME: This is currently needed to reset lints after converting
	 * DSR offsets because of ordering of DOM passes. So, in reality,
	 * there should be no real use case for setting this anywhere else
	 * but from that single callsite.
	 *
	 * @param array $lints
	 */
	public function setLints( array $lints ): void {
		$this->lints = $lints;
	}

	/**
	 * Forward a log call to the ParsoidLogger.
	 * @param mixed ...$args
	 */
	public function log( ...$args ): void {
		$this->parsoidLogger->log( ...$args );
	}

	/**
	 * Update a profile timer.
	 *
	 * Currently a no-op: profiling has not been ported yet, and throwing
	 * here (as bumpCount() does) would break `--trace ttm:*`.
	 *
	 * @param string $resource
	 * @param mixed $time
	 * @param mixed $cat
	 */
	public function bumpTimeUse( string $resource, $time, $cat ): void {
		// --trace ttm:* trip on this if we throw an exception
		// throw new \BadMethodCallException( 'not yet ported' );
	}

	/**
	 * Update a profile counter.
	 *
	 * @param string $resource
	 * @param int $n The amount to increment the counter; defaults to 1.
	 * @throws \BadMethodCallException Always; this has not been ported yet.
	 */
	public function bumpCount( string $resource, int $n = 1 ): void {
		throw new \BadMethodCallException( 'not yet ported' );
	}

	/**
	 * Bump usage of some limited parser resource
	 * (ex: tokens, # transclusions, # list items, etc.)
	 *
	 * @param string $resource
	 * @param int $count How much of the resource is used?
	 * @throws ResourceLimitExceededException
	 */
	public function bumpWt2HtmlResourceUse( string $resource, int $count = 1 ): void {
		$n = $this->wt2htmlUsage[$resource] ?? 0;
		$n += $count;
		$this->wt2htmlUsage[$resource] = $n;
		$wt2htmlLimits = $this->siteConfig->getWt2HtmlLimits();
		// Resources without a configured limit are tracked but never rejected.
		if (
			isset( $wt2htmlLimits[$resource] ) &&
			$n > $wt2htmlLimits[$resource]
		) {
			// TODO: re-evaluate whether throwing an exception is really
			// the right failure strategy when Parsoid is integrated into MW
			// (T221238)
			throw new ResourceLimitExceededException( "wt2html: $resource limit exceeded: $n" );
		}
	}

	/**
	 * Bump usage of some limited serializer resource
	 * (ex: html size)
	 *
	 * @param string $resource
	 * @param int $count How much of the resource is used? (defaults to 1)
	 * @throws ResourceLimitExceededException
	 */
	public function bumpHtml2WtResourceUse( string $resource, int $count = 1 ): void {
		$n = $this->html2wtUsage[$resource] ?? 0;
		$n += $count;
		$this->html2wtUsage[$resource] = $n;
		$html2wtLimits = $this->siteConfig->getHtml2WtLimits();
		// Resources without a configured limit are tracked but never rejected.
		if (
			isset( $html2wtLimits[$resource] ) &&
			$n > $html2wtLimits[$resource]
		) {
			throw new ResourceLimitExceededException( "html2wt: $resource limit exceeded: $n" );
		}
	}

	/**
	 * Get an appropriate content handler, given a contentmodel.
	 *
	 * If no handler is registered for the model, a warning is logged and the
	 * 'wikitext' handler is used instead.
	 *
	 * @param string|null &$contentmodel An optional content model which
	 *   will override whatever the source specifies.  It gets set to the
	 *   handler which is used.
	 * @return ContentModelHandler An appropriate content handler
	 */
	public function getContentHandler(
		?string &$contentmodel = null
	): ContentModelHandler {
		$contentmodel = $contentmodel ?? $this->pageConfig->getContentModel();
		$handler = $this->siteConfig->getContentModelHandler( $contentmodel );
		if ( !$handler ) {
			$this->log( 'warn', "Unknown contentmodel $contentmodel" );
			$contentmodel = 'wikitext';
			$handler = $this->siteConfig->getContentModelHandler( $contentmodel );
		}
		return $handler;
	}

	/**
	 * Is the language converter enabled on this page?
	 *
	 * @return bool
	 */
	public function langConverterEnabled(): bool {
		return $this->siteConfig->langConverterEnabledForLanguage(
			$this->pageConfig->getPageLanguage()
		);
	}

	/**
	 * Indicates emit "clean" wikitext compared to what we would if we didn't normalize HTML
	 * @return bool
	 */
	public function shouldScrubWikitext(): bool {
		return $this->scrubWikitext;
	}

	/**
	 * The HTML content version of the input document (for html2wt and html2html conversions).
	 * @see https://www.mediawiki.org/wiki/Parsoid/API#Content_Negotiation
	 * @see https://www.mediawiki.org/wiki/Specs/HTML/2.1.0#Versioning
	 * @return string A semver version number
	 */
	public function getInputContentVersion(): string {
		return $this->inputContentVersion;
	}

	/**
	 * The HTML content version of the output document, i.e. the version
	 * that Parsoid will generate.
	 * @see https://www.mediawiki.org/wiki/Parsoid/API#Content_Negotiation
	 * @see https://www.mediawiki.org/wiki/Specs/HTML/2.1.0#Versioning
	 * @return string A semver version number
	 */
	public function getOutputContentVersion(): string {
		return $this->outputContentVersion;
	}

	/**
	 * If non-null, the language variant used for Parsoid HTML; we convert
	 * to this if wt2html, or from this (if html2wt).
	 *
	 * @return string|null
	 */
	public function getHtmlVariantLanguage(): ?string {
		return $this->htmlVariantLanguage;
	}

	/**
	 * If non-null, the language variant to be used for wikitext.  If null,
	 * heuristics will be used to identify the original wikitext variant
	 * in wt2html mode, and in html2wt mode new or edited HTML will be left
	 * unconverted.
	 *
	 * @return string|null
	 */
	public function getWtVariantLanguage(): ?string {
		return $this->wtVariantLanguage;
	}

	/**
	 * Update K=[V1,V2,...] that might need to be output as part of the
	 * generated HTML.  Ex: module styles, modules scripts, ...
	 *
	 * @param string $key
	 * @param array $value Values to append to those already recorded for $key
	 */
	public function addOutputProperty( string $key, array $value ): void {
		if ( !isset( $this->outputProps[$key] ) ) {
			$this->outputProps[$key] = [];
		}
		$this->outputProps[$key] = array_merge( $this->outputProps[$key], $value );
	}

	/**
	 * @return array
	 */
	public function getOutputProperties(): array {
		return $this->outputProps;
	}

	/**
	 * Determine appropriate vary headers for the HTML form of this page.
	 * @return string
	 */
	public function htmlVary(): string {
		$varies = [ 'Accept' ]; // varies on Content-Type
		if ( $this->langConverterEnabled() ) {
			$varies[] = 'Accept-Language';
		}

		sort( $varies );
		return implode( ', ', $varies );
	}

	/**
	 * Determine an appropriate content-language for the HTML form of this page.
	 * @return string
	 */
	public function htmlContentLanguage(): string {
		// PageConfig::getVariant() is set iff we do variant conversion on the
		// HTML; otherwise fall back to the page language.
		return $this->pageConfig->getVariant() ??
			$this->pageConfig->getPageLanguage();
	}
}