<?php
declare( strict_types = 1 );

namespace Wikimedia\Parsoid\Utils;

use DOMDocument;
use DOMElement;
use DOMNode;
use Wikimedia\Assert\Assert;
use Wikimedia\Parsoid\Config\Env;
use Wikimedia\Parsoid\Core\DomSourceRange;
use Wikimedia\Parsoid\Wt2Html\XMLSerializer;

/**
 * These utilities are for processing content that's generated
 * by parsing source input (ex: wikitext)
 */
class ContentUtils {
	/**
	 * XML Serializer.
	 *
	 * Serializes $node with XMLSerializer::serialize() and returns the
	 * 'html' entry of its result.
	 *
	 * @param DOMNode $node
	 * @param array $options XMLSerializer options.
	 * @return string
	 */
	public static function toXML( DOMNode $node, array $options = [] ): string {
		return XMLSerializer::serialize( $node, $options )['html'];
	}

	/**
	 * dataobject aware XML serializer, to be used in the DOM post-processing phase.
	 *
	 * Stores the node data back into attributes (via
	 * DOMDataUtils::visitAndStoreDataAttribs) and then serializes with toXML().
	 * The same $options array is handed to both steps.
	 *
	 * @param DOMNode $node
	 * @param array $options
	 * @return string
	 */
	public static function ppToXML( DOMNode $node, array $options = [] ): string {
		// We really only want to pass along `$options['keepTmp']`
		DOMDataUtils::visitAndStoreDataAttribs( $node, $options );
		return self::toXML( $node, $options );
	}

	/**
	 * .dataobject aware HTML parser, to be used in the DOM
	 * post-processing phase.
	 *
	 * Options read here:
	 * - 'node': element whose inner HTML is replaced with $html; when null,
	 *   the body of a freshly created document is used instead.
	 * - 'reinsertFosterableContent': when truthy, fostered content is
	 *   reinserted on each visited node before its data attribs are loaded.
	 * The whole $options array is also forwarded to the DOM visitor.
	 *
	 * @param Env $env
	 * @param string $html
	 * @param array $options
	 * @return DOMElement
	 */
	public static function ppToDOM( Env $env, string $html, array $options = [] ): DOMElement {
		$options += [
			'node' => null,
			'reinsertFosterableContent' => null,
		];
		$node = $options['node'];
		if ( $node === null ) {
			$node = DOMCompat::getBody( $env->createDocument( $html ) );
		} else {
			DOMUtils::assertElt( $node );
			DOMCompat::setInnerHTML( $node, $html );
		}

		if ( $options['reinsertFosterableContent'] ) {
			DOMUtils::visitDOM( $node, function ( $n, ...$args ) use ( $env ) {
				// untunnel fostered content
				$meta = WTUtils::reinsertFosterableContent( $env, $n, true );
				$n = $meta ?? $n;

				// load data attribs
				DOMDataUtils::loadDataAttribs( $n, ...$args );
			}, $options );
		} else {
			DOMDataUtils::visitAndLoadDataAttribs( $node, $options );
		}
		return $node;
	}

	/**
	 * Pull the data-parsoid script element out of the doc before serializing.
	 *
	 * @param DOMNode $node
	 * @param array $options XMLSerializer options.
	 * @return array The XMLSerializer::serialize() result with an extra 'pb'
	 *   entry holding the extracted page bundle.
	 */
	public static function extractDpAndSerialize( DOMNode $node, array $options = [] ): array {
		// The page bundle lives on the document; a body node is mapped to its owner.
		$doc = DOMUtils::isBody( $node ) ? $node->ownerDocument : $node;
		$pb = DOMDataUtils::extractPageBundle( $doc );
		$out = XMLSerializer::serialize( $node, $options );
		$out['pb'] = $pb;
		return $out;
	}

	/**
	 * Strip Parsoid-inserted section wrappers and fallback id spans with
	 * HTML4 ids for headings from the DOM.
	 *
	 * @param DOMElement $node
	 */
	public static function stripSectionTagsAndFallbackIds( DOMElement $node ): void {
		$n = $node->firstChild;
		while ( $n ) {
			// Grab the sibling first: $n may be detached below.
			$next = $n->nextSibling;
			if ( $n instanceof DOMElement ) {
				// Recurse into subtree before stripping this
				self::stripSectionTagsAndFallbackIds( $n );

				if ( WTUtils::isParsoidSectionTag( $n ) ) {
					// Strip <section> tags
					DOMUtils::migrateChildren( $n, $n->parentNode, $n );
					$n->parentNode->removeChild( $n );
				} elseif ( WTUtils::isFallbackIdSpan( $n ) ) {
					// Strip <span typeof='mw:FallbackId' ...></span>
					// (elseif: a node removed above is detached, so its
					// parentNode is null and must not be dereferenced again)
					$n->parentNode->removeChild( $n );
				}
			}
			$n = $next;
		}
	}

	/**
	 * Copy node data from $node's subtree onto the matching nodes of $clone.
	 *
	 * Both trees are walked in lockstep via firstChild/nextSibling, so $clone
	 * is expected to mirror $node structurally (dumpDOM() passes the result
	 * of cloneNode( true )). Non-element nodes are skipped.
	 *
	 * @param DOMNode $node
	 * @param DOMNode $clone
	 * @param array $options Only passed along to the recursive calls.
	 */
	private static function cloneData(
		DOMNode $node, DOMNode $clone, array $options
	): void {
		if ( !( $node instanceof DOMElement ) ) {
			return;
		}
		DOMUtils::assertElt( $clone );

		$d = DOMDataUtils::getNodeData( $node );
		DOMDataUtils::setNodeData( $clone, Utils::clone( $d ) );
		$node = $node->firstChild;
		$clone = $clone->firstChild;
		while ( $node ) {
			self::cloneData( $node, $clone, $options );
			$node = $node->nextSibling;
			$clone = $clone->nextSibling;
		}
	}

	/**
	 * Write the lines in $buf to the configured destination.
	 *
	 * The lines are joined with "\n" and terminated with "\n". The text is
	 * appended to $opts['outBuffer'] if set, else written to the
	 * $opts['outStream'] resource (with one more trailing "\n") if set,
	 * else sent to error_log().
	 *
	 * @param array $buf
	 * @param array &$opts
	 */
	private static function emit( array $buf, array &$opts ): void {
		$str = implode( "\n", $buf ) . "\n";
		if ( isset( $opts['outBuffer'] ) ) {
			$opts['outBuffer'] .= $str;
		} elseif ( isset( $opts['outStream'] ) ) {
			fwrite( $opts['outStream'], $str . "\n" );
		} else {
			error_log( $str );
		}
	}

	/**
	 * Shift the DSR of a DOM fragment.
	 *
	 * $dsrFunc is applied to a clone of every dsr, tmp->origDSR and
	 * extTagOffsets found in data-parsoid; its return value replaces the
	 * stored value. HTML strings embedded in data-mw-variant, in data-mw
	 * (attribs, caption, body.html) and in cached DOM fragments are
	 * processed as well.
	 *
	 * @param Env $env
	 * @param DOMNode $rootNode
	 * @param callable $dsrFunc
	 * @return DOMNode Returns the $rootNode passed in to allow chaining.
	 */
	public static function shiftDSR( Env $env, DOMNode $rootNode, callable $dsrFunc ): DOMNode {
		// ownerDocument is null on a DOMDocument itself; fall back to the node
		// so that $convertString can still create its scratch element.
		$doc = $rootNode instanceof DOMDocument ? $rootNode : $rootNode->ownerDocument;
		$convertString = function ( $str ) {
			// Stub $convertString out to allow definition of a pair of
			// mutually-recursive functions.
			return $str;
		};
		$convertNode = function ( DOMNode $node ) use (
			$env, $dsrFunc, &$convertString, &$convertNode
		) {
			if ( !( $node instanceof DOMElement ) ) {
				return;
			}
			$dp = DOMDataUtils::getDataParsoid( $node );
			if ( ( $dp->dsr ?? null ) !== null ) {
				$dp->dsr = $dsrFunc( clone $dp->dsr );
				// We don't need to setDataParsoid because dp is not a copy
			}
			if ( ( $dp->tmp->origDSR ?? null ) !== null ) {
				// Even though tmp shouldn't escape Parsoid, go ahead and
				// convert to enable hybrid testing.
				$dp->tmp->origDSR = $dsrFunc( clone $dp->tmp->origDSR );
			}
			if ( ( $dp->extTagOffsets ?? null ) !== null ) {
				$dp->extTagOffsets = $dsrFunc( clone $dp->extTagOffsets );
			}

			// Handle embedded HTML in Language Variant markup
			$dmwv = DOMDataUtils::getJSONAttribute( $node, 'data-mw-variant', null );
			if ( $dmwv ) {
				if ( isset( $dmwv->disabled ) ) {
					$dmwv->disabled->t = $convertString( $dmwv->disabled->t );
				}
				if ( isset( $dmwv->twoway ) ) {
					foreach ( $dmwv->twoway as $l ) {
						$l->t = $convertString( $l->t );
					}
				}
				if ( isset( $dmwv->oneway ) ) {
					foreach ( $dmwv->oneway as $l ) {
						$l->f = $convertString( $l->f );
						$l->t = $convertString( $l->t );
					}
				}
				if ( isset( $dmwv->filter ) ) {
					$dmwv->filter->t = $convertString( $dmwv->filter->t );
				}
				DOMDataUtils::setJSONAttribute( $node, 'data-mw-variant', $dmwv );
			}

			if ( DOMUtils::matchTypeOf( $node, '#^mw:(ExpandedAttrs|Image|Extension)\b#D' ) ) {
				$dmw = DOMDataUtils::getDataMw( $node );
				// Handle embedded HTML in template-affected attributes
				if ( $dmw->attribs ?? null ) {
					foreach ( $dmw->attribs as &$a ) {
						foreach ( $a as $kOrV ) {
							if ( gettype( $kOrV ) !== 'string' && isset( $kOrV->html ) ) {
								$kOrV->html = $convertString( $kOrV->html );
							}
						}
					}
					// Break the reference left behind by the by-reference foreach.
					unset( $a );
				}
				// Handle embedded HTML in figure-inline captions
				if ( $dmw->caption ?? null ) {
					$dmw->caption = $convertString( $dmw->caption );
				}
				// FIXME: Cite-specific handling here maybe?
				if ( $dmw->body->html ?? null ) {
					$dmw->body->html = $convertString( $dmw->body->html );
				}
				DOMDataUtils::setDataMw( $node, $dmw );
			}

			if ( DOMUtils::matchTypeOf( $node, '#^mw:DOMFragment(/|$)#D' ) ) {
				$dp = DOMDataUtils::getDataParsoid( $node );
				if ( $dp->html ?? null ) {
					// Walk the cached fragment nodes registered under this key.
					$nodes = $env->getDOMFragment( $dp->html );
					foreach ( $nodes as $n ) {
						DOMPostOrder::traverse( $n, $convertNode );
					}
				}
			}
		};
		// Real implementation: parse the HTML string into a scratch <body>,
		// convert every node in it, and serialize the children back out.
		$convertString = function ( string $str ) use ( $doc, $env, $convertNode ): string {
			$parentNode = $doc->createElement( 'body' );
			$node = self::ppToDOM( $env, $str, [ 'node' => $parentNode ] );
			DOMPostOrder::traverse( $node, $convertNode );
			return self::ppToXML( $node, [ 'innerXML' => true ] );
		};
		DOMPostOrder::traverse( $rootNode, $convertNode );
		return $rootNode; // chainable
	}

	/**
	 * Convert DSR offsets in a Document between utf-8/ucs2/codepoint
	 * indices.
	 *
	 * Offset types are:
	 *  - 'byte': Bytes (UTF-8 encoding), e.g. PHP `substr()` or `strlen()`.
	 *  - 'char': Unicode code points (encoding irrelevant), e.g. PHP `mb_substr()` or `mb_strlen()`.
	 *  - 'ucs2': 16-bit code units (UTF-16 encoding), e.g. JavaScript `.substring()` or `.length`.
	 *
	 * The conversion runs in two passes over the document body: the first
	 * collects every distinct offset, the second rewrites each DSR from the
	 * converted values.
	 *
	 * @see TokenUtils::convertTokenOffsets for a related function on tokens.
	 *
	 * @param Env $env
	 * @param DOMDocument $doc The document to convert
	 * @param string $from Offset type to convert from.
	 * @param string $to Offset type to convert to.
	 */
	public static function convertOffsets(
		Env $env,
		DOMDocument $doc,
		string $from,
		string $to
	): void {
		$env->setCurrentOffsetType( $to );
		if ( $from === $to ) {
			return; // Hey, that was easy!
		}
		// $offsetMap: original offset => box object holding its (to be converted) value.
		// $offsets: list of references to those boxed values, converted in place below.
		$offsetMap = [];
		$offsets = [];
		$collect = function ( int $n ) use ( &$offsetMap, &$offsets ) {
			if ( !array_key_exists( $n, $offsetMap ) ) {
				$box = PHPUtils::arrayToObject( [ 'value' => $n ] );
				$offsetMap[$n] = $box;
				$offsets[] =& $box->value;
			}
		};
		// Collect DSR offsets throughout the document
		$collectDSR = function ( DomSourceRange $dsr ) use ( $collect ) {
			if ( $dsr->start !== null ) {
				$collect( $dsr->start );
				$collect( $dsr->innerStart() );
			}
			if ( $dsr->end !== null ) {
				$collect( $dsr->innerEnd() );
				$collect( $dsr->end );
			}
			return $dsr;
		};
		$body = DOMCompat::getBody( $doc );
		self::shiftDSR( $env, $body, $collectDSR );
		if ( count( $offsets ) === 0 ) {
			return; /* nothing to do (shouldn't really happen) */
		}
		// Now convert these offsets
		TokenUtils::convertOffsets(
			$env->topFrame->getSrcText(), $from, $to, $offsets
		);
		// Apply converted offsets
		$applyDSR = function ( DomSourceRange $dsr ) use ( $offsetMap ) {
			$start = $dsr->start;
			$openWidth = $dsr->openWidth;
			if ( $start !== null ) {
				$start = $offsetMap[$start]->value;
				// Widths are recomputed from the converted inner/outer offsets.
				$openWidth = $offsetMap[$dsr->innerStart()]->value - $start;
			}
			$end = $dsr->end;
			$closeWidth = $dsr->closeWidth;
			if ( $end !== null ) {
				$end = $offsetMap[$end]->value;
				$closeWidth = $end - $offsetMap[$dsr->innerEnd()]->value;
			}
			return new DomSourceRange(
				$start, $end, $openWidth, $closeWidth
			);
		};
		self::shiftDSR( $env, $body, $applyDSR );
	}

	/**
	 * Dump the DOM with attributes.
	 *
	 * Options read here:
	 * - 'quiet': when non-empty, the title header and closing rule are omitted.
	 * - 'dumpFragmentMap': when non-empty, every fragment returned by
	 *   $options['env']->getDOMFragmentMap() is dumped as well.
	 * - 'storeDiffMark' / 'dumpFragmentMap': either one requires 'env' to be set.
	 * - 'outBuffer' / 'outStream': output destination (see emit()).
	 * The whole array is also passed on to ppToXML().
	 *
	 * @param DOMNode $rootNode
	 * @param string $title
	 * @param array &$options
	 */
	public static function dumpDOM(
		DOMNode $rootNode, string $title, array &$options = []
	): void {
		if ( !empty( $options['storeDiffMark'] ) || !empty( $options['dumpFragmentMap'] ) ) {
			Assert::invariant( isset( $options['env'] ), "env should be set" );
		}

		if ( $rootNode instanceof DOMElement ) {
			// cloneNode doesn't clone data => walk DOM to clone it
			$clonedRoot = $rootNode->cloneNode( true );
			self::cloneData( $rootNode, $clonedRoot, $options );
		} else {
			$clonedRoot = $rootNode;
		}

		$buf = [];
		if ( empty( $options['quiet'] ) ) {
			$buf[] = '----- ' . $title . ' -----';
		}

		$buf[] = self::ppToXML( $clonedRoot, $options );
		self::emit( $buf, $options );

		// Dump cached fragments
		if ( !empty( $options['dumpFragmentMap'] ) ) {
			foreach ( $options['env']->getDOMFragmentMap() as $k => $fragment ) {
				$buf = [];
				$buf[] = str_repeat( '=', 15 );
				$buf[] = 'FRAGMENT ' . $k;
				$buf[] = '';
				self::emit( $buf, $options );

				$newOpts = $options;
				$newOpts['dumpFragmentMap'] = false;
				$newOpts['quiet'] = true;
				self::dumpDOM( is_array( $fragment ) ? $fragment[0] : $fragment, '', $newOpts );

				// $newOpts is a by-value copy, so text the nested call appended
				// to its 'outBuffer' would otherwise be dropped. The copy started
				// from the current buffer and was only appended to, so it can
				// simply replace ours.
				if ( isset( $newOpts['outBuffer'] ) ) {
					$options['outBuffer'] = $newOpts['outBuffer'];
				}
			}
		}

		if ( empty( $options['quiet'] ) ) {
			// 12 = length of the '----- ' and ' -----' decorations around the title
			self::emit( [ str_repeat( '-', mb_strlen( $title ) + 12 ) ], $options );
		}
	}

}