<?php
declare( strict_types = 1 );

namespace Wikimedia\Parsoid\Html2Wt\DOMHandlers;

use DOMDocumentFragment;
use DOMElement;
use DOMNode;
use LogicException;
use Wikimedia\Parsoid\Html2Wt\SerializerState;
use Wikimedia\Parsoid\Html2Wt\WTSUtils;
use Wikimedia\Parsoid\Utils\DOMCompat;
use Wikimedia\Parsoid\Utils\DOMDataUtils;
use Wikimedia\Parsoid\Utils\DOMUtils;
use Wikimedia\Parsoid\Utils\WTUtils;

/**
 * HTML -> Wikitext serialization relies on walking the DOM and delegating
 * the serialization requests to different DOM nodes.
 *
 * This class represents the interface that various DOM handlers are expected
 * to implement.
 *
 * There is the core 'handle' method that deals with converting the content
 * of the node into wikitext markup.
 *
 * Then there are 4 newline-constraint methods that specify the constraints
 * that need to be satisfied for the markup to be valid. For example, list items
 * should always start on a newline, but can only have a single newline separator.
 * Paragraphs always start on a newline and need at least 2 newlines in wikitext
 * for them to be recognized as paragraphs.
 *
 * Each of the 4 newline-constraint methods (before, after, firstChild, lastChild)
 * returns an array with a 'min' and 'max' property. If a property is missing, it
 * means that the dom node doesn't have any newline constraints. Some DOM handlers
 * might therefore choose to implement none, some, or all of these methods.
 *
 * The return values of each of these methods are treated as constraints and the
 * caller will have to resolve potentially conflicting constraints between a
 * pair of nodes (siblings, parent-child). For example, if an after handler of
 * a node wants 1 newline, but the before handler of its sibling wants none.
 *
 * Ideally, there should not be any incompatible constraints, but we haven't
 * actually verified that this is the case. All constraint-handling code is in
 * the separators-handling methods.
 */
class DOMHandler {

	/** @var bool */
	private $forceSOL;

	/**
	 * @param bool $forceSOL Value later reported by isForceSOL().
	 */
	public function __construct( bool $forceSOL = false ) {
		$this->forceSOL = $forceSOL;
	}

	/**
	 * Serialize a DOM node to wikitext.
	 * Serialized wikitext should be returned via $state::emitChunk().
	 *
	 * This base implementation always throws; concrete handlers are expected
	 * to override it.
	 *
	 * @param DOMElement $node
	 * @param SerializerState $state
	 * @param bool $wrapperUnmodified
	 * @return DOMNode|null The node to continue with (need not be an element always)
	 * @throws LogicException Always, in this base class.
	 */
	public function handle(
		DOMElement $node, SerializerState $state, bool $wrapperUnmodified = false
	): ?DOMNode {
		throw new LogicException( 'Not implemented.' );
	}

	/**
	 * How many newlines should be emitted *before* this node?
	 *
	 * @param DOMElement $node
	 * @param DOMNode $otherNode
	 * @param SerializerState $state
	 * @return array Empty here, i.e. no constraints by default.
	 */
	public function before( DOMElement $node, DOMNode $otherNode, SerializerState $state ): array {
		return [];
	}

	/**
	 * How many newlines should be emitted *after* this node?
	 *
	 * @param DOMElement $node
	 * @param DOMNode $otherNode
	 * @param SerializerState $state
	 * @return array Empty here, i.e. no constraints by default.
	 */
	public function after( DOMElement $node, DOMNode $otherNode, SerializerState $state ): array {
		return [];
	}

	/**
	 * How many newlines should be emitted before the first child?
	 *
	 * @param DOMElement|DOMDocumentFragment $node
	 * @param DOMNode $otherNode
	 * @param SerializerState $state
	 * @return array Empty here, i.e. no constraints by default.
	 */
	public function firstChild( DOMNode $node, DOMNode $otherNode, SerializerState $state ): array {
		return [];
	}

	/**
	 * How many newlines should be emitted after the last child?
	 *
	 * @param DOMElement|DOMDocumentFragment $node
	 * @param DOMNode $otherNode
	 * @param SerializerState $state
	 * @return array Empty here, i.e. no constraints by default.
	 */
	public function lastChild( DOMNode $node, DOMNode $otherNode, SerializerState $state ): array {
		return [];
	}

	/**
	 * Put the serializer in start-of-line mode before it is handled.
	 * All non-newline whitespace found between HTML nodes is stripped
	 * to ensure SOL state is guaranteed.
	 *
	 * @return bool The flag passed to the constructor.
	 */
	public function isForceSOL(): bool {
		return $this->forceSOL;
	}

	/**
	 * List helper: This is a shared *after* newline handler for list items.
	 *
	 * @param DOMElement $node
	 * @param DOMNode $otherNode
	 * @return array An array in the form [ 'min' => <int>, 'max' => <int> ] or an empty array.
	 */
	protected function wtListEOL( DOMElement $node, DOMNode $otherNode ): array {
		if ( !DOMUtils::isElt( $otherNode ) || DOMUtils::atTheTop( $otherNode ) ) {
			return [ 'min' => 0, 'max' => 2 ];
		}
		'@phan-var DOMElement $otherNode';/** @var DOMElement $otherNode */

		if ( WTUtils::isFirstEncapsulationWrapperNode( $otherNode ) ) {
			return [ 'min' => DOMUtils::isList( $node ) ? 1 : 0, 'max' => 2 ];
		}

		$nextSibling = DOMUtils::nextNonSepSibling( $node );
		$dp = DOMDataUtils::getDataParsoid( $otherNode );
		$stx = $dp->stx ?? null;

		// NOTE(review): '&&' binds tighter than '||', so a set $dp->src selects
		// this branch regardless of sibling position. The parentheses only make
		// that existing evaluation order explicit -- confirm it is intended.
		if ( ( $nextSibling === $otherNode && $stx === 'html' ) || isset( $dp->src ) ) {
			return [ 'min' => 0, 'max' => 2 ];
		} elseif ( $nextSibling === $otherNode && DOMUtils::isListOrListItem( $otherNode ) ) {
			if ( DOMUtils::isList( $node ) && $otherNode->nodeName === $node->nodeName ) {
				// Adjacent lists of same type need extra newline
				return [ 'min' => 2, 'max' => 2 ];
			} elseif ( DOMUtils::isListItem( $node )
				|| in_array( $node->parentNode->nodeName, [ 'li', 'dd' ], true )
			) {
				// Top-level list
				return [ 'min' => 1, 'max' => 1 ];
			} else {
				return [ 'min' => 1, 'max' => 2 ];
			}
		} elseif ( DOMUtils::isList( $otherNode )
			|| ( DOMUtils::isElt( $otherNode ) && $stx === 'html' )
		) {
			// last child in ul/ol (the list element is our parent), defer
			// separator constraints to the list.
			return [];
		} elseif (
			DOMUtils::isWikitextBlockNode( $node->parentNode ) &&
			DOMUtils::lastNonSepChild( $node->parentNode ) === $node
		) {
			// A list in a block node (<div>, <td>, etc) doesn't need a trailing empty line
			// if it is the last non-separator child (ex: <div>..</ul></div>)
			return [ 'min' => 1, 'max' => 2 ];
		} elseif ( DOMUtils::isFormattingElt( $otherNode ) ) {
			return [ 'min' => 1, 'max' => 1 ];
		} else {
			return [ 'min' => WTUtils::isNewElt( $node ) ? 2 : 1, 'max' => 2 ];
		}
	}

	/**
	 * List helper: DOM-based list bullet construction.
	 *
	 * Walks from $node up through its ancestors, prepending one bullet
	 * character per list-related ancestor, and stops at the top of the DOM
	 * or at the first ancestor that is neither list markup nor an
	 * HTML-syntax element with both auto-inserted flags set.
	 *
	 * @param SerializerState $state
	 * @param DOMElement $node
	 * @return string The bullets followed by optional leading space, or ''
	 *   when no bullets were collected.
	 */
	protected function getListBullets( SerializerState $state, DOMElement $node ): string {
		// Bullet contributed by an <li>, keyed by the enclosing list's tag name.
		$parentTypes = [
			'ul' => '*',
			'ol' => '#'
		];
		// Bullet contributed by the node itself; list containers and <li>
		// contribute nothing directly here.
		$listTypes = [
			'ul' => '',
			'ol' => '',
			'dl' => '',
			'li' => '',
			'dt' => ';',
			'dd' => ':'
		];

		// For new elements, for prettier wikitext serialization,
		// emit a space after the last bullet (if required)
		$space = $this->getLeadingSpace( $state, $node, ' ' );

		$res = '';
		while ( !DOMUtils::atTheTop( $node ) ) {
			$dp = DOMDataUtils::getDataParsoid( $node );
			$stx = $dp->stx ?? null;
			if ( ( $stx !== 'html' || isset( $dp->liHackSrc ) ) && isset( $listTypes[$node->nodeName] ) ) {
				if ( $node->nodeName === 'li' ) {
					// Find the nearest <ul>/<ol> ancestor to pick the bullet char.
					$parentNode = $node->parentNode;
					while ( $parentNode && !( isset( $parentTypes[$parentNode->nodeName] ) ) ) {
						$parentNode = $parentNode->parentNode;
					}

					if ( $parentNode ) {
						$res = $parentTypes[$parentNode->nodeName] . $res;
					} else {
						$state->getEnv()->log( 'error/html2wt', 'Input DOM is not well-formed.',
							"Top-level <li> found that is not nested in <ol>/<ul>\n LI-node:",
							DOMCompat::getOuterHTML( $node )
						);
					}
				} else {
					$res = $listTypes[$node->nodeName] . $res;
				}
			} elseif ( $stx !== 'html' ||
				empty( $dp->autoInsertedStart ) || empty( $dp->autoInsertedEnd )
			) {
				break;
			}

			$node = $node->parentNode;
		}

		// Don't emit a space if we aren't returning any bullets.
		return strlen( $res ) ? $res . $space : '';
	}

	/**
	 * Helper: Newline constraint helper for table nodes
	 *
	 * @param DOMNode $node
	 * @param DOMNode $origNode
	 * @return int 1 if either node is a new element, 2 otherwise.
	 */
	protected function maxNLsInTable( DOMNode $node, DOMNode $origNode ): int {
		return ( WTUtils::isNewElt( $node ) || WTUtils::isNewElt( $origNode ) ) ? 1 : 2;
	}

	/**
	 * Private helper for serializing table nodes
	 *
	 * @param string $symbol Opening markup for the table element.
	 * @param ?string $endSymbol Markup closing the attribute section; when
	 *   null and attributes are present, ' |' is used.
	 * @param SerializerState $state
	 * @param DOMElement $node
	 * @return string
	 */
	private function serializeTableElement(
		string $symbol, ?string $endSymbol, SerializerState $state, DOMElement $node
	): string {
		$token = WTSUtils::mkTagTk( $node );
		$sAttribs = $state->serializer->serializeAttributes( $node, $token );
		if ( $sAttribs !== '' ) {
			// IMPORTANT: use ?? not ?: in the first check because we want to preserve an
			// empty string. Use != '' in the second to avoid treating '0' as empty.
			return $symbol . ' ' . $sAttribs . ( $endSymbol ?? ' |' );
		} else {
			return $symbol . ( $endSymbol != '' ? $endSymbol : '' );
		}
	}

	/**
	 * Helper: Handles content serialization for table nodes
	 *
	 * When the wrapper is unmodified, the original source between the node's
	 * DSR start and inner start is reused; otherwise the opening markup is
	 * rebuilt from the node's attributes.
	 *
	 * @param string $symbol
	 * @param ?string $endSymbol
	 * @param SerializerState $state
	 * @param DOMElement $node
	 * @param bool $wrapperUnmodified
	 * @return string
	 */
	protected function serializeTableTag(
		string $symbol,
		?string $endSymbol,
		SerializerState $state,
		DOMElement $node,
		bool $wrapperUnmodified
	): string {
		if ( $wrapperUnmodified ) {
			// NOTE(review): assumes dsr is set whenever $wrapperUnmodified is
			// true -- confirm against callers.
			$dsr = DOMDataUtils::getDataParsoid( $node )->dsr;
			return $state->getOrigSrc( $dsr->start, $dsr->innerStart() ) ?? '';
		} else {
			return $this->serializeTableElement( $symbol, $endSymbol, $state, $node );
		}
	}

	/**
	 * Helper: Checks whether syntax information in data-parsoid is valid
	 * in the presence of table edits. For example "|" is no longer valid
	 * table-cell markup if a table cell is added before this cell.
	 *
	 * @param SerializerState $state Not read by this implementation.
	 * @param DOMElement $node
	 * @return bool
	 */
	protected function stxInfoValidForTableCell( SerializerState $state, DOMElement $node ): bool {
		// If row syntax is not set, nothing to worry about
		if ( ( DOMDataUtils::getDataParsoid( $node )->stx ?? null ) !== 'row' ) {
			return true;
		}

		// If we have an identical previous sibling, nothing to worry about
		$prev = DOMUtils::previousNonDeletedSibling( $node );
		return $prev !== null && $prev->nodeName === $node->nodeName;
	}

	/**
	 * Helper for several DOM handlers: Returns whitespace that needs to be emitted
	 * between the markup for the node and its content (ex: table cells, list items)
	 * based on node state (whether the node is original or new content) and other
	 * state (HTML version, whether selective serialization is enabled or not).
	 *
	 * As implemented here, $newEltDefault is returned only for new elements
	 * whose first non-deleted child exists and is not text starting with
	 * whitespace; in every other case '' is returned.
	 *
	 * @param SerializerState $state Not read by this implementation.
	 * @param DOMElement $node
	 * @param string $newEltDefault
	 * @return string
	 */
	protected function getLeadingSpace(
		SerializerState $state, DOMElement $node, string $newEltDefault
	): string {
		$space = '';
		if ( WTUtils::isNewElt( $node ) ) {
			$fc = DOMUtils::firstNonDeletedChild( $node );
			// PORT-FIXME are different \s semantics going to be a problem?
			if ( $fc && ( !DOMUtils::isText( $fc ) || !preg_match( '/^\s/', $fc->nodeValue ) ) ) {
				$space = $newEltDefault;
			}
		}
		return $space;
	}

	/**
	 * Helper for several DOM handlers: Returns whitespace that needs to be emitted
	 * between the markup for the node and its next sibling based on node state
	 * (whether the node is original or new content) and other state (HTML version,
	 * whether selective serialization is enabled or not).
	 *
	 * As implemented here, $newEltDefault is returned only for new elements
	 * whose last non-deleted child exists and is not text ending with
	 * whitespace; in every other case '' is returned.
	 *
	 * @param SerializerState $state Not read by this implementation.
	 * @param DOMElement $node
	 * @param string $newEltDefault
	 * @return string
	 */
	protected function getTrailingSpace(
		SerializerState $state, DOMElement $node, string $newEltDefault
	): string {
		$space = '';
		if ( WTUtils::isNewElt( $node ) ) {
			$lc = DOMUtils::lastNonDeletedChild( $node );
			// PORT-FIXME are different \s semantics going to be a problem?
			if ( $lc && ( !DOMUtils::isText( $lc ) || !preg_match( '/\s$/D', $lc->nodeValue ) ) ) {
				$space = $newEltDefault;
			}
		}
		return $space;
	}

	/**
	 * Helper: Is this node auto-inserted by the HTML5 tree-builder
	 * during wt->html?
	 *
	 * @param DOMNode $node
	 * @return bool True only for elements whose data-parsoid has both
	 *   autoInsertedStart and autoInsertedEnd set to non-empty values.
	 */
	protected function isBuilderInsertedElt( DOMNode $node ): bool {
		if ( !DOMUtils::isElt( $node ) ) {
			return false;
		}
		'@phan-var DOMElement $node';/** @var DOMElement $node */
		$dp = DOMDataUtils::getDataParsoid( $node );
		return !empty( $dp->autoInsertedStart ) && !empty( $dp->autoInsertedEnd );
	}

	/**
	 * Uneditable forms wrapped with mw:Placeholder tags OR unedited nowikis
	 * N.B. We no longer emit self-closed nowikis as placeholders, so remove this
	 * once all our stored content is updated.
	 *
	 * @param DOMElement $node
	 * @param SerializerState $state
	 */
	protected function emitPlaceholderSrc( DOMElement $node, SerializerState $state ) {
		$dp = DOMDataUtils::getDataParsoid( $node );
		// Read src once with the same '' default the pattern checks use, so
		// that an unset src is never passed on to the serializer as null.
		$src = $dp->src ?? '';
		if ( preg_match( '!<nowiki\s*/>!', $src ) ) {
			$state->hasSelfClosingNowikis = true;
		}
		// FIXME: Should this also check for tabs and plain space
		// chars interspersed with newlines?
		if ( preg_match( '/^\n+$/D', $src ) ) {
			// Newline-only source is recorded as a separator, not as content.
			$state->appendSep( $src, $node );
		} else {
			$state->serializer->emitWikitext( $src, $node );
		}
	}

}