<?php
declare( strict_types = 1 );

namespace Wikimedia\Parsoid\Html2Wt\DOMHandlers;

use LogicException;
use Wikimedia\Parsoid\DOM\DocumentFragment;
use Wikimedia\Parsoid\DOM\Element;
use Wikimedia\Parsoid\DOM\Node;
use Wikimedia\Parsoid\Html2Wt\SerializerState;
use Wikimedia\Parsoid\Html2Wt\WTSUtils;
use Wikimedia\Parsoid\Utils\DOMCompat;
use Wikimedia\Parsoid\Utils\DOMDataUtils;
use Wikimedia\Parsoid\Utils\DOMUtils;
use Wikimedia\Parsoid\Utils\WTUtils;

/**
 * HTML -> Wikitext serialization relies on walking the DOM and delegating
 * the serialization requests to different DOM nodes.
 *
 * This class represents the interface that various DOM handlers are expected
 * to implement.
 *
 * There is the core 'handle' method that deals with converting the content
 * of the node into wikitext markup.
 *
 * Then there are 4 newline-constraint methods that specify the constraints
 * that need to be satisfied for the markup to be valid. For example, list items
 * should always start on a newline, but can only have a single newline separator.
 * Paragraphs always start on a newline and need at least 2 newlines in wikitext
 * for them to be recognized as paragraphs.
 *
 * Each of the 4 newline-constraint methods (before, after, firstChild, lastChild)
 * returns an array with a 'min' and 'max' property. If a property is missing, it
 * means that the dom node doesn't have any newline constraints. Some DOM handlers
 * might therefore choose to implement none, some, or all of these methods.
 *
 * The return values of each of these methods are treated as constraints and the
 * caller will have to resolve potentially conflicting constraints between a
 * pair of nodes (siblings, parent-child). For example, if an after handler of
 * a node wants 1 newline, but the before handler of its sibling wants none.
 *
 * Ideally, there should not be any incompatible constraints, but we haven't
 * actually verified that this is the case. All constraint-handling code is in
 * the separators-handling methods.
 */
class DOMHandler {

	/** @var bool Value reported by forceSOL(); fixed at construction time. */
	private $forceSOL;

	/**
	 * @param bool $forceSOL Value that forceSOL() will return for this handler.
	 */
	public function __construct( bool $forceSOL = false ) {
		$this->forceSOL = $forceSOL;
	}

	/**
	 * Serialize a DOM node to wikitext.
	 * Serialized wikitext should be returned via $state::emitChunk().
	 *
	 * This base implementation is a stub that always throws; concrete
	 * handlers are expected to override it.
	 *
	 * @param Element $node
	 * @param SerializerState $state
	 * @param bool $wrapperUnmodified
	 * @return Node|null The node to continue with (need not be an element always)
	 * @throws LogicException Always, in this base implementation.
	 */
	public function handle(
		Element $node, SerializerState $state, bool $wrapperUnmodified = false
	): ?Node {
		throw new LogicException( 'Not implemented.' );
	}

	/**
	 * How many newlines should be emitted *before* this node?
	 *
	 * The base implementation imposes no constraints.
	 *
	 * @param Element $node
	 * @param Node $otherNode
	 * @param SerializerState $state
	 * @return array Empty array here; overrides may return 'min' / 'max' keys.
	 */
	public function before( Element $node, Node $otherNode, SerializerState $state ): array {
		return [];
	}

	/**
	 * How many newlines should be emitted *after* this node?
	 *
	 * The base implementation imposes no constraints.
	 *
	 * @param Element $node
	 * @param Node $otherNode
	 * @param SerializerState $state
	 * @return array Empty array here; overrides may return 'min' / 'max' keys.
	 */
	public function after( Element $node, Node $otherNode, SerializerState $state ): array {
		return [];
	}

	/**
	 * How many newlines should be emitted before the first child?
	 *
	 * The base implementation imposes no constraints.
	 *
	 * @param Element|DocumentFragment $node
	 * @param Node $otherNode
	 * @param SerializerState $state
	 * @return array Empty array here; overrides may return 'min' / 'max' keys.
	 */
	public function firstChild( Node $node, Node $otherNode, SerializerState $state ): array {
		return [];
	}

	/**
	 * How many newlines should be emitted after the last child?
	 *
	 * The base implementation imposes no constraints.
	 *
	 * @param Element|DocumentFragment $node
	 * @param Node $otherNode
	 * @param SerializerState $state
	 * @return array Empty array here; overrides may return 'min' / 'max' keys.
	 */
	public function lastChild( Node $node, Node $otherNode, SerializerState $state ): array {
		return [];
	}

	/**
	 * Put the serializer in start-of-line mode before it is handled.
	 * All non-newline whitespace found between HTML nodes is stripped
	 * to ensure SOL state is guaranteed.
	 *
	 * @return bool The flag passed to the constructor.
	 */
	public function forceSOL(): bool {
		return $this->forceSOL;
	}

	/**
	 * List helper: This is a shared *after* newline handler for list items.
	 *
	 * @param Element $node
	 * @param Node $otherNode
	 * @return array An array in the form [ 'min' => <int>, 'max' => <int> ] or an empty array.
	 */
	protected function wtListEOL( Element $node, Node $otherNode ): array {
		if ( !DOMUtils::isElt( $otherNode ) || DOMUtils::atTheTop( $otherNode ) ) {
			return [ 'min' => 0, 'max' => 2 ];
		}
		'@phan-var Element $otherNode';/** @var Element $otherNode */

		if ( WTUtils::isFirstEncapsulationWrapperNode( $otherNode ) ) {
			return [ 'min' => DOMUtils::isList( $node ) ? 1 : 0, 'max' => 2 ];
		}

		$nextSibling = DOMUtils::nextNonSepSibling( $node );
		$dp = DOMDataUtils::getDataParsoid( $otherNode );
		// NOTE: the grouping is made explicit here. A set "src" on $otherNode
		// relaxes the constraint on its own, whether or not $otherNode is the
		// next non-separator sibling.
		if (
			( $nextSibling === $otherNode && ( $dp->stx ?? null ) === 'html' ) ||
			isset( $dp->src )
		) {
			return [ 'min' => 0, 'max' => 2 ];
		} elseif ( $nextSibling === $otherNode && DOMUtils::isListOrListItem( $otherNode ) ) {
			if ( DOMUtils::isList( $node ) && DOMCompat::nodeName( $otherNode ) === DOMCompat::nodeName( $node ) ) {
				// Adjacent lists of same type need extra newline
				return [ 'min' => 2, 'max' => 2 ];
			} elseif ( DOMUtils::isListItem( $node )
				|| in_array( DOMCompat::nodeName( $node->parentNode ), [ 'li', 'dd' ], true )
			) {
				// Top-level list
				return [ 'min' => 1, 'max' => 1 ];
			} else {
				return [ 'min' => 1, 'max' => 2 ];
			}
		} elseif ( DOMUtils::isList( $otherNode )
			|| ( DOMUtils::isElt( $otherNode ) && ( $dp->stx ?? null ) === 'html' )
		) {
			// last child in ul/ol (the list element is our parent), defer
			// separator constraints to the list.
			return [];
		} elseif (
			DOMUtils::isWikitextBlockNode( $node->parentNode ) &&
			DOMUtils::lastNonSepChild( $node->parentNode ) === $node
		) {
			// A list in a block node (<div>, <td>, etc) doesn't need a trailing empty line
			// if it is the last non-separator child (ex: <div>..</ul></div>)
			return [ 'min' => 1, 'max' => 2 ];
		} elseif ( DOMUtils::isFormattingElt( $otherNode ) ) {
			return [ 'min' => 1, 'max' => 1 ];
		} else {
			return [ 'min' => WTUtils::isNewElt( $node ) ? 2 : 1, 'max' => 2 ];
		}
	}

	/**
	 * List helper: DOM-based list bullet construction.
	 *
	 * Walks from $node up through its ancestors, prepending one bullet
	 * character per list-related element, and stops at the first ancestor
	 * that is neither a list-related element nor a literal-HTML node with
	 * both auto-inserted flags set.
	 *
	 * @param SerializerState $state
	 * @param Element $node
	 * @return string The bullet prefix (plus an optional trailing space for
	 *   new elements), or '' when no bullets were collected.
	 */
	protected function getListBullets( SerializerState $state, Element $node ): string {
		// Bullet contributed by an <li>, keyed by the name of its list ancestor.
		$parentTypes = [
			'ul' => '*',
			'ol' => '#'
		];
		// Bullet contributed directly by a list-related element. The empty
		// entries mark elements that are walked through without adding a bullet
		// here ('li' is handled separately via $parentTypes).
		$listTypes = [
			'ul' => '',
			'ol' => '',
			'dl' => '',
			'li' => '',
			'dt' => ';',
			'dd' => ':'
		];

		// For new elements, for prettier wikitext serialization,
		// emit a space after the last bullet (if required)
		$space = $this->getLeadingSpace( $state, $node, ' ' );

		$res = '';
		while ( !DOMUtils::atTheTop( $node ) ) {
			$dp = DOMDataUtils::getDataParsoid( $node );
			$nodeName = DOMCompat::nodeName( $node );
			if ( isset( $listTypes[$nodeName] ) ) {
				if ( $nodeName === 'li' ) {
					// Find the nearest <ul>/<ol> ancestor to decide between '*' and '#'.
					$parentNode = $node->parentNode;
					while ( $parentNode && !( isset( $parentTypes[DOMCompat::nodeName( $parentNode )] ) ) ) {
						$parentNode = $parentNode->parentNode;
					}

					if ( $parentNode ) {
						// Lists written as literal HTML do not contribute a bullet.
						if ( !WTUtils::isLiteralHTMLNode( $parentNode ) ) {
							$res = $parentTypes[DOMCompat::nodeName( $parentNode )] . $res;
						}
					} else {
						$state->getEnv()->log( 'error/html2wt', 'Input DOM is not well-formed.',
							"Top-level <li> found that is not nested in <ol>/<ul>\n LI-node:",
							DOMCompat::getOuterHTML( $node )
						);
					}
				} elseif ( !WTUtils::isLiteralHTMLNode( $node ) ) {
					$res = $listTypes[$nodeName] . $res;
				}
			} elseif ( !WTUtils::isLiteralHTMLNode( $node ) ||
				empty( $dp->autoInsertedStart ) || empty( $dp->autoInsertedEnd )
			) {
				// Only literal-HTML nodes with both auto-inserted flags set are
				// skipped over; anything else ends the walk.
				break;
			}

			$node = $node->parentNode;
		}

		// Don't emit a space if we aren't returning any bullets.
		return $res !== '' ? $res . $space : '';
	}

	/**
	 * Helper: Newline constraint helper for table nodes
	 *
	 * @param Node $node
	 * @param Node $origNode
	 * @return int 1 if either node is a new element, 2 otherwise.
	 */
	protected function maxNLsInTable( Node $node, Node $origNode ): int {
		return ( WTUtils::isNewElt( $node ) || WTUtils::isNewElt( $origNode ) ) ? 1 : 2;
	}

	/**
	 * Private helper for serializing table nodes
	 *
	 * Builds "<symbol> <attributes><endSymbol>" when the node serializes to
	 * a non-empty attribute string, and "<symbol><endSymbol>" otherwise.
	 *
	 * @param string $symbol
	 * @param ?string $endSymbol When null, ' |' is used after attributes and
	 *   nothing is appended when there are no attributes.
	 * @param SerializerState $state
	 * @param Element $node
	 * @return string
	 */
	private function serializeTableElement(
		string $symbol, ?string $endSymbol, SerializerState $state, Element $node
	): string {
		$token = WTSUtils::mkTagTk( $node );
		$sAttribs = $state->serializer->serializeAttributes( $node, $token );
		if ( $sAttribs !== '' ) {
			// IMPORTANT: use ?? not ?: because we want to preserve an
			// empty-string $endSymbol; only null falls back to ' |'.
			return $symbol . ' ' . $sAttribs . ( $endSymbol ?? ' |' );
		}

		// No attributes: append the end symbol verbatim. Using ?? (not ?:)
		// avoids treating '0' as empty.
		return $symbol . ( $endSymbol ?? '' );
	}

	/**
	 * Helper: Handles content serialization for table nodes
	 *
	 * @param string $symbol
	 * @param ?string $endSymbol
	 * @param SerializerState $state
	 * @param Element $node
	 * @param bool $wrapperUnmodified When true, the original source between
	 *   the node's DSR start and inner start is reused instead of
	 *   re-serializing the tag.
	 * @return string
	 */
	protected function serializeTableTag(
		string $symbol,
		?string $endSymbol,
		SerializerState $state,
		Element $node,
		bool $wrapperUnmodified
	): string {
		if ( $wrapperUnmodified ) {
			$dsr = DOMDataUtils::getDataParsoid( $node )->dsr;
			// Fall back to '' if the original source is unavailable.
			return $state->getOrigSrc( $dsr->start, $dsr->innerStart() ) ?? '';
		} else {
			return $this->serializeTableElement( $symbol, $endSymbol, $state, $node );
		}
	}

	/**
	 * Helper: Checks whether syntax information in data-parsoid is valid
	 * in the presence of table edits. For example "|" is no longer valid
	 * table-cell markup if a table cell is added before this cell.
	 *
	 * @param SerializerState $state
	 * @param Element $node
	 * @return bool
	 */
	protected function stxInfoValidForTableCell( SerializerState $state, Element $node ): bool {
		// If row syntax is not set, nothing to worry about
		if ( ( DOMDataUtils::getDataParsoid( $node )->stx ?? null ) !== 'row' ) {
			return true;
		}

		// If we have an identical previous sibling, nothing to worry about
		$prev = DOMUtils::previousNonDeletedSibling( $node );
		return $prev !== null && DOMCompat::nodeName( $prev ) === DOMCompat::nodeName( $node );
	}

	/**
	 * Helper for several DOM handlers: Returns whitespace that needs to be emitted
	 * between the markup for the node and its content (ex: table cells, list items)
	 * based on node state (whether the node is original or new content) and other
	 * state (HTML version, whether selective serialization is enabled or not).
	 *
	 * @param SerializerState $state
	 * @param Element $node
	 * @param string $newEltDefault Whitespace to use for a new element whose
	 *   first non-deleted child does not already start with whitespace.
	 * @return string $newEltDefault or ''.
	 */
	protected function getLeadingSpace(
		SerializerState $state, Element $node, string $newEltDefault
	): string {
		$space = '';
		if ( WTUtils::isNewElt( $node ) ) {
			$fc = DOMUtils::firstNonDeletedChild( $node );
			// PORT-FIXME are different \s semantics going to be a problem?
			if ( $fc && ( !DOMUtils::isText( $fc ) || !preg_match( '/^\s/', $fc->nodeValue ) ) ) {
				$space = $newEltDefault;
			}
		}
		return $space;
	}

	/**
	 * Helper for several DOM handlers: Returns whitespace that needs to be emitted
	 * between the markup for the node and its next sibling based on node state
	 * (whether the node is original or new content) and other state (HTML version,
	 * whether selective serialization is enabled or not).
	 *
	 * @param SerializerState $state
	 * @param Element $node
	 * @param string $newEltDefault Whitespace to use for a new element whose
	 *   last non-deleted child does not already end with whitespace.
	 * @return string $newEltDefault or ''.
	 */
	protected function getTrailingSpace(
		SerializerState $state, Element $node, string $newEltDefault
	): string {
		$space = '';
		if ( WTUtils::isNewElt( $node ) ) {
			$lc = DOMUtils::lastNonDeletedChild( $node );
			// PORT-FIXME are different \s semantics going to be a problem?
			if ( $lc && ( !DOMUtils::isText( $lc ) || !preg_match( '/\s$/D', $lc->nodeValue ) ) ) {
				$space = $newEltDefault;
			}
		}
		return $space;
	}

	/**
	 * Helper: Is this node auto-inserted by the HTML5 tree-builder
	 * during wt->html?
	 *
	 * @param Node $node
	 * @return bool True only for elements whose data-parsoid has both
	 *   autoInsertedStart and autoInsertedEnd set to non-empty values.
	 */
	protected function isBuilderInsertedElt( Node $node ): bool {
		if ( !DOMUtils::isElt( $node ) ) {
			return false;
		}
		'@phan-var Element $node';/** @var Element $node */
		$dp = DOMDataUtils::getDataParsoid( $node );
		return !empty( $dp->autoInsertedStart ) && !empty( $dp->autoInsertedEnd );
	}

	/**
	 * Uneditable forms wrapped with mw:Placeholder tags OR unedited nowikis
	 * N.B. We no longer emit self-closed nowikis as placeholders, so remove this
	 * once all our stored content is updated.
	 *
	 * Emits the node's data-parsoid "src": a newline-only src is appended as
	 * a separator, anything else is emitted as wikitext. Also flags the state
	 * when the src contains a self-closed nowiki.
	 *
	 * @param Element $node
	 * @param SerializerState $state
	 */
	protected function emitPlaceholderSrc( Element $node, SerializerState $state ) {
		$dp = DOMDataUtils::getDataParsoid( $node );
		// Normalize a missing src to '' once, so the emit calls below see the
		// same value the checks do and never receive null.
		$src = $dp->src ?? '';
		if ( preg_match( '!<nowiki\s*/>!', $src ) ) {
			$state->hasSelfClosingNowikis = true;
		}
		// FIXME: Should this also check for tabs and plain space
		// chars interspersed with newlines?
		if ( preg_match( '/^\n+$/D', $src ) ) {
			$state->appendSep( $src, $node );
		} else {
			$state->serializer->emitWikitext( $src, $node );
		}
	}

}