1<?php
2declare( strict_types = 1 );
3
4namespace Wikimedia\Parsoid\Html2Wt\DOMHandlers;
5
6use LogicException;
7use Wikimedia\Parsoid\DOM\DocumentFragment;
8use Wikimedia\Parsoid\DOM\Element;
9use Wikimedia\Parsoid\DOM\Node;
10use Wikimedia\Parsoid\Html2Wt\SerializerState;
11use Wikimedia\Parsoid\Html2Wt\WTSUtils;
12use Wikimedia\Parsoid\Utils\DOMCompat;
13use Wikimedia\Parsoid\Utils\DOMDataUtils;
14use Wikimedia\Parsoid\Utils\DOMUtils;
15use Wikimedia\Parsoid\Utils\WTUtils;
16
17/**
18 * HTML -> Wikitext serialization relies on walking the DOM and delegating
19 * the serialization requests to different DOM nodes.
20 *
21 * This class represents the interface that various DOM handlers are expected
22 * to implement.
23 *
24 * There is the core 'handle' method that deals with converting the content
25 * of the node into wikitext markup.
26 *
27 * Then there are 4 newline-constraint methods that specify the constraints
28 * that need to be satisfied for the markup to be valid. For example, list items
29 * should always start on a newline, but can only have a single newline separator.
30 * Paragraphs always start on a newline and need at least 2 newlines in wikitext
31 * for them to be recognized as paragraphs.
32 *
33 * Each of the 4 newline-constraint methods (before, after, firstChild, lastChild)
34 * return an array with a 'min' and 'max' property. If a property is missing, it
35 * means that the dom node doesn't have any newline constraints. Some DOM handlers
36 * might therefore choose to implement none, some, or all of these methods.
37 *
 * The return values of each of these methods are treated as constraints and the
39 * caller will have to resolve potentially conflicting constraints between a
40 * pair of nodes (siblings, parent-child). For example, if an after handler of
41 * a node wants 1 newline, but the before handler of its sibling wants none.
42 *
43 * Ideally, there should not be any incompatible constraints, but we haven't
 * actually verified that this is the case. All constraint-handling code is in
45 * the separators-handling methods.
46 */
class DOMHandler {

	/** @var bool Value reported by forceSOL() */
	private $forceSOL;

	/**
	 * @param bool $forceSOL Whether the serializer should be put in
	 *   start-of-line mode before this handler's node is handled.
	 */
	public function __construct( bool $forceSOL = false ) {
		$this->forceSOL = $forceSOL;
	}

	/**
	 * Serialize a DOM node to wikitext.
	 * Serialized wikitext should be returned via $state::emitChunk().
	 *
	 * This base implementation is a stub and always throws.
	 *
	 * @param Element $node
	 * @param SerializerState $state
	 * @param bool $wrapperUnmodified
	 * @return Node|null The node to continue with (need not be an element always)
	 * @throws LogicException Always, in this base implementation.
	 */
	public function handle(
		Element $node, SerializerState $state, bool $wrapperUnmodified = false
	): ?Node {
		throw new LogicException( 'Not implemented.' );
	}

	/**
	 * How many newlines should be emitted *before* this node?
	 *
	 * The base implementation returns an empty array, i.e. it imposes
	 * no newline constraints.
	 *
	 * @param Element $node
	 * @param Node $otherNode
	 * @param SerializerState $state
	 * @return array An array in the form [ 'min' => <int>, 'max' => <int> ] or an empty array.
	 */
	public function before( Element $node, Node $otherNode, SerializerState $state ): array {
		return [];
	}

	/**
	 * How many newlines should be emitted *after* this node?
	 *
	 * The base implementation returns an empty array, i.e. it imposes
	 * no newline constraints.
	 *
	 * @param Element $node
	 * @param Node $otherNode
	 * @param SerializerState $state
	 * @return array An array in the form [ 'min' => <int>, 'max' => <int> ] or an empty array.
	 */
	public function after( Element $node, Node $otherNode, SerializerState $state ): array {
		return [];
	}

	/**
	 * How many newlines should be emitted before the first child?
	 *
	 * The base implementation returns an empty array, i.e. it imposes
	 * no newline constraints.
	 *
	 * @param Element|DocumentFragment $node
	 * @param Node $otherNode
	 * @param SerializerState $state
	 * @return array An array in the form [ 'min' => <int>, 'max' => <int> ] or an empty array.
	 */
	public function firstChild( Node $node, Node $otherNode, SerializerState $state ): array {
		return [];
	}

	/**
	 * How many newlines should be emitted after the last child?
	 *
	 * The base implementation returns an empty array, i.e. it imposes
	 * no newline constraints.
	 *
	 * @param Element|DocumentFragment $node
	 * @param Node $otherNode
	 * @param SerializerState $state
	 * @return array An array in the form [ 'min' => <int>, 'max' => <int> ] or an empty array.
	 */
	public function lastChild( Node $node, Node $otherNode, SerializerState $state ): array {
		return [];
	}

	/**
	 * Put the serializer in start-of-line mode before it is handled.
	 * All non-newline whitespace found between HTML nodes is stripped
	 * to ensure SOL state is guaranteed.
	 *
	 * @return bool The flag that was passed to the constructor.
	 */
	public function forceSOL(): bool {
		return $this->forceSOL;
	}

	/**
	 * List helper: This is a shared *after* newline handler for list items.
	 *
	 * The branches are order-sensitive: the first matching condition
	 * determines the returned constraint.
	 *
	 * @param Element $node
	 * @param Node $otherNode
	 * @return array An array in the form [ 'min' => <int>, 'max' => <int> ] or an empty array.
	 */
	protected function wtListEOL( Element $node, Node $otherNode ): array {
		if ( !DOMUtils::isElt( $otherNode ) || DOMUtils::atTheTop( $otherNode ) ) {
			return [ 'min' => 0, 'max' => 2 ];
		}
		'@phan-var Element $otherNode';/** @var Element $otherNode */

		if ( WTUtils::isFirstEncapsulationWrapperNode( $otherNode ) ) {
			return [ 'min' => DOMUtils::isList( $node ) ? 1 : 0, 'max' => 2 ];
		}

		$nextSibling = DOMUtils::nextNonSepSibling( $node );
		$dp = DOMDataUtils::getDataParsoid( $otherNode );
		$otherIsNextSibling = $nextSibling === $otherNode;
		$otherHasHtmlStx = ( $dp->stx ?? null ) === 'html';

		// `&&` binds tighter than `||`, so a set $dp->src matches here
		// regardless of the sibling relationship. The parentheses only make
		// that existing grouping explicit.
		if ( ( $otherIsNextSibling && $otherHasHtmlStx ) || isset( $dp->src ) ) {
			return [ 'min' => 0, 'max' => 2 ];
		} elseif ( $otherIsNextSibling && DOMUtils::isListOrListItem( $otherNode ) ) {
			if ( DOMUtils::isList( $node ) && DOMCompat::nodeName( $otherNode ) === DOMCompat::nodeName( $node ) ) {
				// Adjacent lists of same type need extra newline
				return [ 'min' => 2, 'max' => 2 ];
			} elseif ( DOMUtils::isListItem( $node )
				|| in_array( DOMCompat::nodeName( $node->parentNode ), [ 'li', 'dd' ], true )
			) {
				// Top-level list
				return [ 'min' => 1, 'max' => 1 ];
			} else {
				return [ 'min' => 1, 'max' => 2 ];
			}
		} elseif ( DOMUtils::isList( $otherNode )
			|| ( DOMUtils::isElt( $otherNode ) && $otherHasHtmlStx )
		) {
			// last child in ul/ol (the list element is our parent), defer
			// separator constraints to the list.
			return [];
		} elseif (
			DOMUtils::isWikitextBlockNode( $node->parentNode ) &&
			DOMUtils::lastNonSepChild( $node->parentNode ) === $node
		) {
			// A list in a block node (<div>, <td>, etc) doesn't need a trailing empty line
			// if it is the last non-separator child (ex: <div>..</ul></div>)
			return [ 'min' => 1, 'max' => 2 ];
		} elseif ( DOMUtils::isFormattingElt( $otherNode ) ) {
			return [ 'min' => 1, 'max' => 1 ];
		} else {
			return [ 'min' => WTUtils::isNewElt( $node ) ? 2 : 1, 'max' => 2 ];
		}
	}

	/**
	 * List helper: DOM-based list bullet construction.
	 *
	 * Walks from $node up through its ancestors, prepending one bullet
	 * character per list-related element, and stops at the first ancestor
	 * that is neither list-related nor a literal HTML node with both
	 * autoInsertedStart and autoInsertedEnd set.
	 *
	 * @param SerializerState $state
	 * @param Element $node
	 * @return string The bullets followed by the leading space computed for
	 *   $node, or an empty string if no bullets were collected.
	 */
	protected function getListBullets( SerializerState $state, Element $node ): string {
		// Bullet contributed by an <li>, keyed by the name of its enclosing list
		$parentTypes = [
			'ul' => '*',
			'ol' => '#'
		];
		// Bullet contributed by every other list-related element
		$listTypes = [
			'ul' => '',
			'ol' => '',
			'dl' => '',
			'li' => '',
			'dt' => ';',
			'dd' => ':'
		];

		// For new elements, for prettier wikitext serialization,
		// emit a space after the last bullet (if required)
		$space = $this->getLeadingSpace( $state, $node, ' ' );

		$res = '';
		while ( !DOMUtils::atTheTop( $node ) ) {
			$dp = DOMDataUtils::getDataParsoid( $node );
			$nodeName = DOMCompat::nodeName( $node );
			if ( isset( $listTypes[$nodeName] ) ) {
				if ( $nodeName === 'li' ) {
					// Find the nearest <ul>/<ol> ancestor to pick the bullet character
					$parentNode = $node->parentNode;
					while ( $parentNode && !isset( $parentTypes[DOMCompat::nodeName( $parentNode )] ) ) {
						$parentNode = $parentNode->parentNode;
					}

					if ( $parentNode ) {
						if ( !WTUtils::isLiteralHTMLNode( $parentNode ) ) {
							$res = $parentTypes[DOMCompat::nodeName( $parentNode )] . $res;
						}
					} else {
						$state->getEnv()->log( 'error/html2wt', 'Input DOM is not well-formed.',
							"Top-level <li> found that is not nested in <ol>/<ul>\n LI-node:",
							DOMCompat::getOuterHTML( $node )
						);
					}
				} elseif ( !WTUtils::isLiteralHTMLNode( $node ) ) {
					$res = $listTypes[$nodeName] . $res;
				}
			} elseif ( !WTUtils::isLiteralHTMLNode( $node ) ||
				empty( $dp->autoInsertedStart ) || empty( $dp->autoInsertedEnd )
			) {
				break;
			}

			$node = $node->parentNode;
		}

		// Don't emit a space if we aren't returning any bullets.
		return $res !== '' ? $res . $space : '';
	}

	/**
	 * Helper: Newline constraint helper for table nodes
	 *
	 * @param Node $node
	 * @param Node $origNode
	 * @return int 1 if either node is a new element, 2 otherwise.
	 */
	protected function maxNLsInTable( Node $node, Node $origNode ): int {
		return ( WTUtils::isNewElt( $node ) || WTUtils::isNewElt( $origNode ) ) ? 1 : 2;
	}

	/**
	 * Private helper for serializing table nodes
	 *
	 * @param string $symbol Opening markup for the table element
	 * @param ?string $endSymbol Markup appended after the attributes; when
	 *   null, ' |' is used if there are attributes and nothing otherwise.
	 * @param SerializerState $state
	 * @param Element $node
	 * @return string
	 */
	private function serializeTableElement(
		string $symbol, ?string $endSymbol, SerializerState $state, Element $node
	): string {
		$token = WTSUtils::mkTagTk( $node );
		$sAttribs = $state->serializer->serializeAttributes( $node, $token );
		if ( $sAttribs !== '' ) {
			// IMPORTANT: use ?? not ?: because we want to preserve an
			// explicitly empty end symbol; only null falls back to ' |'.
			return $symbol . ' ' . $sAttribs . ( $endSymbol ?? ' |' );
		}
		// Without attributes only null needs mapping to ''. A truthiness
		// check would wrongly drop an end symbol of '0'.
		return $symbol . ( $endSymbol ?? '' );
	}

	/**
	 * Helper: Handles content serialization for table nodes
	 *
	 * When the wrapper is unmodified, the original source between the
	 * node's dsr start and inner start is reused (or '' if getOrigSrc()
	 * returns null); otherwise the tag is serialized afresh.
	 *
	 * @param string $symbol
	 * @param ?string $endSymbol
	 * @param SerializerState $state
	 * @param Element $node
	 * @param bool $wrapperUnmodified
	 * @return string
	 */
	protected function serializeTableTag(
		string $symbol,
		?string $endSymbol,
		SerializerState $state,
		Element $node,
		bool $wrapperUnmodified
	): string {
		if ( $wrapperUnmodified ) {
			$dsr = DOMDataUtils::getDataParsoid( $node )->dsr;
			return $state->getOrigSrc( $dsr->start, $dsr->innerStart() ) ?? '';
		} else {
			return $this->serializeTableElement( $symbol, $endSymbol, $state, $node );
		}
	}

	/**
	 * Helper: Checks whether syntax information in data-parsoid is valid
	 * in the presence of table edits. For example "|" is no longer valid
	 * table-cell markup if a table cell is added before this cell.
	 *
	 * @param SerializerState $state Not consulted by this implementation.
	 * @param Element $node
	 * @return bool True if the node's stx is not 'row', or if its previous
	 *   non-deleted sibling exists and has the same node name.
	 */
	protected function stxInfoValidForTableCell( SerializerState $state, Element $node ): bool {
		// If row syntax is not set, nothing to worry about
		if ( ( DOMDataUtils::getDataParsoid( $node )->stx ?? null ) !== 'row' ) {
			return true;
		}

		// If we have an identical previous sibling, nothing to worry about
		$prev = DOMUtils::previousNonDeletedSibling( $node );
		return $prev !== null && DOMCompat::nodeName( $prev ) === DOMCompat::nodeName( $node );
	}

	/**
	 * Helper for several DOM handlers: Returns whitespace that needs to be emitted
	 * between the markup for the node and its content (ex: table cells, list items)
	 * based on node state (whether the node is original or new content) and other
	 * state (HTML version, whether selective serialization is enabled or not).
	 *
	 * @param SerializerState $state Not consulted by this implementation.
	 * @param Element $node
	 * @param string $newEltDefault Returned when $node is a new element whose
	 *   first non-deleted child exists and is not text starting with whitespace.
	 * @return string $newEltDefault or an empty string.
	 */
	protected function getLeadingSpace(
		SerializerState $state, Element $node, string $newEltDefault
	): string {
		if ( !WTUtils::isNewElt( $node ) ) {
			return '';
		}

		$fc = DOMUtils::firstNonDeletedChild( $node );
		// PORT-FIXME are different \s semantics going to be a problem?
		if ( $fc && ( !DOMUtils::isText( $fc ) || !preg_match( '/^\s/', $fc->nodeValue ) ) ) {
			return $newEltDefault;
		}
		return '';
	}

	/**
	 * Helper for several DOM handlers: Returns whitespace that needs to be emitted
	 * between the markup for the node and its next sibling based on node state
	 * (whether the node is original or new content) and other state (HTML version,
	 * whether selective serialization is enabled or not).
	 *
	 * @param SerializerState $state Not consulted by this implementation.
	 * @param Element $node
	 * @param string $newEltDefault Returned when $node is a new element whose
	 *   last non-deleted child exists and is not text ending with whitespace.
	 * @return string $newEltDefault or an empty string.
	 */
	protected function getTrailingSpace(
		SerializerState $state, Element $node, string $newEltDefault
	): string {
		if ( !WTUtils::isNewElt( $node ) ) {
			return '';
		}

		$lc = DOMUtils::lastNonDeletedChild( $node );
		// PORT-FIXME are different \s semantics going to be a problem?
		if ( $lc && ( !DOMUtils::isText( $lc ) || !preg_match( '/\s$/D', $lc->nodeValue ) ) ) {
			return $newEltDefault;
		}
		return '';
	}

	/**
	 * Helper: Is this node auto-inserted by the HTML5 tree-builder
	 * during wt->html?
	 *
	 * @param Node $node
	 * @return bool True only for elements whose data-parsoid has both
	 *   autoInsertedStart and autoInsertedEnd set to non-empty values.
	 */
	protected function isBuilderInsertedElt( Node $node ): bool {
		if ( !DOMUtils::isElt( $node ) ) {
			return false;
		}
		'@phan-var Element $node';/** @var Element $node */
		$dp = DOMDataUtils::getDataParsoid( $node );
		return !empty( $dp->autoInsertedStart ) && !empty( $dp->autoInsertedEnd );
	}

	/**
	 * Uneditable forms wrapped with mw:Placeholder tags OR unedited nowikis
	 * N.B. We no longer emit self-closed nowikis as placeholders, so remove this
	 * once all our stored content is updated.
	 *
	 * A source made up solely of newlines is appended as a separator; any
	 * other source is emitted as wikitext.
	 *
	 * @param Element $node
	 * @param SerializerState $state
	 */
	protected function emitPlaceholderSrc( Element $node, SerializerState $state ) {
		$dp = DOMDataUtils::getDataParsoid( $node );
		// Normalise a missing src to '' once, so the value that is tested is
		// also the value that is emitted (never null).
		$src = $dp->src ?? '';
		if ( preg_match( '!<nowiki\s*/>!', $src ) ) {
			$state->hasSelfClosingNowikis = true;
		}
		// FIXME: Should this also check for tabs and plain space
		// chars interspersed with newlines?
		if ( preg_match( '/^\n+$/D', $src ) ) {
			$state->appendSep( $src, $node );
		} else {
			$state->serializer->emitWikitext( $src, $node );
		}
	}

}
406