1<?php
2declare( strict_types = 1 );
3
4namespace Wikimedia\Parsoid\Html2Wt\DOMHandlers;
5
6use DOMDocumentFragment;
7use DOMElement;
8use DOMNode;
9use LogicException;
10use Wikimedia\Parsoid\Html2Wt\SerializerState;
11use Wikimedia\Parsoid\Html2Wt\WTSUtils;
12use Wikimedia\Parsoid\Utils\DOMCompat;
13use Wikimedia\Parsoid\Utils\DOMDataUtils;
14use Wikimedia\Parsoid\Utils\DOMUtils;
15use Wikimedia\Parsoid\Utils\WTUtils;
16
17/**
18 * HTML -> Wikitext serialization relies on walking the DOM and delegating
19 * the serialization requests to different DOM nodes.
20 *
21 * This class represents the interface that various DOM handlers are expected
22 * to implement.
23 *
24 * There is the core 'handle' method that deals with converting the content
25 * of the node into wikitext markup.
26 *
27 * Then there are 4 newline-constraint methods that specify the constraints
28 * that need to be satisfied for the markup to be valid. For example, list items
29 * should always start on a newline, but can only have a single newline separator.
30 * Paragraphs always start on a newline and need at least 2 newlines in wikitext
31 * for them to be recognized as paragraphs.
32 *
33 * Each of the 4 newline-constraint methods (before, after, firstChild, lastChild)
34 * return an array with a 'min' and 'max' property. If a property is missing, it
35 * means that the dom node doesn't have any newline constraints. Some DOM handlers
36 * might therefore choose to implement none, some, or all of these methods.
37 *
 * The return values of each of these methods are treated as constraints and the
 * caller will have to resolve potentially conflicting constraints between a
 * pair of nodes (siblings, parent-child). For example, the after handler of
 * a node may want 1 newline while the before handler of its sibling wants none.
42 *
 * Ideally, there should not be any incompatible constraints, but we haven't
 * actually verified that this is the case. All constraint-handling code is in
 * the separators-handling methods.
46 */
class DOMHandler {

	/** @var bool Value reported by isForceSOL(); assigned once in the constructor. */
	private $forceSOL;

	/**
	 * @param bool $forceSOL Whether isForceSOL() should report that this handler
	 *   needs the serializer to be in start-of-line state. Defaults to false.
	 */
	public function __construct( bool $forceSOL = false ) {
		$this->forceSOL = $forceSOL;
	}

	/**
	 * Serialize a DOM node to wikitext.
	 * Serialized wikitext should be returned via $state::emitChunk().
	 *
	 * This base implementation does no serialization and always throws.
	 *
	 * @param DOMElement $node
	 * @param SerializerState $state
	 * @param bool $wrapperUnmodified
	 * @return DOMNode|null The node to continue with (need not be an element always)
	 * @throws LogicException Always, in this base implementation.
	 */
	public function handle(
		DOMElement $node, SerializerState $state, bool $wrapperUnmodified = false
	): ?DOMNode {
		throw new LogicException( 'Not implemented.' );
	}

	/**
	 * How many newlines should be emitted *before* this node?
	 *
	 * The base implementation returns an empty array, i.e. it imposes no
	 * newline constraints.
	 *
	 * @param DOMElement $node
	 * @param DOMNode $otherNode
	 * @param SerializerState $state
	 * @return array Optional 'min' / 'max' newline counts; empty means unconstrained.
	 */
	public function before( DOMElement $node, DOMNode $otherNode, SerializerState $state ): array {
		return [];
	}

	/**
	 * How many newlines should be emitted *after* this node?
	 *
	 * The base implementation returns an empty array, i.e. it imposes no
	 * newline constraints.
	 *
	 * @param DOMElement $node
	 * @param DOMNode $otherNode
	 * @param SerializerState $state
	 * @return array Optional 'min' / 'max' newline counts; empty means unconstrained.
	 */
	public function after( DOMElement $node, DOMNode $otherNode, SerializerState $state ): array {
		return [];
	}

	/**
	 * How many newlines should be emitted before the first child?
	 *
	 * The base implementation returns an empty array, i.e. it imposes no
	 * newline constraints.
	 *
	 * @param DOMElement|DOMDocumentFragment $node
	 * @param DOMNode $otherNode
	 * @param SerializerState $state
	 * @return array Optional 'min' / 'max' newline counts; empty means unconstrained.
	 */
	public function firstChild( DOMNode $node, DOMNode $otherNode, SerializerState $state ): array {
		return [];
	}

	/**
	 * How many newlines should be emitted after the last child?
	 *
	 * The base implementation returns an empty array, i.e. it imposes no
	 * newline constraints.
	 *
	 * @param DOMElement|DOMDocumentFragment $node
	 * @param DOMNode $otherNode
	 * @param SerializerState $state
	 * @return array Optional 'min' / 'max' newline counts; empty means unconstrained.
	 */
	public function lastChild( DOMNode $node, DOMNode $otherNode, SerializerState $state ): array {
		return [];
	}

	/**
	 * Put the serializer in start-of-line mode before it is handled.
	 * All non-newline whitespace found between HTML nodes is stripped
	 * to ensure SOL state is guaranteed.
	 *
	 * @return bool The flag passed to the constructor.
	 */
	public function isForceSOL(): bool {
		return $this->forceSOL;
	}

	/**
	 * List helper: This is a shared *after* newline handler for list items.
	 *
	 * The checks below are evaluated in order and the first one that matches
	 * decides the constraint.
	 *
	 * @param DOMElement $node
	 * @param DOMNode $otherNode
	 * @return array An array in the form [ 'min' => <int>, 'max' => <int> ] or an empty array.
	 */
	protected function wtListEOL( DOMElement $node, DOMNode $otherNode ): array {
		if ( !DOMUtils::isElt( $otherNode ) || DOMUtils::atTheTop( $otherNode ) ) {
			return [ 'min' => 0, 'max' => 2 ];
		}
		'@phan-var DOMElement $otherNode';/** @var DOMElement $otherNode */

		if ( WTUtils::isFirstEncapsulationWrapperNode( $otherNode ) ) {
			// A list needs at least one newline here; any other node needs none.
			return [ 'min' => DOMUtils::isList( $node ) ? 1 : 0, 'max' => 2 ];
		}

		$nextSibling = DOMUtils::nextNonSepSibling( $node );
		$dp = DOMDataUtils::getDataParsoid( $otherNode );
		$stx = $dp->stx ?? null;

		// NOTE: `&&` binds tighter than `||`. The parentheses only spell out the
		// grouping the expression always had: a set `src` matches on its own,
		// regardless of sibling position.
		if ( ( $nextSibling === $otherNode && $stx === 'html' ) || isset( $dp->src ) ) {
			return [ 'min' => 0, 'max' => 2 ];
		} elseif ( $nextSibling === $otherNode && DOMUtils::isListOrListItem( $otherNode ) ) {
			if ( DOMUtils::isList( $node ) && $otherNode->nodeName === $node->nodeName ) {
				// Adjacent lists of same type need extra newline
				return [ 'min' => 2, 'max' => 2 ];
			} elseif ( DOMUtils::isListItem( $node )
				|| in_array( $node->parentNode->nodeName, [ 'li', 'dd' ], true )
			) {
				// $node is a list item, or sits directly inside an <li>/<dd>:
				// exactly one newline.
				return [ 'min' => 1, 'max' => 1 ];
			} else {
				return [ 'min' => 1, 'max' => 2 ];
			}
		} elseif ( DOMUtils::isList( $otherNode )
			|| ( DOMUtils::isElt( $otherNode ) && $stx === 'html' )
		) {
			// last child in ul/ol (the list element is our parent), defer
			// separator constraints to the list.
			return [];
		} elseif (
			DOMUtils::isWikitextBlockNode( $node->parentNode ) &&
			DOMUtils::lastNonSepChild( $node->parentNode ) === $node
		) {
			// A list in a block node (<div>, <td>, etc) doesn't need a trailing empty line
			// if it is the last non-separator child (ex: <div>..</ul></div>)
			return [ 'min' => 1, 'max' => 2 ];
		} elseif ( DOMUtils::isFormattingElt( $otherNode ) ) {
			return [ 'min' => 1, 'max' => 1 ];
		} else {
			// New elements ask for at least two newlines, existing ones for one.
			return [ 'min' => WTUtils::isNewElt( $node ) ? 2 : 1, 'max' => 2 ];
		}
	}

	/**
	 * List helper: DOM-based list bullet construction.
	 *
	 * Walks from $node up through its ancestors, prepending one bullet
	 * character per list-related node, until DOMUtils::atTheTop() reports the
	 * top or a node that does not take part in the wikitext list is reached.
	 *
	 * @param SerializerState $state
	 * @param DOMElement $node
	 * @return string The bullet string followed by the leading space computed
	 *   for the starting node, or '' when no bullets were collected.
	 */
	protected function getListBullets( SerializerState $state, DOMElement $node ): string {
		// Bullet character an <li> contributes, keyed by its enclosing list's tag.
		$parentTypes = [
			'ul' => '*',
			'ol' => '#'
		];
		// Bullet character contributed by every other list-related tag.
		// List containers and <li> contribute nothing here; <li> is handled
		// separately via $parentTypes.
		$listTypes = [
			'ul' => '',
			'ol' => '',
			'dl' => '',
			'li' => '',
			'dt' => ';',
			'dd' => ':'
		];

		// For new elements, for prettier wikitext serialization,
		// emit a space after the last bullet (if required).
		// This must be computed before $node is reassigned by the walk below.
		$space = $this->getLeadingSpace( $state, $node, ' ' );

		$res = '';
		while ( !DOMUtils::atTheTop( $node ) ) {
			$dp = DOMDataUtils::getDataParsoid( $node );
			$stx = $dp->stx ?? null;
			if ( ( $stx !== 'html' || isset( $dp->liHackSrc ) ) && isset( $listTypes[$node->nodeName] ) ) {
				if ( $node->nodeName === 'li' ) {
					// Find the closest <ul>/<ol> ancestor to pick '*' or '#'.
					$parentNode = $node->parentNode;
					while ( $parentNode && !( isset( $parentTypes[$parentNode->nodeName] ) ) ) {
						$parentNode = $parentNode->parentNode;
					}

					if ( $parentNode ) {
						$res = $parentTypes[$parentNode->nodeName] . $res;
					} else {
						// No enclosing list was found: log it and contribute no bullet.
						$state->getEnv()->log( 'error/html2wt', 'Input DOM is not well-formed.',
							"Top-level <li> found that is not nested in <ol>/<ul>\n LI-node:",
							DOMCompat::getOuterHTML( $node )
						);
					}
				} else {
					$res = $listTypes[$node->nodeName] . $res;
				}
			} elseif ( $stx !== 'html' ||
				empty( $dp->autoInsertedStart ) || empty( $dp->autoInsertedEnd )
			) {
				// Stop climbing. The only nodes skipped over are html-syntax
				// nodes with both autoInsertedStart and autoInsertedEnd set.
				break;
			}

			$node = $node->parentNode;
		}

		// Don't emit a space if we aren't returning any bullets.
		return $res !== '' ? $res . $space : '';
	}

	/**
	 * Helper: Newline constraint helper for table nodes
	 *
	 * @param DOMNode $node
	 * @param DOMNode $origNode
	 * @return int 1 if WTUtils::isNewElt() is true for either node, otherwise 2.
	 */
	protected function maxNLsInTable( DOMNode $node, DOMNode $origNode ): int {
		return ( WTUtils::isNewElt( $node ) || WTUtils::isNewElt( $origNode ) ) ? 1 : 2;
	}

	/**
	 * Private helper for serializing table nodes
	 *
	 * Produces "$symbol $attribs$endSymbol" when the node serializes to a
	 * non-empty attribute string, and "$symbol$endSymbol" otherwise.
	 *
	 * @param string $symbol
	 * @param ?string $endSymbol Appended after the attributes. With attributes,
	 *   null falls back to ' |'; without attributes, null is treated as ''.
	 * @param SerializerState $state
	 * @param DOMElement $node
	 * @return string
	 */
	private function serializeTableElement(
		string $symbol, ?string $endSymbol, SerializerState $state, DOMElement $node
	): string {
		$token = WTSUtils::mkTagTk( $node );
		$sAttribs = $state->serializer->serializeAttributes( $node, $token );
		if ( $sAttribs !== '' ) {
			// IMPORTANT: use ?? not ?: because we want to preserve an explicitly
			// empty $endSymbol; only null falls back to the default ' |'.
			return $symbol . ' ' . $sAttribs . ( $endSymbol ?? ' |' );
		}

		// No attributes: append $endSymbol as-is, mapping null to ''. Using ??
		// rather than a truthiness test keeps '0' from being treated as empty.
		return $symbol . ( $endSymbol ?? '' );
	}

	/**
	 * Helper: Handles content serialization for table nodes
	 *
	 * @param string $symbol
	 * @param ?string $endSymbol
	 * @param SerializerState $state
	 * @param DOMElement $node
	 * @param bool $wrapperUnmodified When true, the original source between the
	 *   node's dsr start and inner start is returned instead of fresh markup.
	 * @return string
	 */
	protected function serializeTableTag(
		string $symbol,
		?string $endSymbol,
		SerializerState $state,
		DOMElement $node,
		bool $wrapperUnmodified
	): string {
		if ( $wrapperUnmodified ) {
			$dsr = DOMDataUtils::getDataParsoid( $node )->dsr;
			// getOrigSrc() may yield null; fall back to an empty string.
			return $state->getOrigSrc( $dsr->start, $dsr->innerStart() ) ?? '';
		} else {
			return $this->serializeTableElement( $symbol, $endSymbol, $state, $node );
		}
	}

	/**
	 * Helper: Checks whether syntax information in data-parsoid is valid
	 * in the presence of table edits. For example "|" is no longer valid
	 * table-cell markup if a table cell is added before this cell.
	 *
	 * @param SerializerState $state Currently unused by this method.
	 * @param DOMElement $node
	 * @return bool
	 */
	protected function stxInfoValidForTableCell( SerializerState $state, DOMElement $node ): bool {
		// If row syntax is not set, nothing to worry about
		if ( ( DOMDataUtils::getDataParsoid( $node )->stx ?? null ) !== 'row' ) {
			return true;
		}

		// If we have an identical previous sibling, nothing to worry about
		$prev = DOMUtils::previousNonDeletedSibling( $node );
		return $prev !== null && $prev->nodeName === $node->nodeName;
	}

	/**
	 * Helper for several DOM handlers: Returns whitespace that needs to be emitted
	 * between the markup for the node and its content (ex: table cells, list items)
	 * based on node state (whether the node is original or new content).
	 *
	 * $newEltDefault is returned only for a new element whose first non-deleted
	 * child exists and is not a text node that already starts with whitespace.
	 * In every other case the result is ''.
	 *
	 * @param SerializerState $state Currently unused by this method.
	 * @param DOMElement $node
	 * @param string $newEltDefault
	 * @return string
	 */
	protected function getLeadingSpace(
		SerializerState $state, DOMElement $node, string $newEltDefault
	): string {
		$space = '';
		if ( WTUtils::isNewElt( $node ) ) {
			$fc = DOMUtils::firstNonDeletedChild( $node );
			// PORT-FIXME are different \s semantics going to be a problem?
			if ( $fc && ( !DOMUtils::isText( $fc ) || !preg_match( '/^\s/', $fc->nodeValue ) ) ) {
				$space = $newEltDefault;
			}
		}
		return $space;
	}

	/**
	 * Helper for several DOM handlers: Returns whitespace that needs to be emitted
	 * between the markup for the node and its next sibling based on node state
	 * (whether the node is original or new content).
	 *
	 * $newEltDefault is returned only for a new element whose last non-deleted
	 * child exists and is not a text node that already ends with whitespace.
	 * In every other case the result is ''.
	 *
	 * @param SerializerState $state Currently unused by this method.
	 * @param DOMElement $node
	 * @param string $newEltDefault
	 * @return string
	 */
	protected function getTrailingSpace(
		SerializerState $state, DOMElement $node, string $newEltDefault
	): string {
		$space = '';
		if ( WTUtils::isNewElt( $node ) ) {
			$lc = DOMUtils::lastNonDeletedChild( $node );
			// PORT-FIXME are different \s semantics going to be a problem?
			// The D modifier anchors `$` at the very end of the subject only.
			if ( $lc && ( !DOMUtils::isText( $lc ) || !preg_match( '/\s$/D', $lc->nodeValue ) ) ) {
				$space = $newEltDefault;
			}
		}
		return $space;
	}

	/**
	 * Helper: Is this node auto-inserted by the HTML5 tree-builder
	 * during wt->html?
	 *
	 * @param DOMNode $node
	 * @return bool True only for an element whose data-parsoid has both
	 *   autoInsertedStart and autoInsertedEnd set to non-empty values.
	 */
	protected function isBuilderInsertedElt( DOMNode $node ): bool {
		if ( !DOMUtils::isElt( $node ) ) {
			return false;
		}
		'@phan-var DOMElement $node';/** @var DOMElement $node */
		$dp = DOMDataUtils::getDataParsoid( $node );
		return !empty( $dp->autoInsertedStart ) && !empty( $dp->autoInsertedEnd );
	}

	/**
	 * Uneditable forms wrapped with mw:Placeholder tags OR unedited nowikis
	 * N.B. We no longer emit self-closed nowikis as placeholders, so remove this
	 * once all our stored content is updated.
	 *
	 * Emits the node's data-parsoid `src`: as a separator when it consists
	 * solely of newlines, and as wikitext otherwise. A missing `src` is treated
	 * as the empty string.
	 *
	 * @param DOMElement $node
	 * @param SerializerState $state
	 */
	protected function emitPlaceholderSrc( DOMElement $node, SerializerState $state ) {
		$dp = DOMDataUtils::getDataParsoid( $node );
		// Normalize once so that every use below sees a string. Previously only
		// the regex checks were guarded, and an unset `src` reached emitWikitext()
		// as null.
		// NOTE(review): emitWikitext() presumably expects a string -- confirm.
		$src = $dp->src ?? '';

		if ( preg_match( '!<nowiki\s*/>!', $src ) ) {
			$state->hasSelfClosingNowikis = true;
		}
		// FIXME: Should this also check for tabs and plain space
		// chars interspersed with newlines?
		if ( preg_match( '/^\n+$/D', $src ) ) {
			$state->appendSep( $src, $node );
		} else {
			$state->serializer->emitWikitext( $src, $node );
		}
	}

}
405