1<?php
2
3declare( strict_types = 1 );
4
5namespace Wikimedia\Parsoid\Html2Wt;
6
7use Wikimedia\Assert\Assert;
8use Wikimedia\Parsoid\Config\Env;
9use Wikimedia\Parsoid\Config\WikitextConstants;
10use Wikimedia\Parsoid\Core\DomSourceRange;
11use Wikimedia\Parsoid\DOM\Comment;
12use Wikimedia\Parsoid\DOM\DocumentFragment;
13use Wikimedia\Parsoid\DOM\Element;
14use Wikimedia\Parsoid\DOM\Node;
15use Wikimedia\Parsoid\Html2Wt\DOMHandlers\DOMHandler;
16use Wikimedia\Parsoid\Utils\DOMCompat;
17use Wikimedia\Parsoid\Utils\DOMDataUtils;
18use Wikimedia\Parsoid\Utils\DOMUtils;
19use Wikimedia\Parsoid\Utils\PHPUtils;
20use Wikimedia\Parsoid\Utils\TokenUtils;
21use Wikimedia\Parsoid\Utils\Utils;
22use Wikimedia\Parsoid\Utils\WTUtils;
23
24class Separators {
	/*
	 * This regexp looks for leading whitespace on the last line of a separator string.
	 * So, only comments (single or multi-line) or other newlines can precede that
	 * whitespace-of-interest. But, also account for any whitespace preceding newlines
	 * since that needs to be skipped over (Ex: "   \n  ").
	 *
	 * Capture groups:
	 *   1. the skipped prefix: any mix of space-only lines (including their
	 *      newline) and comments,
	 *   2. the run of one or more spaces that follows that prefix,
	 *   3. whatever non-newline text follows those spaces up to the end.
	 *
	 * The D modifier makes "$" match only at the very end of the string
	 * (and not before a final newline).
	 */
	private const INDENT_PRE_WS_IN_SEP_REGEXP =
		'/^((?: *\n|(?:' . Utils::COMMENT_REGEXP_FRAGMENT . '))*)( +)([^\n]*)$/D';
33
	/**
	 * Serializer state that the separator logic reads (SOL / selser flags,
	 * pending separator constraints, original source) and occasionally updates.
	 *
	 * @var SerializerState
	 */
	private $state;

	/**
	 * Environment object; in this class it is used for logging and is handed
	 * to helpers that need it.
	 *
	 * @var Env
	 */
	private $env;
43
44	/**
45	 * Clean up the constraints object to prevent excessively verbose output
46	 * and clog up log files / test runs.
47	 *
48	 * @param array $constraints
49	 * @return array
50	 */
51	private static function loggableConstraints( array $constraints ): array {
52		$c = [
53			'a' => $constraints['a'] ?? null,
54			'b' => $constraints['b'] ?? null,
55			'min' => $constraints['min'] ?? null,
56			'max' => $constraints['max'] ?? null,
57		];
58		if ( !empty( $constraints['constraintInfo'] ) ) {
59			$constraintInfo = $constraints['constraintInfo'];
60			$c['constraintInfo'] = [
61				'onSOL' => $constraintInfo['onSOL'] ?? false,
62				'sepType' => $constraintInfo['sepType'] ?? null,
63				'nodeA' => DOMCompat::nodeName( $constraintInfo['nodeA'] ),
64				'nodeB' => DOMCompat::nodeName( $constraintInfo['nodeB'] ),
65			];
66		}
67		return $c;
68	}
69
70	/**
71	 * @param Node $n
72	 * @return string|null
73	 */
74	private static function precedingSeparatorTextLen( Node $n ): ?int {
75		// Given the CSS white-space property and specifically,
76		// "pre" and "pre-line" values for this property, it seems that any
77		// sane HTML editor would have to preserve IEW in HTML documents
78		// to preserve rendering. One use-case where an editor might change
79		// IEW drastically would be when the user explicitly requests it
80		// (Ex: pretty-printing of raw source code).
81		//
82		// For now, we are going to exploit this. This information is
83		// only used to extrapolate DSR values and extract a separator
84		// string from source, and is only used locally. In addition,
85		// the extracted text is verified for being a valid separator.
86		//
87		// So, at worst, this can create a local dirty diff around separators
88		// and at best, it gets us a clean diff.
89
90		$len = 0;
91		$orig = $n;
92		while ( $n ) {
93			if ( DOMUtils::isIEW( $n ) ) {
94				$len += strlen( $n->nodeValue );
95			} elseif ( $n instanceof Comment ) {
96				$len += WTUtils::decodedCommentLength( $n );
97			} elseif ( $n !== $orig ) { // dont return if input node!
98				return null;
99			}
100
101			$n = $n->previousSibling;
102		}
103
104		return $len;
105	}
106
107	/**
108	 * Helper for updateSeparatorConstraints.
109	 *
110	 * Collects, checks and integrates separator newline requirements to a simple
111	 * min, max structure.
112	 *
113	 * @param Node $nodeA
114	 * @param array $aCons
115	 * @param Node $nodeB
116	 * @param array $bCons
117	 * @return array
118	 */
119	private function getSepNlConstraints(
120		Node $nodeA, array $aCons, Node $nodeB, array $bCons
121	): array {
122		$env = $this->state->getEnv();
123
124		$nlConstraints = [
125			'min' => $aCons['min'] ?? null,
126			'max' => $aCons['max'] ?? null,
127			'constraintInfo' => [],
128		];
129
130		if ( isset( $bCons['min'] ) ) {
131			if ( $nlConstraints['max'] !== null && $nlConstraints['max'] < $bCons['min'] ) {
132				// Conflict, warn and let nodeB win.
133				$env->log(
134					'info/html2wt',
135					'Incompatible constraints 1:',
136					DOMCompat::nodeName( $nodeA ),
137					DOMCompat::nodeName( $nodeB ),
138					self::loggableConstraints( $nlConstraints )
139				);
140				$nlConstraints['min'] = $bCons['min'];
141				$nlConstraints['max'] = $bCons['min'];
142			} else {
143				$nlConstraints['min'] = max( $nlConstraints['min'] ?? 0, $bCons['min'] );
144			}
145		}
146
147		if ( isset( $bCons['max'] ) ) {
148			if ( ( $nlConstraints['min'] ?? 0 ) > $bCons['max'] ) {
149				// Conflict, warn and let nodeB win.
150				$env->log(
151					'info/html2wt',
152					'Incompatible constraints 2:',
153					DOMCompat::nodeName( $nodeA ),
154					DOMCompat::nodeName( $nodeB ),
155					self::loggableConstraints( $nlConstraints )
156				);
157				$nlConstraints['min'] = $bCons['max'];
158				$nlConstraints['max'] = $bCons['max'];
159			} else {
160				$nlConstraints['max'] = min( $nlConstraints['max'] ?? $bCons['max'], $bCons['max'] );
161			}
162		}
163
164		if ( $nlConstraints['max'] === null ) {
165			// Anything more than two lines will trigger paragraphs, so default to
166			// two if nothing is specified. (FIXME: This is a conservative strategy
167			// since strictly speaking, this is not always true. This is more a
168			// cautious fallback to handle cases where some DOM handler is missing
169			// a necessary max constraint.)
170			$nlConstraints['max'] = 2;
171		}
172
173		if ( ( $nlConstraints['min'] ?? 0 ) > $nlConstraints['max'] ) {
174			$nlConstraints['max'] = $nlConstraints['min'];
175		}
176
177		return $nlConstraints;
178	}
179
180	/**
181	 * Create a separator given a (potentially empty) separator text and newline constraints.
182	 *
183	 * @param Node $node
184	 * @param string $sep
185	 * @param array $nlConstraints
186	 * @return string
187	 */
188	private function makeSeparator( Node $node, string $sep, array $nlConstraints ): string {
189		$origSep = $sep;
190		$sepType = $nlConstraints['constraintInfo']['sepType'] ?? null;
191
192		// Split on comment/ws-only lines, consuming subsequent newlines since
193		// those lines are ignored by the PHP parser
194		// Ignore lines with ws and a single comment in them
195		$splitRe = implode( [ "#(?:\n(?:[ \t]*?",
196				Utils::COMMENT_REGEXP_FRAGMENT,
197				"[ \t]*?)+(?=\n))+|",
198				Utils::COMMENT_REGEXP_FRAGMENT,
199				"#"
200			] );
201		$sepNlCount = substr_count( implode( preg_split( $splitRe, $sep ) ), "\n" );
202		$minNls = $nlConstraints['min'] ?? 0;
203
204		if ( $this->state->atStartOfOutput && $minNls > 0 ) {
205			// Skip first newline as we are in start-of-line context
206			$minNls--;
207		}
208
209		if ( $minNls > 0 && $sepNlCount < $minNls ) {
210			// Append newlines
211			$nlBuf = [];
212			for ( $i = 0; $i < ( $minNls - $sepNlCount ); $i++ ) {
213				$nlBuf[] = "\n";
214			}
215
216			/* ------------------------------------------------------------------
217			 * The following two heuristics try to do a best-guess on where to
218			 * add the newlines relative to nodeA and nodeB that best matches
219			 * wikitext output expectations.
220			 *
221			 * 1. In a parent-child separator scenario, where the first child of
222			 *    nodeA is not an element, it could have contributed to the separator.
223			 *    In that case, the newlines should be prepended because they
224			 *    usually correspond to the parent's constraints,
225			 *    and the separator was plucked from the child.
226			 *
227			 *    Try html2wt on this snippet:
228			 *
229			 *    a<p><!--cmt-->b</p>
230			 *
231			 * 2. In a sibling scenario, if nodeB is a literal-HTML element, nodeA is
232			 *    forcing the newline and hence the newline should be emitted right
233			 *    after it.
234			 *
235			 *    Try html2wt on this snippet:
236			 *
237			 *    <p>foo</p>  <p data-parsoid='{"stx":"html"}'>bar</p>
238			 * -------------------------------------------------------------------- */
239			$constraintInfo = $nlConstraints['constraintInfo'] ?? [];
240			$sepType = $constraintInfo['sepType'] ?? null;
241			$nodeA = $constraintInfo['nodeA'] ?? null;
242			$nodeB = $constraintInfo['nodeB'] ?? null;
243			if (
244				$sepType === 'parent-child' &&
245				!DOMUtils::isContentNode( DOMUtils::firstNonDeletedChild( $nodeA ) ) &&
246				!(
247					isset( WikitextConstants::$HTML['ChildTableTags'][DOMCompat::nodeName( $nodeB )] ) &&
248					!WTUtils::isLiteralHTMLNode( $nodeB )
249				)
250			) {
251				$sep = implode( $nlBuf ) . $sep;
252			} elseif ( $sepType === 'sibling' && WTUtils::isLiteralHTMLNode( $nodeB ) ) {
253				$sep = implode( $nlBuf ) . $sep;
254			} else {
255				$sep .= implode( $nlBuf );
256			}
257		} elseif ( isset( $nlConstraints['max'] ) && $sepNlCount > $nlConstraints['max'] && (
258			// In selser mode, if the current node is an unmodified rendering-transparent node
259			// of a sibling pair, leave the separator alone since the excess newlines aren't
260			// going to change the semantics of how this node will be parsed in wt->html direction.
261			// This will instead eliminate a dirty diff on the page.
262			!$this->state->selserMode ||
263			$sepType !== 'sibling' ||
264			!$this->state->currNodeUnmodified ||
265			!WTUtils::isRenderingTransparentNode( $node )
266		) ) {
267			// Strip some newlines outside of comments.
268			//
269			// Capture separators in a single array with a capturing version of
270			// the split regexp, so that we can work on the non-separator bits
271			// when stripping newlines.
272			//
273			// Dirty-diff minimizing heuristic: Strip newlines away from an unmodified node.
274			// If both nodes are unmodified, this dirties the separator before the current node.
275			// If both nodes are modified, this dirties the separator after the previous node.
276			$allBits = preg_split( '#(' . PHPUtils::reStrip( $splitRe, '#' ) . ')#',
277				$sep, -1, PREG_SPLIT_DELIM_CAPTURE );
278			$newBits = [];
279			$n = $sepNlCount - $nlConstraints['max'];
280
281			$stripAtEnd = $this->state->prevNodeUnmodified;
282			while ( $n > 0 ) {
283				$bit = $stripAtEnd ? array_pop( $allBits ) : array_shift( $allBits );
284				while ( $bit && preg_match( $splitRe, $bit ) ) {
285					// Retain comment-only lines as is
286					$newBits[] = $bit;
287					$bit = $stripAtEnd ? array_pop( $allBits ) : array_shift( $allBits );
288				}
289				// @phan-suppress-next-line PhanPluginLoopVariableReuse
290				while ( $n > 0 && str_contains( $bit, "\n" ) ) {
291					$bit = preg_replace( '/\n([^\n]*)/', '$1', $bit, 1 );
292					$n--;
293				}
294				$newBits[] = $bit;
295			}
296			if ( $stripAtEnd ) {
297				$newBits = array_merge( $allBits, array_reverse( $newBits ) );
298			} else {
299				PHPUtils::pushArray( $newBits, $allBits );
300			}
301			$sep = implode( $newBits );
302		}
303
304		$this->state->getEnv()->log(
305			'debug/wts/sep',
306			'make-new   |',
307			static function () use ( $nlConstraints, $sepNlCount, $minNls, $sep, $origSep ) {
308				$constraints = Utils::clone( $nlConstraints );
309				unset( $constraints['constraintInfo'] );
310				return PHPUtils::jsonEncode( $sep ) . ', ' . PHPUtils::jsonEncode( $origSep ) . ', ' .
311					$minNls . ', ' . $sepNlCount . ', ' . PHPUtils::jsonEncode( $constraints );
312			}
313		);
314
315		return $sep;
316	}
317
318	/**
319	 * Merge two constraints.
320	 * @param Env $env
321	 * @param array $oldConstraints
322	 * @param array $newConstraints
323	 * @return array
324	 */
325	private static function mergeConstraints(
326		Env $env, array $oldConstraints, array $newConstraints
327	): array {
328		$res = [
329			'min' => max( $oldConstraints['min'] ?? 0, $newConstraints['min'] ?? 0 ),
330			'max' => min( $oldConstraints['max'] ?? 2, $newConstraints['max'] ?? 2 ),
331			'constraintInfo' => [],
332		];
333
334		if ( $res['min'] > $res['max'] ) {
335			$res['max'] = $res['min'];
336			$env->log(
337				'info/html2wt',
338				'Incompatible constraints (merge):',
339				$res,
340				self::loggableConstraints( $oldConstraints ),
341				self::loggableConstraints( $newConstraints )
342			);
343		}
344
345		return $res;
346	}
347
348	/**
349	 * @param Node $node
350	 * @return string
351	 */
352	public static function debugOut( Node $node ): string {
353		$value = '';
354		if ( $node instanceof Element ) {
355			$value = DOMCompat::getOuterHTML( $node );
356		}
357		if ( !$value ) {
358			$value = $node->nodeValue;
359		}
360		return mb_substr( PHPUtils::jsonEncode( $value ), 0, 40 );
361	}
362
363	/**
364	 * Figure out separator constraints and merge them with existing constraints
365	 * in state so that they can be emitted when the next content emits source.
366	 *
367	 * @param Node $nodeA
368	 * @param DOMHandler $sepHandlerA
369	 * @param Node $nodeB
370	 * @param DOMHandler $sepHandlerB
371	 */
372	public function updateSeparatorConstraints(
373		Node $nodeA, DOMHandler $sepHandlerA, Node $nodeB, DOMHandler $sepHandlerB
374	): void {
375		$state = $this->state;
376
377		if ( $nodeB->parentNode === $nodeA ) {
378			// parent-child separator, nodeA parent of nodeB
379			'@phan-var Element|DocumentFragment $nodeA'; // @var Element|DocumentFragment $nodeA
380			$sepType = 'parent-child';
381			$aCons = $sepHandlerA->firstChild( $nodeA, $nodeB, $state );
382			$bCons = $nodeB instanceof Element ? $sepHandlerB->before( $nodeB, $nodeA, $state ) : [];
383		} elseif ( $nodeA->parentNode === $nodeB ) {
384			// parent-child separator, nodeB parent of nodeA
385			'@phan-var Element|DocumentFragment $nodeB'; // @var Element|DocumentFragment $nodeA
386			$sepType = 'child-parent';
387			$aCons = $nodeA instanceof Element ? $sepHandlerA->after( $nodeA, $nodeB, $state ) : [];
388			$bCons = $sepHandlerB->lastChild( $nodeB, $nodeA, $state );
389		} else {
390			// sibling separator
391			$sepType = 'sibling';
392			$aCons = $nodeA instanceof Element ? $sepHandlerA->after( $nodeA, $nodeB, $state ) : [];
393			$bCons = $nodeB instanceof Element ? $sepHandlerB->before( $nodeB, $nodeA, $state ) : [];
394		}
395		$nlConstraints = $this->getSepNlConstraints( $nodeA, $aCons, $nodeB, $bCons );
396
397		if ( !empty( $state->sep->constraints ) ) {
398			// Merge the constraints
399			$state->sep->constraints = self::mergeConstraints(
400				$this->env,
401				$state->sep->constraints,
402				$nlConstraints
403			);
404		} else {
405			$state->sep->constraints = $nlConstraints;
406		}
407
408		$this->env->log(
409			'debug/wts/sep',
410			function () use ( $sepType, $nodeA, $nodeB, $state ) {
411				return 'constraint' . ' | ' .
412					$sepType . ' | ' .
413					'<' . DOMCompat::nodeName( $nodeA ) . ',' . DOMCompat::nodeName( $nodeB ) .
414					'>' . ' | ' . PHPUtils::jsonEncode( $state->sep->constraints ) . ' | ' .
415					self::debugOut( $nodeA ) . ' | ' . self::debugOut( $nodeB );
416			}
417		);
418
419		$state->sep->constraints['constraintInfo'] = [
420			'onSOL' => $state->onSOL,
421			// force SOL state when separator is built/emitted
422			'forceSOL' => $sepHandlerB->forceSOL(),
423			'sepType' => $sepType,
424			'nodeA' => $nodeA,
425			'nodeB' => $nodeB,
426		];
427	}
428
429	/**
430	 * @param Env $env
431	 * @param SerializerState $state
432	 */
433	public function __construct( Env $env, SerializerState $state ) {
434		$this->env = $env;
435		$this->state = $state;
436	}
437
	/**
	 * Make sure leading spaces on the last line of a separator cannot trigger
	 * an indent-pre.
	 *
	 * If the separator matches INDENT_PRE_WS_IN_SEP_REGEXP in a context where
	 * the spaces would sit at start-of-line, and the content that follows is
	 * not established to be indent-pre safe, the spaces are wrapped in
	 * <nowiki> tags. If nodeB is a non-literal-HTML tag listed in
	 * WikitextConstants::$HTMLTagsRequiringSOLContext and we are on (or forcing)
	 * SOL, the spaces are dropped instead.
	 *
	 * Side effect: when a <nowiki> wrapper is added, $state->onSOL is set to
	 * false and $state->hasIndentPreNowikis to true.
	 *
	 * @param string $sep Separator text that will be emitted before nodeB
	 * @param array $nlConstraints Constraints, including the 'constraintInfo'
	 *   entry (onSOL, forceSOL, sepType, nodeA, nodeB)
	 * @return string The (possibly rewritten) separator
	 */
	private function makeSepIndentPreSafe(
		string $sep, array $nlConstraints
	): string {
		$state = $this->state;
		$constraintInfo = $nlConstraints['constraintInfo'] ?? [];
		$sepType = $constraintInfo['sepType'] ?? null;
		$nodeA = $constraintInfo['nodeA'] ?? null;
		$nodeB = $constraintInfo['nodeB'] ?? null;
		// forceSOL is not honored for child-parent separators
		$forceSOL = ( $constraintInfo['forceSOL'] ?? false ) && $sepType !== 'child-parent';
		// $nodeB may be advanced along its sibling chain below; remember where we started
		$origNodeB = $nodeB;

		// Ex: "<div>foo</div>\n <span>bar</span>"
		//
		// We also should test for onSOL state to deal with HTML like
		// <ul> <li>foo</li></ul>
		// and strip the leading space before non-indent-pre-safe tags
		if (
			!$state->inPHPBlock &&
			!$state->inIndentPre &&
			preg_match( self::INDENT_PRE_WS_IN_SEP_REGEXP, $sep ) && (
				str_contains( $sep, "\n" ) || !empty( $constraintInfo['onSOL'] ) || $forceSOL
			)
		) {
			// 'sep' is the separator before 'nodeB' and it has leading spaces on a newline.
			// We have to decide whether that leading space will trigger indent-pres in wikitext.
			// The decision depends on where this separator will be emitted relative
			// to 'nodeA' and 'nodeB'.

			$isIndentPreSafe = false;

			// Example sepType scenarios:
			//
			// 1. sibling
			// <div>foo</div>
			// <span>bar</span>
			// The span will be wrapped in an indent-pre if the leading space
			// is not stripped since span is not a block tag
			//
			// 2. child-parent
			// <span>foo
			// </span>bar
			// The " </span>bar" will be wrapped in an indent-pre if the
			// leading space is not stripped since span is not a block tag
			//
			// 3. parent-child
			// <div>foo
			// <span>bar</span>
			// </div>
			//
			// In all cases, only block-tags prevent indent-pres.
			// (except for a special case for <br> nodes)
			if ( $nodeB && WTSUtils::precedingSpaceSuppressesIndentPre( $nodeB, $origNodeB ) ) {
				$isIndentPreSafe = true;
			} elseif ( $sepType === 'sibling' || $nodeA && DOMUtils::atTheTop( $nodeA ) ) {
				// If nodeA is at the top, this has to be a parent-child separator
				Assert::invariant( !DOMUtils::atTheTop( $nodeA ) || $sepType === 'parent-child', __METHOD__ );

				// 'nodeB' is the first non-separator child of 'nodeA'.
				//
				// Walk past sol-transparent nodes in the right-sibling chain
				// of 'nodeB' till we establish indent-pre safety.
				while ( $nodeB &&
					( DOMUtils::isDiffMarker( $nodeB ) || WTUtils::emitsSolTransparentSingleLineWT( $nodeB ) )
				) {
					$nodeB = $nodeB->nextSibling;
				}

				// Running off the end of the sibling chain counts as safe
				$isIndentPreSafe = !$nodeB || WTSUtils::precedingSpaceSuppressesIndentPre( $nodeB, $origNodeB );
			}

			// Check whether nodeB is nested inside an element that suppresses
			// indent-pres.
			if ( $nodeB && !$isIndentPreSafe && !DOMUtils::atTheTop( $nodeB ) ) {
				$parentB = $nodeB->parentNode; // could be nodeA
				// Skip over ancestors that are zero-width wikitext elements
				while ( WTUtils::isZeroWidthWikitextElt( $parentB ) ) {
					$parentB = $parentB->parentNode;
				}

				// The token stream paragraph wrapper (and legacy doBlockLevels)
				// tracks this separately with $inBlockquote
				$isIndentPreSafe = DOMUtils::hasNameOrHasAncestorOfName(
					$parentB, 'blockquote'
				);

				// First scope wins
				while ( !$isIndentPreSafe && !DOMUtils::atTheTop( $parentB ) ) {
					if (
						TokenUtils::tagOpensBlockScope( DOMCompat::nodeName( $parentB ) ) &&
						// Only html p-tag is indent pre suppressing
						( DOMCompat::nodeName( $parentB ) !== 'p' || WTUtils::isLiteralHTMLNode( $parentB ) )
					) {
						$isIndentPreSafe = true;
						break;
					} elseif ( TokenUtils::tagClosesBlockScope( DOMCompat::nodeName( $parentB ) ) ) {
						break;
					}
					$parentB = $parentB->parentNode;
				}
			}

			// On (or forcing) SOL, before a non-literal-HTML tag that requires
			// SOL context, the leading space is dropped rather than nowiki-wrapped.
			$stripLeadingSpace = ( !empty( $constraintInfo['onSOL'] ) || $forceSOL ) &&
				$nodeB && !WTUtils::isLiteralHTMLNode( $nodeB ) &&
				isset( WikitextConstants::$HTMLTagsRequiringSOLContext[DOMCompat::nodeName( $nodeB )] );
			if ( !$isIndentPreSafe || $stripLeadingSpace ) {
				// Wrap non-nl ws from last line, but preserve comments.
				// This avoids triggering indent-pres.
				$sep = preg_replace_callback(
					self::INDENT_PRE_WS_IN_SEP_REGEXP,
					static function ( $matches ) use ( $stripLeadingSpace, $state ) {
						if ( !$stripLeadingSpace ) {
							// Since we nowiki-ed, we are no longer in sol state
							$state->onSOL = false;
							$state->hasIndentPreNowikis = true;
							$space = '<nowiki>' . $matches[2] . '</nowiki>';
						}
						// When stripping, $space is never set and the spaces are simply omitted
						return ( $matches[1] ?? '' ) . ( $space ?? '' ) . ( $matches[3] ?? '' );
					},
					$sep
				);
			}
		}

		$state->getEnv()->log(
			'debug/wts/sep',
			'ipre-safe  |',
			static function () use ( $sep, $nlConstraints ) {
				$constraints = Utils::clone( $nlConstraints );
				unset( $constraints['constraintInfo'] );
				return PHPUtils::jsonEncode( $sep ) . ', ' . PHPUtils::jsonEncode( $constraints );
			}
		);

		return $sep;
	}
576
577	/**
578	 * Serializing auto inserted content should invalidate the original separator
579	 * @param Element $node
580	 * @return DomSourceRange|null
581	 */
582	private static function handleAutoInserted( Element $node ): ?DomSourceRange {
583		$dp = DOMDataUtils::getDataParsoid( $node );
584		if ( !isset( $dp->dsr ) ) {
585			return null;
586		}
587
588		$dsr = clone $dp->dsr;
589		if ( !empty( $dp->autoInsertedStart ) ) {
590			$dsr->openWidth = null;
591		}
592		if ( !empty( $dp->autoInsertedEnd ) ) {
593			$dsr->closeWidth = null;
594		}
595		return $dsr;
596	}
597
598	/**
599	 * $node is embedded inside a parent node that has its leading/trailing whitespace trimmed
600	 * in the wt->html direction. In this method, we attempt to recover leading trimmed whitespace
601	 * using DSR information on $node.
602	 *
603	 * In some cases, $node might have an additional "data-mw-selser-wrapper" span
604	 * that is added by SelSer - look past those wrappers.
605	 *
606	 * The recovery is attempted in two different ways:
607	 * 1. If we have additional DSR fields about leading/trailing WS
608	 *    (represented by $state->haveTrimmedWsDSR), that info is used.
609	 * 2. If not, we simply inspect source at $dsr->innerStart and if it
610	 *    happens to be whitespace, we use that.
611	 *
612	 * @param Node $node
613	 * @return ?string
614	 */
615	private function fetchLeadingTrimmedSpace( Node $node ): ?string {
616		$origNode = $node;
617		$parentNode = $node->parentNode;
618
619		// Skip past the artificial span wrapper
620		if ( $parentNode instanceof Element && $parentNode->hasAttribute( 'data-mw-selser-wrapper' ) ) {
621			$node = $parentNode;
622			$parentNode = $parentNode->parentNode;
623		}
624
625		// Leading trimmed whitespace only makes sense for first child.
626		// Ignore comments (which are part of separators) + deletion markers.
627		if ( DOMUtils::previousNonSepSibling( $node ) ) {
628			return null;
629		}
630
631		'@phan-var Element|DocumentFragment $parentNode'; // @var Element|DocumentFragment $parentNode
632		if ( isset( WikitextConstants::$WikitextTagsWithTrimmableWS[DOMCompat::nodeName( $parentNode )] ) &&
633			( $origNode instanceof Element || !preg_match( '/^[ \t]/', $origNode->nodeValue ) )
634		) {
635			// Don't reintroduce whitespace that's already been captured as a DisplaySpace
636			if ( DOMUtils::hasTypeOf( $origNode, 'mw:DisplaySpace' ) ) {
637				return null;
638			}
639
640			// FIXME: Is this complexity worth some minor dirty diff on this test?
641			// ParserTest: "3. List embedded in a formatting tag in a misnested way"
642			// I've not added an equivalent check in the trailing whitespace case.
643			if ( $origNode instanceof Element &&
644				isset( DOMDataUtils::getDataParsoid( $origNode )->autoInsertedStart ) &&
645				strspn( $origNode->firstChild->textContent ?? '', " \t" ) >= 1
646			) {
647				return null;
648			}
649
650			$state = $this->state;
651			$dsr = DOMDataUtils::getDataParsoid( $parentNode )->dsr ?? null;
652			if ( Utils::isValidDSR( $dsr, true ) ) {
653				if ( $state->haveTrimmedWsDSR && (
654					$dsr->leadingWS > 0 || ( $dsr->leadingWS === 0 && $dsr->trailingWS > 0 )
655				) ) {
656					$sep = $state->getOrigSrc( $dsr->innerStart(), $dsr->innerStart() + $dsr->leadingWS ) ?? '';
657					return strspn( $sep, " \t" ) === strlen( $sep ) ? $sep : null;
658				} else {
659					$offset = $dsr->innerStart();
660					if ( $offset < $dsr->innerEnd() ) {
661						$sep = $state->getOrigSrc( $offset, $offset + 1 ) ?? '';
662						return preg_match( '/[ \t]/', $sep ) ? $sep : null;
663					}
664				}
665			}
666		}
667
668		return null;
669	}
670
671	/**
672	 * $node is embedded inside a parent node that has its leading/trailing whitespace trimmed
673	 * in the wt->html direction. In this method, we attempt to recover trailing trimmed whitespace
674	 * using DSR information on $node.
675	 *
676	 * In some cases, $node might have an additional "data-mw-selser-wrapper" span
677	 * that is added by SelSer - look past those wrappers.
678	 *
679	 * The recovery is attempted in two different ways:
680	 * 1. If we have additional DSR fields about leading/trailing WS
681	 *    (represented by $state->haveTrimmedWsDSR), that info is used.
682	 * 2. If not, we simply inspect source at $dsr->innerEnd and if it
683	 *    happens to be whitespace, we use that.
684	 *
685	 * @param Node $node
686	 * @return ?string
687	 */
688	private function fetchTrailingTrimmedSpace( Node $node ): ?string {
689		$origNode = $node;
690		$parentNode = $node->parentNode;
691
692		// Skip past the artificial span wrapper
693		if ( $parentNode instanceof Element && $parentNode->hasAttribute( 'data-mw-selser-wrapper' ) ) {
694			$node = $parentNode;
695			$parentNode = $parentNode->parentNode;
696		}
697
698		// Trailing trimmed whitespace only makes sense for last child.
699		// Ignore comments (which are part of separators) + deletion markers.
700		if ( DOMUtils::nextNonSepSibling( $node ) ) {
701			return null;
702		}
703
704		$sep = null;
705		'@phan-var Element|DocumentFragment $parentNode'; // @var Element|DocumentFragment $parentNode
706		if ( isset( WikitextConstants::$WikitextTagsWithTrimmableWS[DOMCompat::nodeName( $parentNode )] ) &&
707			( $origNode instanceof Element || !preg_match( '/[ \t]$/', $origNode->nodeValue ) )
708		) {
709			// Don't reintroduce whitespace that's already been captured as a DisplaySpace
710			if ( DOMUtils::hasTypeOf( $origNode, 'mw:DisplaySpace' ) ) {
711				return null;
712			}
713
714			$state = $this->state;
715			$dsr = DOMDataUtils::getDataParsoid( $parentNode )->dsr ?? null;
716			if ( Utils::isValidDSR( $dsr, true ) ) {
717				if ( $state->haveTrimmedWsDSR && (
718					$dsr->trailingWS > 0 || ( $dsr->trailingWS === 0 && $dsr->leadingWS > 0 )
719				) ) {
720					$sep = $state->getOrigSrc( $dsr->innerEnd() - $dsr->trailingWS, $dsr->innerEnd() ) ?? '';
721					if ( !preg_match( '/^[ \t]*$/', $sep ) ) {
722						$sep = null;
723					}
724				} else {
725					$offset = $dsr->innerEnd() - 1;
726					// The > instead of >= is to deal with an edge case
727					// = = where that single space is captured by the
728					// getLeadingSpace case above
729					if ( $offset > $dsr->innerStart() ) {
730						$sep = $state->getOrigSrc( $offset, $offset + 1 ) ?? '';
731						if ( !preg_match( '/[ \t]/', $sep ) ) {
732							$sep = null;
733						}
734					}
735				}
736			}
737		}
738
739		return $sep;
740	}
741
742	/**
743	 * Emit a separator based on the collected (and merged) constraints
744	 * and existing separator text. Called when new output is triggered.
745	 * @param Node $node
746	 * @param bool $leading
747	 *   if true, trimmed leading whitespace is emitted
748	 *   if false, trimmed railing whitespace is emitted
749	 * @return string|null
750	 */
751	public function recoverTrimmedWhitespace( Node $node, bool $leading ): ?string {
752		// Deal with scenarios where leading / trailing whitespace were trimmed.
753		// We now need to figure out if we need to add any leading / trailing WS back.
754		if ( $this->state->useWhitespaceHeuristics && $this->state->selserMode ) {
755			if ( $leading ) {
756				return $this->fetchLeadingTrimmedSpace( $node );
757			} else {
758				$lastChild = DOMUtils::lastNonDeletedChild( $node );
759				return $lastChild ? $this->fetchTrailingTrimmedSpace( $lastChild ) : null;
760			}
761		}
762
763		return null;
764	}
765
766	/**
767	 * Emit a separator based on the collected (and merged) constraints
768	 * and existing separator text. Called when new output is triggered.
769	 * @param Node $node
770	 * @return string|null
771	 */
772	public function buildSep( Node $node ): ?string {
773		$state = $this->state;
774		$sepType = $state->sep->constraints['constraintInfo']['sepType'] ?? null;
775		$sep = null;
776		$origNode = $node;
777		$prevNode = $state->sep->lastSourceNode;
778		$dsrA = null;
779		$dsrB = null;
780
781		/* ----------------------------------------------------------------------
782		 * Assuming we have access to the original source, we can use DSR offsets
783		 * to extract separators from source only if:
784		 * - we are in selser mode AND
785		 * - this node is not part of a newly inserted subtree (marked 'modified')
786		 *   for which DSR isn't available
787		 * - neither node is adjacent to a deleted block node
788		 *   (see the long comment in SerializerState::emitChunk in the middle)
789		 *
790		 * In other scenarios, DSR values on "adjacent" nodes in the edited DOM
791		 * may not reflect deleted content between them.
792		 * ---------------------------------------------------------------------- */
793		$origSepNeeded = $node !== $prevNode && $state->selserMode;
794		$origSepNeededAndUsable =
795			$origSepNeeded && !$state->inModifiedContent &&
796			!WTSUtils::nextToDeletedBlockNodeInWT( $prevNode, true ) &&
797			!WTSUtils::nextToDeletedBlockNodeInWT( $node, false ) &&
798			WTSUtils::origSrcValidInEditedContext( $state->getEnv(), $prevNode ) &&
799			WTSUtils::origSrcValidInEditedContext( $state->getEnv(), $node );
800
801		if ( $origSepNeededAndUsable ) {
802			if ( $prevNode instanceof Element ) {
803				$dsrA = self::handleAutoInserted( $prevNode );
804			} elseif ( !( $prevNode instanceof DocumentFragment ) ) {
805				// Check if $prevNode is the last child of a zero-width element,
806				// and use that for dsr purposes instead. Typical case: text in p.
807				if (
808					!$prevNode->nextSibling &&
809					$prevNode->parentNode !== $node &&
810					$prevNode->parentNode instanceof Element &&
811					( DOMDataUtils::getDataParsoid( $prevNode->parentNode )->dsr->closeWidth ?? null ) === 0
812				) {
813					$dsrA = self::handleAutoInserted( $prevNode->parentNode );
814				} elseif (
815					// Can we extrapolate DSR from $prevNode->previousSibling?
816					// Yes, if $prevNode->parentNode didn't have its children edited.
817					$prevNode->previousSibling instanceof Element &&
818					!DiffUtils::directChildrenChanged( $prevNode->parentNode, $this->env )
819				) {
820					$endDsr = DOMDataUtils::getDataParsoid( $prevNode->previousSibling )->dsr->end ?? null;
821					$correction = null;
822					if ( is_int( $endDsr ) ) {
823						if ( DOMUtils::isComment( $prevNode ) ) {
824							'@phan-var Comment $prevNode'; // @var Comment $prevNode
825							$correction = WTUtils::decodedCommentLength( $prevNode );
826						} else {
827							$correction = strlen( $prevNode->nodeValue );
828						}
829						$dsrA = new DomSourceRange(
830							$endDsr,
831							$endDsr + $correction + WTUtils::indentPreDSRCorrection( $prevNode ),
832							0,
833							0
834						);
835					}
836				}
837			}
838
839			if ( !$dsrA ) {
840				// nothing to do -- no reason to compute dsrB if dsrA is null
841			} elseif ( $node instanceof Element ) {
842				// $node is parent of $prevNode
843				if ( $prevNode->parentNode === $node ) {
844					'@phan-var Element|DocumentFragment $node'; // @var Element|DocumentFragment $node
845					// FIXME: Maybe we shouldn't set dsr in the dsr pass if both aren't valid?
846					//
847					// When we are in the lastChild sep scenario and the parent doesn't have
					// usable dsr, if possible, walk up the ancestor nodes till we find
849					// a dsr-bearing node
850					//
851					// This fix is needed to handle trailing newlines in this wikitext:
852					// [[File:foo.jpg|thumb|300px|foo\n{{1x|A}}\n{{1x|B}}\n{{1x|C}}\n\n]]
853					while (
854						!$node->nextSibling &&
855						!DOMUtils::atTheTop( $node ) &&
856						(
857							empty( DOMDataUtils::getDataParsoid( $node )->dsr ) ||
858							DOMDataUtils::getDataParsoid( $node )->dsr->start === null ||
859							DOMDataUtils::getDataParsoid( $node )->dsr->end === null
860						)
861					) {
862						$node = $node->parentNode;
863					}
864				}
865
866				// The top node could be a document fragment
867				$dsrB = $node instanceof Element ? self::handleAutoInserted( $node ) : null;
868			} elseif ( !( $node instanceof DocumentFragment ) ) {
869				// $node is text/comment. Can we extrapolate DSR from $node->parentNode?
870				// Yes, if this is the child of a zero-width element and
871				// is only preceded by separator elements.
872				//
873				// 1. text in p.
874				// 2. ws-only child of a node with auto-inserted start tag
875				//    Ex: "<span> <s>x</span> </s>" --> <span> <s>x</s*></span><s*> </s>
876				// 3. ws-only children of a node with auto-inserted start tag
877				//    Ex: "{|\n|-\n <!--foo--> \n|}"
878				$nodeParent = $node->parentNode;
879				// phpcs:ignore Generic.Files.LineLength.TooLong
880				'@phan-var Element|DocumentFragment $nodeParent'; // @var Element|DocumentFragment $nodeParent
881
882				if (
883					$nodeParent !== $prevNode &&
884					$nodeParent instanceof Element &&
885					( DOMDataUtils::getDataParsoid( $nodeParent )->dsr->openWidth ?? null ) === 0
886				) {
887					$sepLen = self::precedingSeparatorTextLen( $node );
888					if ( $sepLen !== null ) {
889						$dsrB = DOMDataUtils::getDataParsoid( $nodeParent )->dsr;
890						if ( is_int( $dsrB->start ) && $sepLen > 0 ) {
891							$dsrB = clone $dsrB;
892							$dsrB->start += $sepLen;
893						}
894					}
895				}
896			}
897
898			// FIXME: Maybe we shouldn't set dsr in the dsr pass if both aren't valid?
899			if ( Utils::isValidDSR( $dsrA ) && Utils::isValidDSR( $dsrB ) ) {
900				// Figure out containment relationship
901				if ( $dsrA->start <= $dsrB->start ) {
902					if ( $dsrB->end <= $dsrA->end ) {
903						if ( $dsrA->start === $dsrB->start && $dsrA->end === $dsrB->end ) {
904							// Both have the same dsr range, so there can't be any
905							// separators between them
906							$sep = '';
907						} elseif ( ( $dsrA->openWidth ?? null ) !== null ) {
908							// B in A, from parent to child
909							$sep = $state->getOrigSrc( $dsrA->innerStart(), $dsrB->start );
910						}
911					} elseif ( $dsrA->end <= $dsrB->start ) {
912						// B following A (siblingish)
913						$sep = $state->getOrigSrc( $dsrA->end, $dsrB->start );
914					} elseif ( ( $dsrB->closeWidth ?? null ) !== null ) {
915						// A in B, from child to parent
916						$sep = $state->getOrigSrc( $dsrA->end, $dsrB->innerEnd() );
917					}
918				} elseif ( $dsrA->end <= $dsrB->end ) {
919					if ( ( $dsrB->closeWidth ?? null ) !== null ) {
920						// A in B, from child to parent
921						$sep = $state->getOrigSrc( $dsrA->end, $dsrB->innerEnd() );
922					}
923				} else {
924					$this->env->log( 'info/html2wt', 'dsr backwards: should not happen!' );
925				}
926
927				// Reset if $sep is invalid
928				if ( $sep && !WTSUtils::isValidSep( $sep ) ) {
929					$sep = null;
930				}
931			}
932		} elseif ( $origSepNeeded && !DiffUtils::hasDiffMarkers( $prevNode, $this->env ) ) {
933			// Given the following conditions:
934			// - $prevNode has no diff markers. (checked above)
935			// - $prevNode's next non-sep sibling ($next) was inserted.
936			// - $next is an ancestor of $node.
937			// - all of those ancestor nodes from $node->$next have zero-width
938			//   wikitext (otherwise, the separator isn't usable)
939			// Try to extract a separator from original source that existed
940			// between $prevNode and its original next sibling or its parent
941			// (if $prevNode was the last non-sep child).
942			//
943			// This minimizes dirty-diffs to that separator text from
944			// the insertion of $next after $prevNode.
945			$next = DOMUtils::nextNonSepSibling( $prevNode );
946			$origSepUsable = $next && DiffUtils::hasInsertedDiffMark( $next, $this->env );
947
948			// Check that $next is an ancestor of $node and all nodes
949			// on that path have zero-width wikitext
950			if ( $origSepUsable && $node !== $next ) {
951				$n = $node->parentNode;
952				while ( $n && $next !== $n ) {
953					if ( !WTUtils::isZeroWidthWikitextElt( $n ) ) {
954						$origSepUsable = false;
955						break;
956					}
957					$n = $n->parentNode;
958				}
959				$origSepUsable = $origSepUsable && $n !== null;
960			}
961
962			// Extract separator from original source if possible
963			if ( $origSepUsable ) {
964				$origNext = DOMUtils::nextNonSepSibling( $next );
965				if ( !$origNext ) { // $prevNode was last non-sep child of its parent
966					// We could work harder for text/comments and extrapolate, but skipping that here
967					// FIXME: If we had a generic DSR extrapolation utility, that would be useful
968					$o1 = $prevNode instanceof Element ?
969						DOMDataUtils::getDataParsoid( $prevNode )->dsr->end ?? null : null;
970					if ( $o1 !== null ) {
971						$dsr2 = DOMDataUtils::getDataParsoid( $prevNode->parentNode )->dsr ?? null;
972						$o2 = $dsr2 ? $dsr2->innerEnd() : null;
973						$sep = $o2 !== null ? $state->getOrigSrc( $o1, $o2 ) : null;
974					}
975				} elseif ( !DiffUtils::hasDiffMarkers( $origNext, $this->env ) ) {
976					// We could work harder for text/comments and extrapolate, but skipping that here
977					// FIXME: If we had a generic DSR extrapolation utility, that would be useful
978					$o1 = $prevNode instanceof Element ?
979						DOMDataUtils::getDataParsoid( $prevNode )->dsr->end ?? null : null;
980					if ( $o1 !== null ) {
981						$o2 = $origNext instanceof Element ?
982							DOMDataUtils::getDataParsoid( $origNext )->dsr->start ?? null : null;
983						$sep = $o2 !== null ? $state->getOrigSrc( $o1, $o2 ) : null;
984					}
985				}
986
987				if ( $sep !== null ) {
988					// Since this is an inserted node, we might have to augment this
989					// with newline constraints and so, we just set this recovered sep
990					// to the buffered sep in state->sep->src
991					$state->sep->src = $sep;
992					$sep = null;
993				}
994			}
995		}
996
997		// If all efforts failed, use special-purpose heuristics to recover
998		// trimmed leading / trailing whitespace from lists, headings, table-cells
999		if ( $sep === null ) {
1000			if ( $sepType === 'parent-child' ) {
1001				$sep = $this->recoverTrimmedWhitespace( $node, true );
1002				if ( $sep !== null ) {
1003					$state->sep->src = $sep . $state->sep->src;
1004				}
1005			} elseif ( $sepType === 'child-parent' ) {
1006				$sep = $this->recoverTrimmedWhitespace( $node, false );
1007				if ( $sep !== null ) {
1008					$state->sep->src .= $sep;
1009				}
1010			} else {
1011				$sep = null;
1012			}
1013		}
1014
1015		$this->env->log(
1016			'debug/wts/sep',
1017			static function () use ( $prevNode, $origNode, $sep, $state ) {
1018				return 'maybe-sep  | ' .
1019					'prev:' . ( $prevNode ? DOMCompat::nodeName( $prevNode ) : '--none--' ) .
1020					', node:' . DOMCompat::nodeName( $origNode ) .
1021					', sep: ' . PHPUtils::jsonEncode( $sep ) .
1022					', state.sep.src: ' . PHPUtils::jsonEncode( $state->sep->src ?? null );
1023			}
1024		);
1025
1026		// If the separator is being emitted before a node that emits sol-transparent WT,
1027		// go through makeSeparator to verify indent-pre constraints are met.
1028		$sepConstraints = $state->sep->constraints ?? [ 'max' => 0 ];
1029		if ( $sep === null || ( $state->sep->src && $state->sep->src !== $sep ) ) {
1030			if ( !empty( $state->sep->constraints ) || !empty( $state->sep->src ) ) {
1031				// TODO: set modified flag if start or end node (but not both) are
1032				// modified / new so that the selser can use the separator
1033				$sep = $this->makeSeparator( $node, $state->sep->src ?? '', $sepConstraints );
1034			} else {
1035				$sep = null;
1036			}
1037		}
1038
1039		if ( $sep !== null ) {
1040			$sep = self::makeSepIndentPreSafe( $sep, $sepConstraints );
1041		}
1042		return $sep;
1043	}
1044}
1045