1<?php
2
3namespace Wikimedia\Parsoid\Html2Wt;
4
5use Closure;
6use DOMElement;
7use DOMNode;
8use Exception;
9use stdClass;
10use Wikimedia\Assert\Assert;
11use Wikimedia\Parsoid\Config\Env;
12use Wikimedia\Parsoid\Config\WikitextConstants;
13use Wikimedia\Parsoid\Html2Wt\ConstrainedText\ConstrainedText;
14use Wikimedia\Parsoid\Html2Wt\DOMHandlers\DOMHandler;
15use Wikimedia\Parsoid\Html2Wt\DOMHandlers\DOMHandlerFactory;
16use Wikimedia\Parsoid\Tokens\KV;
17use Wikimedia\Parsoid\Tokens\TagTk;
18use Wikimedia\Parsoid\Tokens\Token;
19use Wikimedia\Parsoid\Utils\ContentUtils;
20use Wikimedia\Parsoid\Utils\DOMCompat;
21use Wikimedia\Parsoid\Utils\DOMDataUtils;
22use Wikimedia\Parsoid\Utils\DOMUtils;
23use Wikimedia\Parsoid\Utils\PHPUtils;
24use Wikimedia\Parsoid\Utils\TokenUtils;
25use Wikimedia\Parsoid\Utils\Utils;
26use Wikimedia\Parsoid\Utils\WTUtils;
27
28/**
 * HTML to wikitext serializer.
30 * Serializes a chunk of tokens or an HTML DOM to MediaWiki's wikitext flavor.
31 *
32 * This serializer is designed to eventually
33 * - accept arbitrary HTML and
34 * - serialize that to wikitext in a way that round-trips back to the same
35 *   HTML DOM as far as possible within the limitations of wikitext.
36 *
37 * Not much effort has been invested so far on supporting
38 * non-Parsoid/VE-generated HTML. Some of this involves adaptively switching
39 * between wikitext and HTML representations based on the values of attributes
40 * and DOM context. A few special cases are already handled adaptively
41 * (multi-paragraph list item contents are serialized as HTML tags for
42 * example, generic A elements are serialized to HTML A tags), but in general
43 * support for this is mostly missing.
44 *
45 * Example issue:
46 * ```
47 * <h1><p>foo</p></h1> will serialize to =\nfoo\n= whereas the
48 *        correct serialized output would be: =<p>foo</p>=
49 * ```
50 *
51 * What to do about this?
52 * - add a generic 'can this HTML node be serialized to wikitext in this
53 *   context' detection method and use that to adaptively switch between
54 *   wikitext and HTML serialization.
55 *
56 */
57class WikitextSerializer {
58
	/** @var bool[] attribute name => true; attributes that are never serialized */
60	private const IGNORED_ATTRIBUTES = [
61		'data-parsoid' => true,
62		'data-ve-changed' => true,
63		'data-parsoid-changed' => true,
64		'data-parsoid-diff' => true,
65		'data-parsoid-serialize' => true,
66		DOMDataUtils::DATA_OBJECT_ATTR_NAME => true,
67	];
68
69	/** @var string[] attribute name => value regexp */
70	private const PARSOID_ATTRIBUTES = [
71		'about' => '/^#mwt\d+$/D',
72		'typeof' => '/(^|\s)mw:[^\s]+/',
73	];
74
75	// PORT-FIXME do different whitespace semantics matter?
76
77	/** @var string Regexp */
78	private const TRAILING_COMMENT_OR_WS_AFTER_NL_REGEXP
79		= '/\n(\s|' . Utils::COMMENT_REGEXP_FRAGMENT . ')*$/D';
80
81	/** @var string Regexp */
82	private const FORMATSTRING_REGEXP =
83		'/^(\n)?(\{\{ *_+)(\n? *\|\n? *_+ *= *)(_+)(\n? *\}\})(\n)?$/D';
84
	/** @var string Regexp matching a string made up only of whitespace and/or comments */
86	private const COMMENT_OR_WS_REGEXP = '/^(\s|' . Utils::COMMENT_REGEXP_FRAGMENT . ')*$/D';
87
88	/** @var string Regexp for testing whether nowiki added around heading-like wikitext is needed */
89	private const HEADING_NOWIKI_REGEXP = '/^(?:' . Utils::COMMENT_REGEXP_FRAGMENT . ')*'
90		. '<nowiki>(=+[^=]+=+)<\/nowiki>(.+)$/D';
91
	/** @var string[] regexp name => regexp */
93	private static $separatorREs = [
94		'pureSepRE' => '/^[ \t\r\n]*$/D',
95		'sepPrefixWithNlsRE' => '/^[ \t]*\n+[ \t\r\n]*/',
96		'sepSuffixWithNlsRE' => '/\n[ \t\r\n]*$/D',
97	];
98
99	/** @var WikitextEscapeHandlers */
100	public $wteHandlers;
101
102	/** @var Env */
103	public $env;
104
105	/** @var SerializerState */
106	private $state;
107
108	/** @var Separators */
109	private $separators;
110
111	/**
112	 * @var array
113	 *   - env: (Env)
114	 *   - rtTestMode: (boolean)
115	 *   - logType: (string)
116	 */
117	private $options;
118
119	/** @var string Log type for trace() */
120	private $logType;
121
122	/**
123	 * @param array $options List of options for serialization:
124	 *   - env: (Env) (required)
125	 *   - rtTestMode: (boolean)
126	 *   - logType: (string)
127	 */
128	public function __construct( $options ) {
129		$this->env = $options['env'];
130		$this->options = array_merge( $options, [
131			'rtTestMode' => $this->env->getSiteConfig()->rtTestMode(),
132			'logType' => 'trace/wts',
133		] );
134		$this->logType = $this->options['logType'];
135		$this->state = new SerializerState( $this, $this->options );
136		$this->separators = new Separators( $this->env, $this->state );
137		$this->wteHandlers = new WikitextEscapeHandlers( $this->options );
138	}
139
140	/**
141	 * Main link handler.
142	 * @param DOMElement $node
143	 * Used in multiple tag handlers (<a> and <link>), and hence added as top-level method
144	 * PORT-TODO: rename to something like handleLink()?
145	 */
146	public function linkHandler( DOMElement $node ): void {
147		LinkHandlerUtils::linkHandler( $this->state, $node );
148	}
149
150	/**
151	 * Main figure handler.
152	 *
153	 * All figures have a fixed structure:
154	 * ```
155	 * <figure or figure-inline typeof="mw:Image...">
156	 *  <a or span><img ...><a or span>
157	 *  <figcaption>....</figcaption>
158	 * </figure or figure-inline>
159	 * ```
160	 * Pull out this fixed structure, being as generous as possible with
161	 * possibly-broken HTML.
162	 *
163	 * @param DOMElement $node
164	 * Used in multiple tag handlers(<figure> and <a>.linkHandler above), and hence added as
165	 * top-level method
166	 * PORT-TODO: rename to something like handleFigure()?
167	 */
168	public function figureHandler( DOMElement $node ): void {
169		LinkHandlerUtils::figureHandler( $this->state, $node );
170	}
171
172	/**
173	 * @param DOMElement $node
174	 * @return void
175	 */
176	public function languageVariantHandler( DOMNode $node ): void {
177		LanguageVariantHandler::handleLanguageVariant( $this->state, $node );
178	}
179
180	/**
181	 * Figure out separator constraints and merge them with existing constraints
182	 * in state so that they can be emitted when the next content emits source.
183	 * @param DOMNode $nodeA
184	 * @param DOMHandler $handlerA
185	 * @param DOMNode $nodeB
186	 * @param DOMHandler $handlerB
187	 */
188	public function updateSeparatorConstraints(
189		DOMNode $nodeA, DOMHandler $handlerA, DOMNode $nodeB, DOMHandler $handlerB
190	): void {
191		$this->separators->updateSeparatorConstraints( $nodeA, $handlerA, $nodeB, $handlerB );
192	}
193
194	/**
195	 * Emit a separator based on the collected (and merged) constraints
196	 * and existing separator text. Called when new output is triggered.
197	 * @param DOMNode $node
198	 * @return string|null
199	 */
200	public function buildSep( DOMNode $node ): ?string {
201		return $this->separators->buildSep( $node );
202	}
203
204	/**
205	 * Escape wikitext-like strings in '$text' so that $text renders as a plain string
206	 * when rendered as HTML. The escaping is done based on the context in which $text
207	 * is present (ex: start-of-line, in a link, etc.)
208	 *
209	 * @param SerializerState $state
210	 * @param string $text
211	 * @param array $opts
212	 *   - node: (DOMNode)
213	 *   - isLastChild: (bool)
214	 * @return string
215	 */
216	public function escapeWikiText( SerializerState $state, string $text, array $opts ): string {
217		return $this->wteHandlers->escapeWikitext( $state, $text, $opts );
218	}
219
220	/**
221	 * @param array $opts
222	 * @param DOMElement $elt
223	 * @return ConstrainedText|string
224	 */
225	public function domToWikitext( array $opts, DOMElement $elt ) {
226		$opts['logType'] = $this->logType;
227		$serializer = new WikitextSerializer( $opts );
228		return $serializer->serializeDOM( $elt );
229	}
230
231	/**
232	 * @param array $opts
233	 * @param string $html
234	 * @return ConstrainedText|string
235	 */
236	public function htmlToWikitext( array $opts, string $html ) {
237		$body = ContentUtils::ppToDOM( $this->env, $html, [ 'markNew' => true ] );
238		return $this->domToWikitext( $opts, $body );
239	}
240
241	/**
242	 * @param DOMElement $node
243	 * @param string $key
244	 * @return string
245	 */
246	public function getAttributeKey( DOMElement $node, string $key ): string {
247		$tplAttrs = DOMDataUtils::getDataMw( $node )->attribs ?? [];
248		foreach ( $tplAttrs as $attr ) {
249			// If this attribute's key is generated content,
250			// serialize HTML back to generator wikitext.
251			// PORT-FIXME: bool check might not be safe. Need documentation on attrib format.
252			if ( ( $attr[0]->txt ?? null ) === $key && isset( $attr[0]->html ) ) {
253				return $this->htmlToWikitext( [
254					'env' => $this->env,
255					'onSOL' => false,
256				], $attr[0]->html );
257			}
258		}
259		return $key;
260	}
261
262	/**
263	 * @param DOMElement $node
264	 * @param string $key Attribute name.
265	 * @param mixed $value Fallback value to use if the attibute is not present.
266	 * @return ConstrainedText|string
267	 */
268	public function getAttributeValue( DOMElement $node, string $key, $value ) {
269		$tplAttrs = DOMDataUtils::getDataMw( $node )->attribs ?? [];
270		foreach ( $tplAttrs as $attr ) {
271			// If this attribute's value is generated content,
272			// serialize HTML back to generator wikitext.
273			// PORT-FIXME: not type safe. Need documentation on attrib format.
274			if ( ( $attr[0] === $key || ( $attr[0]->txt ?? null ) === $key )
275				 // Only return here if the value is generated (ie. .html),
276				 // it may just be in .txt form.
277				 && isset( $attr[1]->html )
278				 // !== null is required. html:"" will serialize to "" and
279				 // will be returned here. This is used to suppress the =".."
280				 // string in the attribute in scenarios where the template
281				 // generates a "k=v" string.
282				 // Ex: <div {{1x|1=style='color:red'}}>foo</div>
283				 && $attr[1]->html !== null
284			) {
285				return $this->htmlToWikitext( [
286					'env' => $this->env,
287					'onSOL' => false,
288					'inAttribute' => true,
289				], $attr[1]->html );
290			}
291		}
292		return $value;
293	}
294
295	/**
296	 * @param DOMElement $node
297	 * @param string $key
298	 * @return array|null A tuple in {@link WTSUtils::getShadowInfo()} format,
299	 *   with an extra 'fromDataMW' flag.
300	 */
301	public function getAttributeValueAsShadowInfo( DOMElement $node, string $key ): ?array {
302		$v = $this->getAttributeValue( $node, $key, null );
303		if ( $v === null ) {
304			return $v;
305		}
306		return [
307			'value' => $v,
308			'modified' => false,
309			'fromsrc' => true,
310			'fromDataMW' => true,
311		];
312	}
313
314	/**
315	 * @param DOMElement $dataMWnode
316	 * @param DOMElement $htmlAttrNode
317	 * @param string $key
318	 * @return array A tuple in {@link WTSUtils::getShadowInfo()} format,
319	 *   possibly with an extra 'fromDataMW' flag.
320	 */
321	public function serializedImageAttrVal(
322		DOMElement $dataMWnode, DOMElement $htmlAttrNode, string $key
323	): array {
324		$v = $this->getAttributeValueAsShadowInfo( $dataMWnode, $key );
325		return $v ?: WTSUtils::getAttributeShadowInfo( $htmlAttrNode, $key );
326	}
327
328	/**
329	 * @param DOMElement $node
330	 * @param string $name
331	 * @return array
332	 */
333	public function serializedAttrVal( DOMElement $node, string $name ): array {
334		return $this->serializedImageAttrVal( $node, $node, $name );
335	}
336
337	/**
338	 * @param DOMElement $node
339	 * @param bool $wrapperUnmodified
340	 * @return string
341	 */
342	public function serializeHTMLTag( DOMElement $node, bool $wrapperUnmodified ): string {
343		// TODO(arlolra): As of 1.3.0, html pre is considered an extension
344		// and wrapped in encapsulation.  When that version is no longer
345		// accepted for serialization, we can remove this backwards
346		// compatibility code.
347		//
348		// 'inHTMLPre' flag has to be updated always,
349		// even when we are selsering in the wrapperUnmodified case.
350		$token = WTSUtils::mkTagTk( $node );
351		if ( $token->getName() === 'pre' ) {
352			// html-syntax pre is very similar to nowiki
353			$this->state->inHTMLPre = true;
354		}
355
356		if ( $wrapperUnmodified ) {
357			$dsr = DOMDataUtils::getDataParsoid( $node )->dsr;
358			return $this->state->getOrigSrc( $dsr->start, $dsr->innerStart() ) ?? '';
359		}
360
361		$da = $token->dataAttribs;
362		if ( !empty( $da->autoInsertedStart ) ) {
363			return '';
364		}
365
366		$close = '';
367		if ( ( Utils::isVoidElement( $token->getName() ) && empty( $da->noClose ) ) ||
368			!empty( $da->selfClose )
369		) {
370			$close = ' /';
371		}
372
373		$sAttribs = $this->serializeAttributes( $node, $token );
374		if ( strlen( $sAttribs ) > 0 ) {
375			$sAttribs = ' ' . $sAttribs;
376		}
377
378		// srcTagName cannot be '' so, it is okay to use ?? operator
379		$tokenName = $da->srcTagName ?? $token->getName();
380		$ret = "<{$tokenName}{$sAttribs}{$close}>";
381
382		if ( strtolower( $tokenName ) === 'nowiki' ) {
383			$ret = WTUtils::escapeNowikiTags( $ret );
384		}
385
386		return $ret;
387	}
388
389	/**
390	 * @param DOMElement $node
391	 * @param bool $wrapperUnmodified
392	 * @return string
393	 */
394	public function serializeHTMLEndTag( DOMElement $node, $wrapperUnmodified ): string {
395		if ( $wrapperUnmodified ) {
396			$dsr = DOMDataUtils::getDataParsoid( $node )->dsr;
397			return $this->state->getOrigSrc( $dsr->innerEnd(), $dsr->end ) ?? '';
398		}
399
400		$token = WTSUtils::mkEndTagTk( $node );
401		if ( $token->getName() === 'pre' ) {
402			$this->state->inHTMLPre = false;
403		}
404
405		// srcTagName cannot be '' so, it is okay to use ?? operator
406		$tokenName = $token->dataAttribs->srcTagName ?? $token->getName();
407		$ret = '';
408
409		if ( empty( $token->dataAttribs->autoInsertedEnd )
410			&& !Utils::isVoidElement( $token->getName() )
411			&& empty( $token->dataAttribs->selfClose )
412		) {
413			$ret = "</{$tokenName}>";
414		}
415
416		if ( strtolower( $tokenName ) === 'nowiki' ) {
417			$ret = WTUtils::escapeNowikiTags( $ret );
418		}
419
420		return $ret;
421	}
422
423	/**
424	 * @param DOMElement $node
425	 * @param Token $token
426	 * @param bool $isWt
427	 * @return string
428	 */
429	public function serializeAttributes( DOMElement $node, Token $token, bool $isWt = false ): string {
430		$attribs = $token->attribs;
431
432		$out = [];
433		foreach ( $attribs as $kv ) {
434			$k = $kv->k;
435			$v = null;
436			$vInfo = null;
437
438			// Unconditionally ignore
439			// (all of the IGNORED_ATTRIBUTES should be filtered out earlier,
440			// but ignore them here too just to make sure.)
441			if ( isset( self::IGNORED_ATTRIBUTES[$k] ) || $k === 'data-mw' ) {
442				continue;
443			}
444
445			// Ignore parsoid-like ids. They may have been left behind
446			// by clients and shouldn't be serialized. This can also happen
447			// in v2/v3 API when there is no matching data-parsoid entry found
448			// for this id.
449			if ( $k === 'id' && preg_match( '/^mw[\w-]{2,}$/D', $kv->v ) ) {
450				if ( WTUtils::isNewElt( $node ) ) {
451					$this->env->log( 'warn/html2wt',
452						'Parsoid id found on element without a matching data-parsoid '
453						. 'entry: ID=' . $kv->v . '; ELT=' . DOMCompat::getOuterHTML( $node )
454					);
455				} else {
456					$vInfo = $token->getAttributeShadowInfo( $k );
457					if ( !$vInfo['modified'] && $vInfo['fromsrc'] ) {
458						$out[] = $k . '=' . '"' . preg_replace( '/"/', '&quot;', $vInfo['value'] ) . '"';
459					}
460				}
461				continue;
462			}
463
464			// Parsoid auto-generates ids for headings and they should
465			// be stripped out, except if this is not auto-generated id.
466			if ( $k === 'id' && preg_match( '/h[1-6]/', $node->nodeName ) ) {
467				if ( !empty( DOMDataUtils::getDataParsoid( $node )->reusedId ) ) {
468					$vInfo = $token->getAttributeShadowInfo( $k );
469					// PORT-FIXME: is this safe? value could be a token or token array
470					$out[] = $k . '=' . '"' . preg_replace( '/"/', '&quot;', $vInfo['value'] ) . '"';
471				}
472				continue;
473			}
474
475			// Strip Parsoid-inserted class="mw-empty-elt" attributes
476			if ( $k === 'class'
477				 && isset( WikitextConstants::$Output['FlaggedEmptyElts'][$node->nodeName] )
478			) {
479				$kv->v = preg_replace( '/\bmw-empty-elt\b/', '', $kv->v, 1 );
480				if ( !$kv->v ) {
481					continue;
482				}
483			}
484
485			// Strip other Parsoid-generated values
486			//
487			// FIXME: Given that we are currently escaping about/typeof keys
488			// that show up in wikitext, we could unconditionally strip these
489			// away right now.
490			$parsoidValueRegExp = self::PARSOID_ATTRIBUTES[$k] ?? null;
491			if ( $parsoidValueRegExp && preg_match( $parsoidValueRegExp, $kv->v ) ) {
492				$v = preg_replace( $parsoidValueRegExp, '', $kv->v );
493				if ( $v ) {
494					$out[] = $k . '=' . '"' . $v . '"';
495				}
496				continue;
497			}
498
499			if ( strlen( $k ) > 0 ) {
500				$vInfo = $token->getAttributeShadowInfo( $k );
501				$v = $vInfo['value'];
502				// Deal with k/v's that were template-generated
503				$kk = $this->getAttributeKey( $node, $k );
504				// Pass in kv.k, not k since k can potentially
505				// be original wikitext source for 'k' rather than
506				// the string value of the key.
507				$vv = $this->getAttributeValue( $node, $kv->k, $v );
508				// Remove encapsulation from protected attributes
509				// in pegTokenizer.pegjs:generic_newline_attribute
510				$kk = preg_replace( '/^data-x-/i', '', $kk, 1 );
511				// PORT-FIXME: is this type safe? $vv could be a ConstrainedText
512				if ( strlen( $vv ) > 0 ) {
513					if ( !$vInfo['fromsrc'] && !$isWt ) {
514						// Escape wikitext entities
515						$vv = preg_replace( '/>/', '&gt;', Utils::escapeWtEntities( $vv ) );
516					}
517					$out[] = $kk . '=' . '"' . preg_replace( '/"/', '&quot;', $vv ) . '"';
518				} elseif ( preg_match( '/[{<]/', $kk ) ) {
519					// Templated, <*include*>, or <ext-tag> generated
520					$out[] = $kk;
521				} else {
522					$out[] = $kk . '=""';
523				}
524				continue;
525			// PORT-FIXME: is this type safe? $k->v could be a Token or Token array
526			} elseif ( strlen( $kv->v ) ) {
527				// not very likely..
528				$out[] = $kv->v;
529			}
530		}
531
532		// SSS FIXME: It can be reasonably argued that we can permanently delete
533		// dangerous and unacceptable attributes in the interest of safety/security
534		// and the resultant dirty diffs should be acceptable.  But, this is
535		// something to do in the future once we have passed the initial tests
536		// of parsoid acceptance.
537		//
538		// 'a' data attribs -- look for attributes that were removed
539		// as part of sanitization and add them back
540		$dataAttribs = $token->dataAttribs;
541		if ( isset( $dataAttribs->a ) && isset( $dataAttribs->sa ) ) {
542			$aKeys = array_keys( $dataAttribs->a );
543			foreach ( $aKeys as $k ) {
544				// Attrib not present -- sanitized away!
545				if ( !KV::lookupKV( $attribs, $k ) ) {
546					$v = $dataAttribs->sa[$k] ?? null;
547					// PORT-FIXME check type
548					if ( $v !== null && $v !== '' ) {
549						$out[] = $k . '=' . '"' . preg_replace( '/"/', '&quot;', $v ) . '"';
550					} else {
551						// at least preserve the key
552						$out[] = $k;
553					}
554				}
555			}
556		}
557		// XXX: round-trip optional whitespace / line breaks etc
558		return implode( ' ', $out );
559	}
560
561	/**
562	 * @param DOMElement $node
563	 */
564	public function handleLIHackIfApplicable( DOMElement $node ): void {
565		$liHackSrc = DOMDataUtils::getDataParsoid( $node )->liHackSrc ?? null;
566		$prev = DOMUtils::previousNonSepSibling( $node );
567
568		// If we are dealing with an LI hack, then we must ensure that
569		// we are dealing with either
570		//
571		//   1. A node with no previous sibling inside of a list.
572		//
573		//   2. A node whose previous sibling is a list element.
574		if ( $liHackSrc !== null
575			// Case 1
576			&& ( ( $prev === null && DOMUtils::isList( $node->parentNode ) )
577				// Case 2
578				|| ( $prev !== null && DOMUtils::isListItem( $prev ) ) )
579		) {
580			$this->state->emitChunk( $liHackSrc, $node );
581		}
582	}
583
584	/**
585	 * @param string $format
586	 * @param string $value
587	 * @param bool $forceTrim
588	 * @return string
589	 */
590	private function formatStringSubst( string $format, string $value, bool $forceTrim ): string {
591		// PORT-FIXME: JS is more agressive and removes various unicode whitespaces
592		// (most notably nbsp). Does that matter?
593		if ( $forceTrim ) {
594			$value = trim( $value );
595		}
596		return preg_replace_callback( '/_+/', function ( $m ) use ( $value ) {
597			if ( $value === '' ) {
598				return $value;
599			}
600			$hole = $m[0];
601			$holeLen = strlen( $hole );
602			$valueLen = mb_strlen( $value );
603			return $holeLen <= $valueLen ? $value : $value . str_repeat( ' ', $holeLen - $valueLen );
604		}, $format, 1 );
605	}
606
607	/**
608	 * Generates a template parameter sort function that tries to preserve existing ordering
609	 * but also to follow the order prescribed by the templatedata.
610	 * @param array $dpArgInfo
611	 * @param array|null $tplData
612	 * @param array $dataMwKeys
613	 * @return Closure
614	 * PORT-FIXME: there's probably a better way to do this
615	 */
616	private function createParamComparator(
617		array $dpArgInfo, ?array $tplData, array $dataMwKeys
618	): Closure {
619		// Record order of parameters in new data-mw
620		$newOrder = array_map( function ( $key, $i ) {
621			return [ $key, [ 'order' => $i ] ];
622		}, $dataMwKeys, array_keys( $dataMwKeys ) );
623		// Record order of parameters in templatedata (if present)
624		$tplDataOrder = [];
625		$aliasMap = [];
626		$keys = [];
627		if ( $tplData && isset( $tplData['paramOrder'] ) ) {
628			foreach ( $tplData['paramOrder'] as $i => $key ) {
629				$tplDataOrder[$key] = [ 'order' => $i ];
630				$aliasMap[$key] = [ 'key' => $key, 'order' => -1 ];
631				$keys[] = $key;
632				// Aliases have the same sort order as the main name.
633				$aliases = $tplData['params'][$key]['aliases'] ?? [];
634				foreach ( $aliases as $j => $alias ) {
635					$aliasMap[$alias] = [ 'key' => $key, 'order' => $j ];
636				}
637			}
638		}
639		// Record order of parameters in original wikitext (from data-parsoid)
640		$origOrder = [];
641		foreach ( $dpArgInfo as $i => $argInfo ) {
642			$origOrder[$argInfo->k] = [ 'order' => $i, 'dist' => 0 ];
643		}
644		// Canonical parameter key gets the same order as an alias parameter
645		// found in the original wikitext.
646		foreach ( $dpArgInfo as $i => $argInfo ) {
647			$canon = $aliasMap[$argInfo->k] ?? null;
648			if ( $canon !== null && !array_key_exists( $canon['key'], $origOrder ) ) {
649				$origOrder[$canon['key']] = $origOrder[$argInfo->k];
650			}
651		}
652		// Find the closest "original parameter" for each templatedata parameter,
653		// so that newly-added parameters are placed near the parameters which
654		// templatedata says they should be adjacent to.
655		$nearestOrder = $origOrder;
656		$reduceF = function ( $acc, $val ) use ( &$origOrder, &$nearestOrder ) {
657			if ( isset( $origOrder[$val] ) ) {
658				$acc = $origOrder[$val];
659			}
660			if ( !( isset( $nearestOrder[$val] ) && $nearestOrder[$val]['dist'] < $acc['dist'] ) ) {
661				$nearestOrder[$val] = $acc;
662			}
663			return [ 'order' => $acc['order'], 'dist' => $acc['dist'] + 1 ];
664		};
665		// Find closest original parameter before the key.
666		// @phan-suppress-next-line PhanPluginUseReturnValueInternalKnown
667		array_reduce( $keys, $reduceF, [ 'order' => -1, 'dist' => 2 * count( $keys ) ] );
668		// Find closest original parameter after the key.
669		// @phan-suppress-next-line PhanPluginUseReturnValueInternalKnown
670		array_reduce( array_reverse( $keys ), $reduceF,
671			[ 'order' => count( $origOrder ), 'dist' => count( $keys ) ] );
672
673		// Helper function to return a large number if the given key isn't
674		// in the sort order map
675		$big = max( count( $nearestOrder ), count( $newOrder ) );
676		$defaultGet = function ( $map, $key1, $key2 = null ) use ( &$big ) {
677			$key = ( !$key2 || isset( $map[$key1] ) ) ? $key1 : $key2;
678			return $map[$key]['order'] ?? $big;
679		};
680
681		return function ( $a, $b ) use (
682			&$aliasMap, &$defaultGet, &$nearestOrder, &$tplDataOrder, &$newOrder
683		) {
684			$aCanon = $aliasMap[$a] ?? [ 'key' => $a, 'order' => -1 ];
685			$bCanon = $aliasMap[$b] ?? [ 'key' => $b, 'order' => -1 ];
686			// primary key is `nearestOrder` (nearest original parameter)
687			$aOrder = $defaultGet( $nearestOrder, $a, $aCanon['key'] );
688			$bOrder = $defaultGet( $nearestOrder, $b, $bCanon['key'] );
689			if ( $aOrder !== $bOrder ) {
690				return $aOrder - $bOrder;
691			}
692			// secondary key is templatedata order
693			if ( $aCanon['key'] === $bCanon['key'] ) {
694				return $aCanon['order'] - $bCanon['order'];
695			}
696			$aOrder = $defaultGet( $tplDataOrder, $aCanon['key'] );
697			$bOrder = $defaultGet( $tplDataOrder, $bCanon['key'] );
698			if ( $aOrder !== $bOrder ) {
699				return $aOrder - $bOrder;
700			}
701			// tertiary key is original input order (makes sort stable)
702			$aOrder = $defaultGet( $newOrder, $a );
703			$bOrder = $defaultGet( $newOrder, $b );
704			return $aOrder - $bOrder;
705		};
706	}
707
708	/**
709	 * Serialize part of a templatelike expression.
710	 * @param SerializerState $state
711	 * @param string $buf
712	 * @param DOMElement $node
713	 * @param string $type The type of the part to be serialized. One of template, templatearg,
714	 *   parserfunction.
715	 * @param stdClass $part The expression fragment to serialize. See $srcParts
716	 *   in serializeFromParts() for format.
717	 * @param ?array $tplData Templatedata, see
718	 *   https://github.com/wikimedia/mediawiki-extensions-TemplateData/blob/master/Specification.md
719	 * @param mixed $prevPart Previous part. See $srcParts in serializeFromParts(). PORT-FIXME type?
720	 * @param mixed $nextPart Next part. See $srcParts in serializeFromParts(). PORT-FIXME type?
721	 * @return string
722	 */
723	private function serializePart(
724		SerializerState $state, string $buf, DOMElement $node, string $type, stdClass $part,
725		?array $tplData, $prevPart, $nextPart
726	): string {
727		// Parse custom format specification, if present.
728		$defaultBlockSpc = "{{_\n| _ = _\n}}"; // "block"
729		$defaultInlineSpc = '{{_|_=_}}'; // "inline"
730
731		$format = isset( $tplData['format'] ) ? strtolower( $tplData['format'] ) : null;
732		if ( $format === 'block' ) {
733			$format = $defaultBlockSpc;
734		} elseif ( $format === 'inline' ) {
735			$format = $defaultInlineSpc;
736		}
737		// Check format string for validity.
738		preg_match( self::FORMATSTRING_REGEXP, $format, $parsedFormat );
739		if ( !$parsedFormat ) {
740			preg_match( self::FORMATSTRING_REGEXP, $defaultInlineSpc, $parsedFormat );
741			$format = null; // Indicates that no valid custom format was present.
742		}
743		$formatSOL = $parsedFormat[1] ?? '';
744		$formatStart = $parsedFormat[2] ?? '';
745		$formatParamName = $parsedFormat[3] ?? '';
746		$formatParamValue = $parsedFormat[4] ?? '';
747		$formatEnd = $parsedFormat[5] ?? '';
748		$formatEOL = $parsedFormat[6] ?? '';
749		$forceTrim = ( $format !== null ) || WTUtils::isNewElt( $node );
750
751		// Shoehorn formatting of top-level templatearg wikitext into this code.
752		if ( $type === 'templatearg' ) {
753			$formatStart = preg_replace( '/{{/', '{{{', $formatStart, 1 );
754			$formatEnd = preg_replace( '/}}/', '}}}', $formatEnd, 1 );
755		}
756
757		// handle SOL newline requirement
758		if ( $formatSOL && !preg_match( '/\n$/D', ( $prevPart !== null ) ? $buf : $state->sep->src ) ) {
759			$buf .= "\n";
760		}
761
762		// open the transclusion
763		$tgt = $part->target;
764		'@phan-var stdClass $tgt';
765		$buf .= $this->formatStringSubst( $formatStart, $tgt->wt, $forceTrim );
766
767		// Trim whitespace from data-mw keys to deal with non-compliant
768		// clients. Make sure param info is accessible for the stripped key
769		// since later code will be using the stripped key always.
770		$tplKeysFromDataMw = array_map( function ( $key ) use ( $part ) {
771			// PORT-FIXME do we care about different whitespace semantics for trim?
772			$strippedKey = trim( $key );
773			if ( $key !== $strippedKey ) {
774				$part->params->{$strippedKey} = $part->params->{$key};
775			}
776			return $strippedKey;
777		}, array_keys( get_object_vars( $part->params ) ) );
778		if ( !$tplKeysFromDataMw ) {
779			return $buf . $formatEnd;
780		}
781
782		$env = $this->env;
783
784		// Per-parameter info from data-parsoid for pre-existing parameters
785		$dp = DOMDataUtils::getDataParsoid( $node );
786		$dpArgInfo = isset( $part->i ) ? ( $dp->pi[$part->i] ?? [] ) : [];
787
788		// Build a key -> arg info map
789		$dpArgInfoMap = array_column( $dpArgInfo, null, 'k' );
790
791		// 1. Process all parameters and build a map of
792		//    arg-name -> [serializeAsNamed, name, value]
793		//
794		// 2. Serialize tpl args in required order
795		//
796		// 3. Format them according to formatParamName/formatParamValue
797
798		$kvMap = [];
799		foreach ( $tplKeysFromDataMw as $key ) {
800			$param = $part->params->{$key};
801			$argInfo = $dpArgInfoMap[$key] ?? [];
802
803			// TODO: Other formats?
804			// Only consider the html parameter if the wikitext one
805			// isn't present at all. If it's present but empty,
806			// that's still considered a valid parameter.
807			if ( property_exists( $param, 'wt' ) ) {
808				$value = $param->wt;
809			} else {
810				$value = $this->htmlToWikitext( [ 'env' => $env ], $param->html );
811			}
812
813			Assert::invariant( is_string( $value ), "For param: $key, wt property should be a string '
814				. 'but got: $value" );
815
816			$serializeAsNamed = !empty( $argInfo->named );
817
818			// The name is usually equal to the parameter key, but
819			// if there's a key.wt attribute, use that.
820			$name = null;
821			if ( isset( $param->key->wt ) ) {
822				$name = $param->key->wt;
823				// And make it appear even if there wasn't
824				// data-parsoid information.
825				$serializeAsNamed = true;
826			} else {
827				$name = $key;
828			}
829
830			// Use 'k' as the key, not 'name'.
831			//
832			// The normalized form of 'k' is used as the key in both
833			// data-parsoid and data-mw. The full non-normalized form
834			// is present in '$param->key->wt'
835			$kvMap[$key] = [ 'serializeAsNamed' => $serializeAsNamed, 'name' => $name, 'value' => $value ];
836		}
837
838		$argOrder = array_keys( $kvMap );
839		usort( $argOrder, $this->createParamComparator( $dpArgInfo, $tplData, $argOrder ) );
840
841		$argIndex = 1;
842		$numericIndex = 1;
843
844		$numPositionalArgs = array_reduce( $dpArgInfo, function ( $n, $pi ) use ( $part ) {
845			return ( isset( $part->params->{$pi->k} ) && empty( $pi->named ) ) ? $n + 1 : $n;
846		}, 0 );
847
848		$argBuf = [];
849		foreach ( $argOrder as $param ) {
850			$kv = $kvMap[$param];
851			// Add nowiki escapes for the arg value, as required
852			$escapedValue = $this->wteHandlers->escapeTplArgWT( $kv['value'], [
853				'serializeAsNamed' => $kv['serializeAsNamed'] || $param !== $numericIndex,
854				'type' => $type,
855				'argPositionalIndex' => $numericIndex,
856				'numPositionalArgs' => $numPositionalArgs,
857				'argIndex' => $argIndex++,
858				'numArgs' => count( $tplKeysFromDataMw ),
859			] );
860			if ( $escapedValue['serializeAsNamed'] ) {
861				// WS trimming for values of named args
862				// PORT-FIXME check different whitespace trimming semantics
863				$argBuf[] = [ 'dpKey' => $param, 'name' => $kv['name'], 'value' => trim( $escapedValue['v'] ) ];
864			} else {
865				$numericIndex++;
866				// No WS trimming for positional args
867				$argBuf[] = [ 'dpKey' => $param, 'name' => null, 'value' => $escapedValue['v'] ];
868			}
869		}
870
871		// If no explicit format is provided, default format is:
872		// - 'inline' for new args
873		// - whatever format is available from data-parsoid for old args
874		// (aka, overriding formatParamName/formatParamValue)
875		//
876		// If an unedited node OR if paramFormat is unspecified,
877		// this strategy prevents unnecessary normalization
878		// of edited transclusions which don't have valid
879		// templatedata formatting information.
880
881		// "magic case": If the format string ends with a newline, an extra newline is added
882		// between the template name and the first parameter.
883
884		foreach ( $argBuf as $arg ) {
885			$name = $arg['name'];
886			$val = $arg['value'];
887			if ( $name === null ) {
888				// We are serializing a positional parameter.
889				// Whitespace is significant for these and
890				// formatting would change semantics.
891				$name = '';
892				$modFormatParamName = '|_';
893				$modFormatParamValue = '_';
894			} elseif ( $name === '' ) {
895				// No spacing for blank parameters ({{foo|=bar}})
896				// This should be an edge case and probably only for
897				// inline-formatted templates, but we are consciously
898				// forcing this default here. Can revisit if this is
899				// ever a problem.
900				$modFormatParamName = '|_=';
901				$modFormatParamValue = '_';
902			} else {
903				// Preserve existing spacing, esp if there was a comment
904				// embedded in it. Otherwise, follow TemplateData's lead.
905				// NOTE: In either case, we are forcibly normalizing
906				// non-block-formatted transclusions into block formats
907				// by adding missing newlines.
908				$spc = $dpArgInfoMap[$arg['dpKey']]->spc ?? null;
909				if ( $spc && ( !$format || preg_match( Utils::COMMENT_REGEXP, $spc[3] ?? '' ) ) ) {
910					$nl = ( substr( $formatParamName, 0, 1 ) === "\n" ) ? "\n" : '';
911					$modFormatParamName = $nl . '|' . $spc[0] . '_' . $spc[1] . '=' . $spc[2];
912					$modFormatParamValue = '_' . $spc[3];
913				} else {
914					$modFormatParamName = $formatParamName;
915					$modFormatParamValue = $formatParamValue;
916				}
917			}
918
919			// Don't create duplicate newlines.
920			$trailing = preg_match( self::TRAILING_COMMENT_OR_WS_AFTER_NL_REGEXP, $buf );
921			if ( $trailing && substr( $formatParamName, 0, 1 ) === "\n" ) {
922				$modFormatParamName = substr( $formatParamName, 1 );
923			}
924
925			$buf .= $this->formatStringSubst( $modFormatParamName, $name, $forceTrim );
926			$buf .= $this->formatStringSubst( $modFormatParamValue, $val, $forceTrim );
927		}
928
929		// Don't create duplicate newlines.
930		if ( preg_match( self::TRAILING_COMMENT_OR_WS_AFTER_NL_REGEXP, $buf )
931			 && substr( $formatEnd, 0, 1 ) === "\n"
932		) {
933			$buf .= substr( $formatEnd, 1 );
934		} else {
935			$buf .= $formatEnd;
936		}
937
938		if ( $formatEOL ) {
939			if ( $nextPart === null ) {
940				// This is the last part of the block. Add the \n only
941				// if the next non-comment node is not a text node
				// or if the text node doesn't have a leading \n.
943				$next = DOMUtils::nextNonDeletedSibling( $node );
944				while ( $next && DOMUtils::isComment( $next ) ) {
945					$next = DOMUtils::nextNonDeletedSibling( $next );
946				}
947				if ( !DOMUtils::isText( $next ) || substr( $next->nodeValue, 0, 1 ) !== "\n" ) {
948					$buf .= "\n";
949				}
950			} elseif ( !is_string( $nextPart ) || substr( $nextPart, 0, 1 ) !== "\n" ) {
951				// If nextPart is another template, and it wants a leading nl,
952				// this \n we add here will count towards that because of the
953				// formatSOL check at the top.
954				$buf .= "\n";
955			}
956		}
957
958		return $buf;
959	}
960
961	/**
962	 * Serialize a template from its parts.
963	 * @param SerializerState $state
964	 * @param DOMElement $node
965	 * @param stdClass[] $srcParts PORT-FIXME document
966	 * @return string
967	 */
968	public function serializeFromParts(
969		SerializerState $state, DOMElement $node, array $srcParts
970	): string {
971		$env = $this->env;
972		$useTplData = WTUtils::isNewElt( $node ) || DiffUtils::hasDiffMarkers( $node, $env );
973		$buf = '';
974		foreach ( $srcParts as $i => $part ) {
975			$prevPart = $srcParts[$i - 1] ?? null;
976			$nextPart = $srcParts[$i + 1] ?? null;
977			$tplArg = $part->templatearg ?? null;
978			if ( $tplArg ) {
979				$buf = $this->serializePart( $state, $buf, $node, 'templatearg',
980					$tplArg, null, $prevPart, $nextPart );
981				continue;
982			}
983
984			$tpl = $part->template ?? null;
985			if ( !$tpl ) {
986				$buf .= $part;
987				continue;
988			}
989
990			// transclusion: tpl or parser function
991			$tplHref = $tpl->target->href ?? null;
992			$isTpl = is_string( $tplHref );
993			$type = $isTpl ? 'template' : 'parserfunction';
994
995			// While the API supports fetching multiple template data objects in one call,
996			// we will fetch one at a time to benefit from cached responses.
997			//
998			// Fetch template data for the template
999			$tplData = null;
1000			$apiResp = null;
1001			if ( $isTpl && $useTplData && !$this->env->noDataAccess() ) {
1002				$title = preg_replace( '#^\./#', '', $tplHref, 1 );
1003				try {
1004					$tplData = $this->env->getDataAccess()->fetchTemplateData( $env->getPageConfig(), $title );
1005				} catch ( Exception $err ) {
1006					// Log the error, and use default serialization mode.
1007					// Better to misformat a transclusion than to lose an edit.
1008					$env->log( 'error/html2wt/tpldata', $err );
1009				}
1010			}
1011			// If the template doesn't exist, or does but has no TemplateData, ignore it
1012			if ( !empty( $tplData['missing'] ) || !empty( $tplData['notemplatedata'] ) ) {
1013				$tplData = null;
1014			}
1015			$buf = $this->serializePart( $state, $buf, $node, $type, $tpl, $tplData, $prevPart, $nextPart );
1016		}
1017		return $buf;
1018	}
1019
1020	/**
1021	 * @param DOMElement $node
1022	 * @param SerializerState $state
1023	 * @return string
1024	 */
1025	public function serializeExtensionStartTag( DOMElement $node, SerializerState $state ): string {
1026		$dataMw = DOMDataUtils::getDataMw( $node );
1027		$extName = $dataMw->name;
1028
1029		// Serialize extension attributes in normalized form as:
1030		// key='value'
1031		// FIXME: with no dataAttribs, shadow info will mark it as new
1032		$attrs = (array)( $dataMw->attrs ?? [] );
1033		$extTok = new TagTk( $extName, array_map( function ( $key ) use ( $attrs ) {
1034			return new KV( $key, $attrs[$key] );
1035		}, array_keys( $attrs ) ) );
1036
1037		if ( $node->hasAttribute( 'about' ) ) {
1038			$extTok->addAttribute( 'about', $node->getAttribute( 'about' ) );
1039		}
1040		if ( $node->hasAttribute( 'typeof' ) ) {
1041			$extTok->addAttribute( 'typeof', $node->getAttribute( 'typeof' ) );
1042		}
1043
1044		$attrStr = $this->serializeAttributes( $node, $extTok );
1045		$src = '<' . $extName;
1046		if ( $attrStr ) {
1047			$src .= ' ' . $attrStr;
1048		}
1049		return $src . ( !empty( $dataMw->body ) ? '>' : ' />' );
1050	}
1051
1052	/**
1053	 * @param DOMElement $node
1054	 * @param SerializerState $state
1055	 * @return string
1056	 */
1057	public function defaultExtensionHandler( DOMElement $node, SerializerState $state ): string {
1058		$dataMw = DOMDataUtils::getDataMw( $node );
1059		$src = $this->serializeExtensionStartTag( $node, $state );
1060		if ( !isset( $dataMw->body ) ) {
1061			return $src; // We self-closed this already.
1062		} elseif ( is_string( $dataMw->body->extsrc ?? null ) ) {
1063			$src .= $dataMw->body->extsrc;
1064		} else {
1065			$state->getEnv()->log( 'error/html2wt/ext', 'Extension src unavailable for: '
1066				. DOMCompat::getOuterHTML( $node ) );
1067		}
1068		return $src . '</' . $dataMw->name . '>';
1069	}
1070
1071	/**
1072	 * Consolidate separator handling when emitting text.
1073	 * @param string $res
1074	 * @param DOMNode $node
1075	 * @param bool $omitEscaping
1076	 */
1077	private function serializeText( string $res, DOMNode $node, bool $omitEscaping ): void {
1078		$state = $this->state;
1079
1080		// Deal with trailing separator-like text (at least 1 newline and other whitespace)
1081		preg_match( self::$separatorREs['sepSuffixWithNlsRE'], $res, $newSepMatch );
1082		$res = preg_replace( self::$separatorREs['sepSuffixWithNlsRE'], '', $res, 1 );
1083
1084		if ( !$state->inIndentPre ) {
1085			// Strip leading newlines and other whitespace
1086			if ( preg_match( self::$separatorREs['sepPrefixWithNlsRE'], $res, $match ) ) {
1087				$state->appendSep( $match[0] );
1088				$res = substr( $res, strlen( $match[0] ) );
1089			}
1090		}
1091
1092		if ( $omitEscaping ) {
1093			$state->emitChunk( $res, $node );
1094		} else {
1095			// Always escape entities
1096			$res = Utils::escapeWtEntities( $res );
1097
1098			// If not in pre context, escape wikitext
1099			// XXX refactor: Handle this with escape handlers instead!
1100			$state->escapeText = ( $state->onSOL || !$state->currNodeUnmodified ) && !$state->inHTMLPre;
1101			$state->emitChunk( $res, $node );
1102			$state->escapeText = false;
1103		}
1104
1105		// Move trailing newlines into the next separator
1106		if ( $newSepMatch ) {
1107			if ( !$state->sep->src ) {
1108				$state->appendSep( $newSepMatch[0] );
1109			} else {
1110				/* SSS FIXME: what are we doing with the stripped NLs?? */
1111			}
1112		}
1113	}
1114
1115	/**
1116	 * Serialize the content of a text node
1117	 * @param DOMNode $node
1118	 * @return DOMNode|null
1119	 */
1120	private function serializeTextNode( DOMNode $node ): ?DOMNode {
1121		$this->serializeText( $node->nodeValue, $node, false );
1122		return $node->nextSibling;
1123	}
1124
1125	/**
1126	 * Emit non-separator wikitext that does not need to be escaped.
1127	 * @param string $res
1128	 * @param DOMNode $node
1129	 */
1130	public function emitWikitext( string $res, DOMNode $node ): void {
1131		$this->serializeText( $res, $node, true );
1132	}
1133
1134	/**
1135	 * DOM-based serialization
1136	 * @param DOMElement $node
1137	 * @param DOMHandler $domHandler
1138	 * @return DOMNode|null
1139	 */
1140	private function serializeDOMNode( DOMElement $node, DOMHandler $domHandler ) {
1141		// To serialize a node from source, the node should satisfy these
1142		// conditions:
1143		//
1144		// 1. It should not have a diff marker or be in a modified subtree
1145		//    WTS should not be in a subtree with a modification flag that
1146		//    applies to every node of a subtree (rather than an indication
1147		//    that some node in the subtree is modified).
1148		//
1149		// 2. It should continue to be valid in any surrounding edited context
1150		//    For some nodes, modification of surrounding context
1151		//    can change serialized output of this node
1152		//    (ex: <td>s and whether you emit | or || for them)
1153		//
1154		// 3. It should have valid, usable DSR
1155		//
1156		// 4. Either it has non-zero positive DSR width, or meets one of the
1157		//    following:
1158		//
1159		//    4a. It is content like <p><br/><p> or an automatically-inserted
1160		//        wikitext <references/> (HTML <ol>) (will have dsr-width 0)
1161		//    4b. it is fostered content (will have dsr-width 0)
1162		//    4c. it is misnested content (will have dsr-width 0)
1163		//
1164		// SSS FIXME: Additionally, we can guard against buggy DSR with
1165		// some sanity checks. We can test that non-sep src content
1166		// leading wikitext markup corresponds to the node type.
1167		//
1168		// Ex: If node.nodeName is 'UL', then src[0] should be '*'
1169		//
1170		// TO BE DONE
1171
1172		$state = $this->state;
1173		$wrapperUnmodified = false;
1174		$dp = DOMDataUtils::getDataParsoid( $node );
1175
1176		if ( $state->selserMode
1177			&& !$state->inModifiedContent
1178			&& WTSUtils::origSrcValidInEditedContext( $state->getEnv(), $node )
1179			&& Utils::isValidDSR( $dp->dsr ?? null )
1180			&& ( $dp->dsr->end > $dp->dsr->start
1181				// FIXME: <p><br/></p>
1182				// nodes that have dsr width 0 because currently,
1183				// we emit newlines outside the p-nodes. So, this check
1184				// tries to handle that scenario.
1185				|| ( $dp->dsr->end === $dp->dsr->start &&
1186					( preg_match( '/^(p|br)$/D', $node->nodeName )
1187					|| !empty( DOMDataUtils::getDataMw( $node )->autoGenerated ) ) )
1188				|| !empty( $dp->fostered )
1189				|| !empty( $dp->misnested )
1190			)
1191		) {
1192			if ( !DiffUtils::hasDiffMarkers( $node, $this->env ) ) {
1193				// If this HTML node will disappear in wikitext because of
1194				// zero width, then the separator constraints will carry over
1195				// to the node's children.
1196				//
1197				// Since we dont recurse into 'node' in selser mode, we update the
1198				// separator constraintInfo to apply to 'node' and its first child.
1199				//
1200				// We could clear constraintInfo altogether which would be
1201				// correct (but could normalize separators and introduce dirty
1202				// diffs unnecessarily).
1203
1204				$state->currNodeUnmodified = true;
1205
1206				if ( WTUtils::isZeroWidthWikitextElt( $node )
1207					&& $node->hasChildNodes()
1208					&& ( $state->sep->constraints['constraintInfo']['sepType'] ?? null ) === 'sibling'
1209				) {
1210					$state->sep->constraints['constraintInfo']['onSOL'] = $state->onSOL;
1211					$state->sep->constraints['constraintInfo']['sepType'] = 'parent-child';
1212					$state->sep->constraints['constraintInfo']['nodeA'] = $node;
1213					$state->sep->constraints['constraintInfo']['nodeB'] = $node->firstChild;
1214				}
1215
1216				$out = $state->getOrigSrc( $dp->dsr->start, $dp->dsr->end ) ?? '';
1217
1218				$this->trace( 'ORIG-src with DSR', function () use ( $dp, $out ) {
1219					return '[' . $dp->dsr->start . ',' . $dp->dsr->end . '] = '
1220						. PHPUtils::jsonEncode( $out );
1221				} );
1222
1223				// When reusing source, we should only suppress serializing
1224				// to a single line for the cases we've allowed in
1225				// normal serialization.
1226				$suppressSLC = WTUtils::isFirstEncapsulationWrapperNode( $node )
1227					|| in_array( $node->nodeName, [ 'dl', 'ul', 'ol' ], true )
1228					|| ( $node->nodeName === 'table'
1229						&& $node->parentNode->nodeName === 'dd'
1230						&& DOMUtils::previousNonSepSibling( $node ) === null );
1231
1232				// Use selser to serialize this text!  The original
1233				// wikitext is `out`.  But first allow
1234				// `ConstrainedText.fromSelSer` to figure out the right
1235				// type of ConstrainedText chunk(s) to use to represent
1236				// `out`, based on the node type.  Since we might actually
1237				// have to break this wikitext into multiple chunks,
1238				// `fromSelSer` returns an array.
1239				if ( $suppressSLC ) {
1240					$state->singleLineContext->disable();
1241				}
1242				foreach ( ConstrainedText::fromSelSer( $out, $node, $dp, $state->getEnv() ) as $ct ) {
1243					$state->emitChunk( $ct, $ct->node );
1244				}
1245				if ( $suppressSLC ) {
1246					$state->singleLineContext->pop();
1247				}
1248
1249				// Skip over encapsulated content since it has already been
1250				// serialized.
1251				if ( WTUtils::isFirstEncapsulationWrapperNode( $node ) ) {
1252					return WTUtils::skipOverEncapsulatedContent( $node );
1253				} else {
1254					return $node->nextSibling;
1255				}
1256			}
1257
1258			if ( DiffUtils::onlySubtreeChanged( $node, $this->env )
1259				&& WTSUtils::hasValidTagWidths( $dp->dsr ?? null )
1260				// In general, we want to avoid nodes with auto-inserted
1261				// start/end tags since dsr for them might not be entirely
1262				// trustworthy. But, since wikitext does not have closing tags
1263				// for tr/td/th in the first place, dsr for them can be trusted.
1264				//
1265				// SSS FIXME: I think this is only for b/i tags for which we do
1266				// dsr fixups. It may be okay to use this for other tags.
1267				&& ( ( empty( $dp->autoInsertedStart ) && empty( $dp->autoInsertedEnd ) )
1268					|| preg_match( '/^(td|th|tr)$/D', $node->nodeName ) )
1269			) {
1270				$wrapperUnmodified = true;
1271			}
1272		}
1273
1274		$state->currNodeUnmodified = false;
1275
1276		$currentModifiedState = $state->inModifiedContent;
1277
1278		$inModifiedContent = $state->selserMode && DiffUtils::hasInsertedDiffMark( $node, $this->env );
1279
1280		if ( $inModifiedContent ) {
1281			$state->inModifiedContent = true;
1282		}
1283
1284		$next = $domHandler->handle( $node, $state, $wrapperUnmodified );
1285
1286		if ( $inModifiedContent ) {
1287			$state->inModifiedContent = $currentModifiedState;
1288		}
1289
1290		return $next;
1291	}
1292
1293	/**
1294	 * Internal worker. Recursively serialize a DOM subtree.
1295	 * @private
1296	 * @param DOMNode $node
1297	 * @return DOMNode|null
1298	 */
1299	public function serializeNode( DOMNode $node ): ?DOMNode {
1300		$domHandler = $method = null;
1301		$domHandlerFactory = new DOMHandlerFactory();
1302		$state = $this->state;
1303
1304		if ( $state->selserMode ) {
1305			$this->trace(
1306				function () use ( $node ) {
1307					return WTSUtils::traceNodeName( $node );
1308				},
1309				'; prev-unmodified: ', $state->prevNodeUnmodified,
1310				'; SOL: ', $state->onSOL );
1311		} else {
1312			$this->trace(
1313				function () use ( $node ) {
1314					return WTSUtils::traceNodeName( $node );
1315				},
1316				'; SOL: ', $state->onSOL );
1317		}
1318
1319		switch ( $node->nodeType ) {
1320			case XML_ELEMENT_NODE:
1321				'@phan-var DOMElement $node';/** @var DOMElement $node */
1322				// Ignore DiffMarker metas, but clear unmodified node state
1323				if ( DOMUtils::isDiffMarker( $node ) ) {
1324					$state->updateModificationFlags( $node );
1325					// `state.sep.lastSourceNode` is cleared here so that removed
1326					// separators between otherwise unmodified nodes don't get
1327					// restored.
1328					// `state.sep.lastSourceNode` is cleared here so that removed
1329					// separators between otherwise unmodified nodes don't get
1330					// restored.
1331					$state->updateSep( $node );
1332					return $node->nextSibling;
1333				}
1334				$domHandler = $domHandlerFactory->getDOMHandler( $node );
1335				Assert::invariant( $domHandler !== null, 'No dom handler found for '
1336					. DOMCompat::getOuterHTML( $node ) );
1337				$method = [ $this, 'serializeDOMNode' ];
1338				break;
1339			case XML_TEXT_NODE:
1340				// This code assumes that the DOM is in normalized form with no
1341				// run of text nodes.
1342				// Accumulate whitespace from the text node into state.sep.src
1343				$text = $node->nodeValue;
1344				if ( !$state->inIndentPre
1345					// PORT-FIXME: original uses this->state->serializer->separatorREs
1346					// but that does not seem useful
1347					&& preg_match( self::$separatorREs['pureSepRE'], $text )
1348				) {
1349					$state->appendSep( $text );
1350					return $node->nextSibling;
1351				}
1352				if ( $state->selserMode ) {
1353					$prev = $node->previousSibling;
1354					if ( !$state->inModifiedContent && (
1355						( !$prev && DOMUtils::isBody( $node->parentNode ) ) ||
1356						( $prev && !DOMUtils::isDiffMarker( $prev ) )
1357					) ) {
1358						$state->currNodeUnmodified = true;
1359					} else {
1360						$state->currNodeUnmodified = false;
1361					}
1362				}
1363
1364				$domHandler = new DOMHandler( false );
1365				$method = [ $this, 'serializeTextNode' ];
1366				break;
1367			case XML_COMMENT_NODE:
1368				// Merge this into separators
1369				$state->appendSep( WTSUtils::commentWT( $node->nodeValue ) );
1370				return $node->nextSibling;
1371			default:
1372				// PORT-FIXME the JS code used node.outerHTML here; probably a bug?
1373				Assert::invariant( 'Unhandled node type: ', $node->nodeType );
1374		}
1375
1376		$prev = DOMUtils::previousNonSepSibling( $node ) ?: $node->parentNode;
1377		$this->updateSeparatorConstraints(
1378			$prev, $domHandlerFactory->getDOMHandler( $prev ),
1379			$node, $domHandler
1380		);
1381
1382		$nextNode = call_user_func( $method, $node, $domHandler );
1383
1384		$next = DOMUtils::nextNonSepSibling( $node ) ?: $node->parentNode;
1385		$this->updateSeparatorConstraints(
1386			$node, $domHandler,
1387			$next, $domHandlerFactory->getDOMHandler( $next )
1388		);
1389
1390		// Update modification flags
1391		$state->updateModificationFlags( $node );
1392
1393		return $nextNode;
1394	}
1395
1396	/**
1397	 * @param string $line
1398	 * @return string
1399	 */
1400	private function stripUnnecessaryHeadingNowikis( string $line ): string {
1401		$state = $this->state;
1402		if ( !$state->hasHeadingEscapes ) {
1403			return $line;
1404		}
1405
1406		$escaper = function ( string $wt ) use ( $state ) {
1407			$ret = $state->serializer->wteHandlers->escapedText( $state, false, $wt, false, true );
1408			return $ret;
1409		};
1410
1411		preg_match( self::HEADING_NOWIKI_REGEXP, $line, $match );
1412		if ( $match && !preg_match( self::COMMENT_OR_WS_REGEXP, $match[2] ) ) {
1413			// The nowikiing was spurious since the trailing = is not in EOL position
1414			return $escaper( $match[1] ) . $match[2];
1415		} else {
1416			// All is good.
1417			return $line;
1418		}
1419	}
1420
1421	private function stripUnnecessaryIndentPreNowikis(): void {
1422		$env = $this->env;
1423		// FIXME: The solTransparentWikitextRegexp includes redirects, which really
1424		// only belong at the SOF and should be unique. See the "New redirect" test.
1425		// PORT-FIXME do the different whitespace semantics matter?
1426		$noWikiRegexp = '@^'
1427			. PHPUtils::reStrip( $env->getSiteConfig()->solTransparentWikitextNoWsRegexp(), '@' )
1428			. '((?i:<nowiki>\s+</nowiki>))([^\n]*(?:\n|$))' . '@Dm';
1429		$pieces = preg_split( $noWikiRegexp, $this->state->out, -1, PREG_SPLIT_DELIM_CAPTURE );
1430		$out = $pieces[0];
1431		for ( $i = 1;  $i < count( $pieces );  $i += 4 ) {
1432			$out .= $pieces[$i];
1433			$nowiki = $pieces[$i + 1];
1434			$rest = $pieces[$i + 2];
1435			// Ignore comments
1436			preg_match_all( '/<[^!][^<>]*>/', $rest, $htmlTags );
1437
1438			// Not required if just sol transparent wt.
1439			$reqd = !preg_match( $env->getSiteConfig()->solTransparentWikitextRegexp(), $rest );
1440
1441			if ( $reqd ) {
1442				foreach ( $htmlTags[0] as $j => $rawTagName ) {
1443					// Strip </, attributes, and > to get the tagname
1444					$tagName = preg_replace( '/<\/?|\s.*|>/', '', $rawTagName );
1445					if ( !isset( WikitextConstants::$HTML['HTML5Tags'][$tagName] ) ) {
1446						// If we encounter any tag that is not a html5 tag,
1447						// it could be an extension tag. We could do a more complex
1448						// regexp or tokenize the string to determine if any block tags
1449						// show up outside the extension tag. But, for now, we just
1450						// conservatively bail and leave the nowiki as is.
1451						$reqd = true;
1452						break;
1453					} elseif ( TokenUtils::isBlockTag( $tagName ) ) {
1454						// FIXME: Extension tags shadowing html5 tags might not
1455						// have block semantics.
1456						// Block tags on a line suppress nowikis
1457						$reqd = false;
1458					}
1459				}
1460			}
1461
1462			// PORT-FIXME do the different whitespace semantics matter?
1463			if ( !$reqd ) {
1464				$nowiki = preg_replace( '#^<nowiki>(\s+)</nowiki>#', '$1', $nowiki, 1 );
1465			} elseif ( $env->shouldScrubWikitext() ) {
1466				$solTransparentWikitextNoWsRegexpFragment = PHPUtils::reStrip(
1467					$env->getSiteConfig()->solTransparentWikitextNoWsRegexp(), '/' );
1468				$wsReplacementRE = '/^(' . $solTransparentWikitextNoWsRegexpFragment . ')?\s+/';
1469				// Replace all leading whitespace
1470				do {
1471					$oldRest = $rest;
1472					$rest = preg_replace( $wsReplacementRE, '$1', $rest );
1473				} while ( $rest !== $oldRest );
1474
1475				// Protect against sol-sensitive wikitext characters
1476				$solCharsTest = '/^' . $solTransparentWikitextNoWsRegexpFragment . '[=*#:;]/';
1477				$nowiki = preg_replace( '#^<nowiki>(\s+)</nowiki>#',
1478					preg_match( $solCharsTest, $rest ) ? '<nowiki/>' : '', $nowiki, 1 );
1479			}
1480			$out = $out . $nowiki . $rest . $pieces[$i + 3];
1481		}
1482		$this->state->out = $out;
1483	}
1484
1485	/**
1486	 * This implements a heuristic to strip two common sources of <nowiki/>s.
1487	 * When <i> and <b> tags are matched up properly,
1488	 * - any single ' char before <i> or <b> does not need <nowiki/> protection.
1489	 * - any single ' char before </i> or </b> does not need <nowiki/> protection.
1490	 * @param string $line
1491	 * @return string
1492	 */
1493	private function stripUnnecessaryQuoteNowikis( string $line ): string {
1494		if ( !$this->state->hasQuoteNowikis ) {
1495			return $line;
1496		}
1497
1498		// Optimization: We are interested in <nowiki/>s before quote chars.
1499		// So, skip this if we don't have both.
1500		if ( !( preg_match( '#<nowiki\s*/>#', $line ) && preg_match( "/'/", $line ) ) ) {
1501			return $line;
1502		}
1503
1504		// * Split out all the [[ ]] {{ }} '' ''' ''''' <..> </...>
1505		//   parens in the regexp mean that the split segments will
1506		//   be spliced into the result array as the odd elements.
1507		// * If we match up the tags properly and we see opening
1508		//   <i> / <b> / <i><b> tags preceded by a '<nowiki/>, we
1509		//   can remove all those nowikis.
1510		//   Ex: '<nowiki/>''foo'' bar '<nowiki/>'''baz'''
1511		// * If we match up the tags properly and we see closing
1512		//   <i> / <b> / <i><b> tags preceded by a '<nowiki/>, we
1513		//   can remove all those nowikis.
1514		//   Ex: ''foo'<nowiki/>'' bar '''baz'<nowiki/>'''
1515		// phpcs:ignore Generic.Files.LineLength.TooLong
1516		$p = preg_split( "#('''''|'''|''|\[\[|\]\]|\{\{|\}\}|<\w+(?:\s+[^>]*?|\s*?)/?>|</\w+\s*>)#", $line, -1, PREG_SPLIT_DELIM_CAPTURE );
1517
1518		// Which nowiki do we strip out?
1519		$nowikiIndex = -1;
1520
1521		// Verify that everything else is properly paired up.
1522		$stack = [];
1523		$quotesOnStack = 0;
1524		$n = count( $p );
1525		$nonHtmlTag = null;
1526		for ( $j = 1;  $j < $n;  $j += 2 ) {
1527			// For HTML tags, pull out just the tag name for clearer code below.
1528			preg_match( '#^<(/?\w+)#', $p[$j], $matches );
1529			$tag = mb_strtolower( $matches[1] ?? $p[$j] );
1530			$tagLen = strlen( $tag );
1531			$selfClose = false;
1532			if ( preg_match( '#/>$#D', $p[$j] ) ) {
1533				$tag .= '/';
1534				$selfClose = true;
1535			}
1536
1537			// Ignore non-html-tag (<nowiki> OR extension tag) blocks
1538			if ( !$nonHtmlTag ) {
1539				if ( isset( $this->env->getSiteConfig()->getExtensionTagNameMap()[$tag] ) ) {
1540					$nonHtmlTag = $tag;
1541					continue;
1542				}
1543			} else {
1544				if ( $tagLen > 0 && $tag[0] === '/' && substr( $tag, 1 ) === $nonHtmlTag ) {
1545					$nonHtmlTag = null;
1546				}
1547				continue;
1548			}
1549
1550			if ( $tag === ']]' ) {
1551				if ( array_pop( $stack ) !== '[[' ) {
1552					return $line;
1553				}
1554			} elseif ( $tag === '}}' ) {
1555				if ( array_pop( $stack ) !== '{{' ) {
1556					return $line;
1557				}
1558			} elseif ( $tagLen > 0 && $tag[0] === '/' ) { // closing html tag
1559				// match html/ext tags
1560				$openTag = array_pop( $stack );
1561				if ( $tag !== ( '/' . $openTag ) ) {
1562					return $line;
1563				}
1564			} elseif ( $tag === 'nowiki/' ) {
1565				// We only want to process:
1566				// - trailing single quotes (bar')
1567				// - or single quotes by themselves without a preceding '' sequence
1568				if ( substr( $p[$j - 1], -1 ) === "'"
1569					&& !( $p[$j - 1] === "'" && $j > 1 && substr( $p[$j - 2], -2 ) === "''" )
1570					// Consider <b>foo<i>bar'</i>baz</b> or <b>foo'<i>bar'</i>baz</b>.
1571					// The <nowiki/> before the <i> or </i> cannot be stripped
1572					// if the <i> is embedded inside another quote.
1573					&& ( $quotesOnStack === 0
1574						// The only strippable scenario with a single quote elt on stack
1575						// is: ''bar'<nowiki/>''
1576						//   -> ["", "''", "bar'", "<nowiki/>", "", "''"]
1577						|| ( $quotesOnStack === 1
1578							&& $j + 2 < $n
1579							&& $p[$j + 1] === ''
1580							&& $p[$j + 2][0] === "'"
1581							&& $p[$j + 2] === PHPUtils::lastItem( $stack ) ) )
1582				) {
1583					$nowikiIndex = $j;
1584				}
1585				continue;
1586			} elseif ( $selfClose || $tag === 'br' ) {
1587				// Skip over self-closing tags or what should have been self-closed.
1588				// ( While we could do this for all void tags defined in
1589				//   mediawiki.wikitext.constants.js, <br> is the most common
1590				//   culprit. )
1591				continue;
1592			} elseif ( $tagLen > 0 && $tag[0] === "'" && PHPUtils::lastItem( $stack ) === $tag ) {
1593				array_pop( $stack );
1594				$quotesOnStack--;
1595			} else {
1596				$stack[] = $tag;
1597				if ( $tagLen > 0 && $tag[0] === "'" ) {
1598					$quotesOnStack++;
1599				}
1600			}
1601		}
1602
1603		if ( count( $stack ) ) {
1604			return $line;
1605		}
1606
1607		if ( $nowikiIndex !== -1 ) {
1608			// We can only remove the final trailing nowiki.
1609			//
1610			// HTML  : <i>'foo'</i>
1611			// line  : ''<nowiki/>'foo'<nowiki/>''
1612			$p[$nowikiIndex] = '';
1613			return implode( '', $p );
1614		} else {
1615			return $line;
1616		}
1617	}
1618
1619	/**
1620	 * Serialize an HTML DOM document.
1621	 * WARNING: You probably want to use {@link FromHTML::serializeDOM} instead.
1622	 * @param DOMElement $body
1623	 * @param bool|null $selserMode
1624	 * @return ConstrainedText|string
1625	 */
1626	public function serializeDOM( DOMElement $body, bool $selserMode = false ) {
1627		Assert::invariant( DOMUtils::isBody( $body ), 'Expected a body node.' );
1628		// `editedDoc` is simply body's ownerDocument.  However, since we make
1629		// recursive calls to WikitextSerializer.prototype.serializeDOM with elements from dom fragments
1630		// from data-mw, we need this to be set prior to the initial call.
1631		// It's mainly required for correct serialization of citations in some
1632		// scenarios (Ex: <ref> nested in <references>).
1633		Assert::invariant( $this->env->getPageConfig()->editedDoc !== null, 'Should be set.' );
1634
1635		if ( !$selserMode ) {
1636			// Strip <section> tags
1637			// Selser mode will have done that already before running dom-diff
1638			ContentUtils::stripSectionTagsAndFallbackIds( $body );
1639		}
1640
1641		$this->logType = $selserMode ? 'trace/selser' : 'trace/wts';
1642
1643		$state = $this->state;
1644		$state->initMode( $selserMode );
1645
1646		$domNormalizer = new DOMNormalizer( $state );
1647		$domNormalizer->normalize( $body );
1648
1649		if ( $this->env->hasDumpFlag( 'dom:post-normal' ) ) {
1650			$options = [ 'storeDiffMark' => true, 'env' => $this->env ];
1651			ContentUtils::dumpDOM( $body, 'DOM: post-normal', $options );
1652		}
1653
1654		$state->kickOffSerialize( $body );
1655
1656		if ( $state->hasIndentPreNowikis ) {
1657			// FIXME: Perhaps this can be done on a per-line basis
1658			// rather than do one post-pass on the entire document.
1659			$this->stripUnnecessaryIndentPreNowikis();
1660		}
1661
1662		$splitLines = $state->selserMode
1663			|| $state->hasQuoteNowikis
1664			|| $state->hasSelfClosingNowikis
1665			|| $state->hasHeadingEscapes;
1666
1667		if ( $splitLines ) {
1668			$state->out = implode( "\n", array_map( function ( $line ) {
1669				// FIXME: Perhaps this can be done on a per-line basis
1670				// rather than do one post-pass on the entire document.
1671				$line = $this->stripUnnecessaryQuoteNowikis( $line );
1672
1673				// Strip (useless) trailing <nowiki/>s
1674				// Interim fix till we stop introducing them in the first place.
1675				//
1676				// Don't strip |param = <nowiki/> since that pattern is used
1677				// in transclusions and where the trailing <nowiki /> is a valid
1678				// template arg. So, use a conservative regexp to detect that usage.
1679				$line = preg_replace( '#^([^=]*?)(?:<nowiki\s*/>\s*)+$#D', '$1', $line, 1 );
1680
1681				$line = $this->stripUnnecessaryHeadingNowikis( $line );
1682				return $line;
1683			}, explode( "\n", $state->out ) ) );
1684		}
1685
1686		if ( $state->redirectText && $state->redirectText !== 'unbuffered' ) {
1687			$firstLine = explode( "\n", $state->out, 1 )[0];
1688			$nl = preg_match( '/^(\s|$)/D', $firstLine ) ? '' : "\n";
1689			$state->out = $state->redirectText . $nl . $state->out;
1690		}
1691
1692		return $state->out;
1693	}
1694
1695	/**
1696	 * @note Porting note: this replaces the pattern $serializer->env->log( $serializer->logType, ... )
1697	 * @param mixed ...$args
1698	 * @deprecated Use PSR-3 logging instead
1699	 */
1700	public function trace( ...$args ) {
1701		$this->env->log( $this->logType, ...$args );
1702	}
1703
1704}
1705