1<?php
2declare( strict_types = 1 );
3
4namespace Wikimedia\Parsoid\Utils;
5
6use DOMDocument;
7use DOMElement;
8use DOMNode;
9use Wikimedia\Assert\Assert;
10use Wikimedia\Parsoid\Config\Env;
11use Wikimedia\Parsoid\Core\DomSourceRange;
12use Wikimedia\Parsoid\Wt2Html\XMLSerializer;
13
14/**
15 * These utilities are for processing content that's generated
16 * by parsing source input (ex: wikitext)
17 */
18class ContentUtils {
19	/**
20	 * XML Serializer.
21	 *
22	 * @param DOMNode $node
23	 * @param array $options XMLSerializer options.
24	 * @return string
25	 */
26	public static function toXML( DOMNode $node, array $options = [] ): string {
27		return XMLSerializer::serialize( $node, $options )['html'];
28	}
29
30	/**
31	 * dataobject aware XML serializer, to be used in the DOM post-processing phase.
32	 *
33	 * @param DOMNode $node
34	 * @param array $options
35	 * @return string
36	 */
37	public static function ppToXML( DOMNode $node, array $options = [] ): string {
38		// We really only want to pass along `$options['keepTmp']`
39		DOMDataUtils::visitAndStoreDataAttribs( $node, $options );
40		return self::toXML( $node, $options );
41	}
42
43	/**
44	 * .dataobject aware HTML parser, to be used in the DOM
45	 * post-processing phase.
46	 *
47	 * @param Env $env
48	 * @param string $html
49	 * @param array|null $options
50	 * @return DOMElement
51	 */
52	public static function ppToDOM( Env $env, string $html, array $options = [] ): DOMElement {
53		$options += [
54			'node' => null,
55			'reinsertFosterableContent' => null,
56		];
57		$node = $options['node'];
58		if ( $node === null ) {
59			$node = DOMCompat::getBody( $env->createDocument( $html ) );
60		} else {
61			DOMUtils::assertElt( $node );
62			DOMCompat::setInnerHTML( $node, $html );
63		}
64
65		if ( $options['reinsertFosterableContent'] ) {
66			DOMUtils::visitDOM( $node, function ( $n, ...$args ) use ( $env ) {
67				// untunnel fostered content
68				$meta = WTUtils::reinsertFosterableContent( $env, $n, true );
69				$n = $meta ?? $n;
70
71				// load data attribs
72				DOMDataUtils::loadDataAttribs( $n, ...$args );
73			}, $options );
74		} else {
75			DOMDataUtils::visitAndLoadDataAttribs( $node, $options );
76		}
77		return $node;
78	}
79
80	/**
81	 * Pull the data-parsoid script element out of the doc before serializing.
82	 *
83	 * @param DOMNode $node
84	 * @param array $options XMLSerializer options.
85	 * @return array
86	 */
87	public static function extractDpAndSerialize( DOMNode $node, array $options = [] ): array {
88		$doc = DOMUtils::isBody( $node ) ? $node->ownerDocument : $node;
89		$pb = DOMDataUtils::extractPageBundle( $doc );
90		$out = XMLSerializer::serialize( $node, $options );
91		$out['pb'] = $pb;
92		return $out;
93	}
94
95	/**
96	 * Strip Parsoid-inserted section wrappers and fallback id spans with
97	 * HTML4 ids for headings from the DOM.
98	 *
99	 * @param DOMElement $node
100	 */
101	public static function stripSectionTagsAndFallbackIds( DOMElement $node ): void {
102		$n = $node->firstChild;
103		while ( $n ) {
104			$next = $n->nextSibling;
105			if ( $n instanceof DOMElement ) {
106				// Recurse into subtree before stripping this
107				self::stripSectionTagsAndFallbackIds( $n );
108
109				// Strip <section> tags
110				if ( WTUtils::isParsoidSectionTag( $n ) ) {
111					DOMUtils::migrateChildren( $n, $n->parentNode, $n );
112					$n->parentNode->removeChild( $n );
113				}
114
115				// Strip <span typeof='mw:FallbackId' ...></span>
116				if ( WTUtils::isFallbackIdSpan( $n ) ) {
117					$n->parentNode->removeChild( $n );
118				}
119			}
120			$n = $next;
121		}
122	}
123
124	/**
125	 * @param DOMNode $node
126	 * @param DOMNode $clone
127	 * @param array $options
128	 */
129	private static function cloneData(
130		DOMNode $node, DOMNode $clone, array $options
131	): void {
132		if ( !( $node instanceof DOMElement ) ) {
133			return;
134		}
135		DOMUtils::assertElt( $clone );
136
137		$d = DOMDataUtils::getNodeData( $node );
138		DOMDataUtils::setNodeData( $clone,  Utils::clone( $d ) );
139		$node = $node->firstChild;
140		$clone = $clone->firstChild;
141		while ( $node ) {
142			self::cloneData( $node, $clone, $options );
143			$node = $node->nextSibling;
144			$clone = $clone->nextSibling;
145		}
146	}
147
148	/**
149	 * @param array $buf
150	 * @param array &$opts
151	 */
152	private static function emit( array $buf, array &$opts ): void {
153		$str = implode( "\n", $buf ) . "\n";
154		if ( isset( $opts['outBuffer'] ) ) {
155			$opts['outBuffer'] .= $str;
156		} elseif ( isset( $opts['outStream'] ) ) {
157			fwrite( $opts['outStream'], $str . "\n" );
158		} else {
159			error_log( $str );
160		}
161	}
162
163	/**
164	 * Shift the DSR of a DOM fragment.
165	 * @param Env $env
166	 * @param DOMNode $rootNode
167	 * @param callable $dsrFunc
168	 * @return DOMNode Returns the $rootNode passed in to allow chaining.
169	 */
170	public static function shiftDSR( Env $env, DOMNode $rootNode, callable $dsrFunc ): DOMNode {
171		$doc = $rootNode->ownerDocument;
172		$convertString = function ( $str ) {
173			// Stub $convertString out to allow definition of a pair of
174			// mutually-recursive functions.
175			return $str;
176		};
177		$convertNode = function ( DOMNode $node ) use (
178			$env, $dsrFunc, &$convertString, &$convertNode
179		) {
180			if ( !( $node instanceof DOMElement ) ) {
181				return;
182			}
183			$dp = DOMDataUtils::getDataParsoid( $node );
184			if ( ( $dp->dsr ?? null ) !== null ) {
185				$dp->dsr = $dsrFunc( clone $dp->dsr );
186				// We don't need to setDataParsoid because dp is not a copy
187			}
188			if ( ( $dp->tmp->origDSR ?? null ) !== null ) {
189				// Even though tmp shouldn't escape Parsoid, go ahead and
190				// convert to enable hybrid testing.
191				$dp->tmp->origDSR = $dsrFunc( clone $dp->tmp->origDSR );
192			}
193			if ( ( $dp->extTagOffsets ?? null ) !== null ) {
194				$dp->extTagOffsets = $dsrFunc( clone $dp->extTagOffsets );
195			}
196
197			// Handle embedded HTML in Language Variant markup
198			$dmwv = DOMDataUtils::getJSONAttribute( $node, 'data-mw-variant', null );
199			if ( $dmwv ) {
200				if ( isset( $dmwv->disabled ) ) {
201					$dmwv->disabled->t = $convertString( $dmwv->disabled->t );
202				}
203				if ( isset( $dmwv->twoway ) ) {
204					foreach ( $dmwv->twoway as $l ) {
205						$l->t = $convertString( $l->t );
206					}
207				}
208				if ( isset( $dmwv->oneway ) ) {
209					foreach ( $dmwv->oneway as $l ) {
210						$l->f = $convertString( $l->f );
211						$l->t = $convertString( $l->t );
212					}
213				}
214				if ( isset( $dmwv->filter ) ) {
215					$dmwv->filter->t = $convertString( $dmwv->filter->t );
216				}
217				DOMDataUtils::setJSONAttribute( $node, 'data-mw-variant', $dmwv );
218			}
219
220			if ( DOMUtils::matchTypeOf( $node, '#^mw:(ExpandedAttrs|Image|Extension)\b#D' ) ) {
221				$dmw = DOMDataUtils::getDataMw( $node );
222				// Handle embedded HTML in template-affected attributes
223				if ( $dmw->attribs ?? null ) {
224					foreach ( $dmw->attribs as &$a ) {
225						foreach ( $a as $kOrV ) {
226							if ( gettype( $kOrV ) !== 'string' && isset( $kOrV->html ) ) {
227								$kOrV->html = $convertString( $kOrV->html );
228							}
229						}
230					}
231				}
232				// Handle embedded HTML in figure-inline captions
233				if ( $dmw->caption ?? null ) {
234					$dmw->caption = $convertString( $dmw->caption );
235				}
236				// FIXME: Cite-specific handling here maybe?
237				if ( $dmw->body->html ?? null ) {
238					$dmw->body->html = $convertString( $dmw->body->html );
239				}
240				DOMDataUtils::setDataMw( $node, $dmw );
241			}
242
243			if ( DOMUtils::matchTypeOf( $node, '#^mw:DOMFragment(/|$)#D' ) ) {
244				$dp = DOMDataUtils::getDataParsoid( $node );
245				if ( $dp->html ?? null ) {
246					$nodes = $env->getDOMFragment( $dp->html );
247					foreach ( $nodes as $n ) {
248						DOMPostOrder::traverse( $n, $convertNode );
249					}
250				}
251			}
252		};
253		$convertString = function ( string $str ) use ( $doc, $env, $convertNode ): string {
254			$parentNode = $doc->createElement( 'body' );
255			$node = self::ppToDOM( $env, $str, [ 'node' => $parentNode ] );
256			DOMPostOrder::traverse( $node, $convertNode );
257			return self::ppToXML( $node, [ 'innerXML' => true ] );
258		};
259		DOMPostOrder::traverse( $rootNode, $convertNode );
260		return $rootNode; // chainable
261	}
262
263	/**
264	 * Convert DSR offsets in a Document between utf-8/ucs2/codepoint
265	 * indices.
266	 *
267	 * Offset types are:
268	 *  - 'byte': Bytes (UTF-8 encoding), e.g. PHP `substr()` or `strlen()`.
269	 *  - 'char': Unicode code points (encoding irrelevant), e.g. PHP `mb_substr()` or `mb_strlen()`.
270	 *  - 'ucs2': 16-bit code units (UTF-16 encoding), e.g. JavaScript `.substring()` or `.length`.
271	 *
272	 * @see TokenUtils::convertTokenOffsets for a related function on tokens.
273	 *
274	 * @param Env $env
275	 * @param DOMDocument $doc The document to convert
276	 * @param string $from Offset type to convert from.
277	 * @param string $to Offset type to convert to.
278	 */
279	public static function convertOffsets(
280		Env $env,
281		DOMDocument $doc,
282		string $from,
283		string $to
284	): void {
285		$env->setCurrentOffsetType( $to );
286		if ( $from === $to ) {
287			return; // Hey, that was easy!
288		}
289		$offsetMap = [];
290		$offsets = [];
291		$collect = function ( int $n ) use ( &$offsetMap, &$offsets ) {
292			if ( !array_key_exists( $n, $offsetMap ) ) {
293				$box = PHPUtils::arrayToObject( [ 'value' => $n ] );
294				$offsetMap[$n] = $box;
295				$offsets[] =& $box->value;
296			}
297		};
298		// Collect DSR offsets throughout the document
299		$collectDSR = function ( DomSourceRange $dsr ) use ( $collect ) {
300			if ( $dsr->start !== null ) {
301				$collect( $dsr->start );
302				$collect( $dsr->innerStart() );
303			}
304			if ( $dsr->end !== null ) {
305				$collect( $dsr->innerEnd() );
306				$collect( $dsr->end );
307			}
308			return $dsr;
309		};
310		$body = DOMCompat::getBody( $doc );
311		self::shiftDSR( $env, $body, $collectDSR );
312		if ( count( $offsets ) === 0 ) {
313			return; /* nothing to do (shouldn't really happen) */
314		}
315		// Now convert these offsets
316		TokenUtils::convertOffsets(
317			$env->topFrame->getSrcText(), $from, $to, $offsets
318		);
319		// Apply converted offsets
320		$applyDSR = function ( DomSourceRange $dsr ) use ( $offsetMap ) {
321			$start = $dsr->start;
322			$openWidth = $dsr->openWidth;
323			if ( $start !== null ) {
324				$start = $offsetMap[$start]->value;
325				$openWidth = $offsetMap[$dsr->innerStart()]->value - $start;
326			}
327			$end = $dsr->end;
328			$closeWidth = $dsr->closeWidth;
329			if ( $end !== null ) {
330				$end = $offsetMap[$end]->value;
331				$closeWidth = $end - $offsetMap[$dsr->innerEnd()]->value;
332			}
333			return new DomSourceRange(
334				$start, $end, $openWidth, $closeWidth
335			);
336		};
337		self::shiftDSR( $env, $body, $applyDSR );
338	}
339
340	/**
341	 * Dump the DOM with attributes.
342	 *
343	 * @param DOMNode $rootNode
344	 * @param string $title
345	 * @param array &$options
346	 */
347	public static function dumpDOM(
348		DOMNode $rootNode, string $title, array &$options = []
349	): void {
350		if ( !empty( $options['storeDiffMark'] ) || !empty( $options['dumpFragmentMap'] ) ) {
351			Assert::invariant( isset( $options['env'] ), "env should be set" );
352		}
353
354		if ( $rootNode instanceof DOMElement ) {
355			// cloneNode doesn't clone data => walk DOM to clone it
356			$clonedRoot = $rootNode->cloneNode( true );
357			self::cloneData( $rootNode, $clonedRoot, $options );
358		} else {
359			$clonedRoot = $rootNode;
360		}
361
362		$buf = [];
363		if ( empty( $options['quiet'] ) ) {
364			$buf[] = '----- ' . $title . ' -----';
365		}
366
367		$buf[] = self::ppToXML( $clonedRoot, $options );
368		self::emit( $buf, $options );
369
370		// Dump cached fragments
371		if ( !empty( $options['dumpFragmentMap'] ) ) {
372			foreach ( $options['env']->getDOMFragmentMap() as $k => $fragment ) {
373				$buf = [];
374				$buf[] = str_repeat( '=', 15 );
375				$buf[] = 'FRAGMENT ' . $k;
376				$buf[] = '';
377				self::emit( $buf, $options );
378
379				$newOpts = $options;
380				$newOpts['dumpFragmentMap'] = false;
381				$newOpts['quiet'] = true;
382				self::dumpDOM( is_array( $fragment ) ? $fragment[0] : $fragment, '', $newOpts );
383			}
384		}
385
386		if ( empty( $options['quiet'] ) ) {
387			self::emit( [ str_repeat( '-', mb_strlen( $title ) + 12 ) ], $options );
388		}
389	}
390
391}
392