1<?php
2declare( strict_types = 1 );
3
4namespace Wikimedia\Parsoid\Utils;
5
6use Composer\Semver\Semver;
7use stdClass;
8use Wikimedia\Assert\Assert;
9use Wikimedia\Parsoid\Config\Env;
10use Wikimedia\Parsoid\Core\DataParsoid;
11use Wikimedia\Parsoid\Core\DomSourceRange;
12use Wikimedia\Parsoid\DOM\Document;
13use Wikimedia\Parsoid\DOM\Element;
14use Wikimedia\Parsoid\DOM\Node;
15use Wikimedia\Parsoid\Tokens\SourceRange;
16
/**
 * These helpers pertain to HTML and data attributes of a node.
 *
 * Per-element data objects are stashed in a DataBag attached to the owning
 * Document (see prepareDoc()); an element refers to its data object through
 * the DATA_OBJECT_ATTR_NAME attribute, whose value is the id handed out by
 * the bag.
 */
class DOMDataUtils {
	/** Name of the attribute holding the id of an element's stashed data object. */
	public const DATA_OBJECT_ATTR_NAME = 'data-object-id';

	/**
	 * Return the dynamic "bag" property of a Document.
	 * @param Document $doc
	 * @return DataBag
	 */
	private static function getBag( Document $doc ): DataBag {
		// This is a dynamic property; it is not declared.
		// All references go through here so we can suppress phan's complaint.
		// @phan-suppress-next-line PhanUndeclaredProperty
		return $doc->bag;
	}

	/**
	 * Attach a fresh DataBag to the document. Any bag previously attached
	 * to $doc is replaced.
	 *
	 * @param Document $doc
	 */
	public static function prepareDoc( Document $doc ) {
		// `bag` is a deliberate dynamic property; see DOMDataUtils::getBag()
		// @phan-suppress-next-line PhanUndeclaredProperty dynamic property
		$doc->bag = new DataBag();

		// Cache the head and body.
		DOMCompat::getHead( $doc );
		DOMCompat::getBody( $doc );
	}

	/**
	 * Stash $obj in $doc and return an id for later retrieval
	 * @param Document $doc
	 * @param stdClass $obj
	 * @return int
	 */
	public static function stashObjectInDoc( Document $doc, stdClass $obj ): int {
		return self::getBag( $doc )->stashObject( $obj );
	}

	/**
	 * Is this node free of attributes?
	 *
	 * The internal DATA_OBJECT_ATTR_NAME bookkeeping attribute is not
	 * counted: a node carrying only that attribute is still reported as
	 * having no attributes.
	 *
	 * @param Element $node
	 * @return bool True if the node has no attributes other than (possibly)
	 *   DATA_OBJECT_ATTR_NAME.
	 */
	public static function noAttrs( Element $node ): bool {
		$numAttrs = count( DOMCompat::attributes( $node ) );
		return $numAttrs === 0 ||
			( $numAttrs === 1 && $node->hasAttribute( self::DATA_OBJECT_ATTR_NAME ) );
	}

	/**
	 * Get data object from a node.
	 *
	 * If the node has no DATA_OBJECT_ATTR_NAME attribute yet, an empty data
	 * object is created, attached to the node and returned. Otherwise the
	 * object is looked up in the owner document's bag; it is an invariant
	 * violation if the attribute is empty or the lookup yields nothing, and
	 * it is treated as unreachable for the object to carry a `storedId`
	 * (which storeDataAttribs() sets once the data has been stored).
	 *
	 * @param Element $node node
	 * @return stdClass
	 */
	public static function getNodeData( Element $node ): stdClass {
		if ( !$node->hasAttribute( self::DATA_OBJECT_ATTR_NAME ) ) {
			// Initialized on first request
			$dataObject = new stdClass;
			self::setNodeData( $node, $dataObject );
			return $dataObject;
		}

		$docId = $node->getAttribute( self::DATA_OBJECT_ATTR_NAME );
		if ( $docId !== '' ) {
			$dataObject = self::getBag( $node->ownerDocument )->getObject( (int)$docId );
		} else {
			$dataObject = null; // Make phan happy
		}
		Assert::invariant( isset( $dataObject ), 'Bogus docId given!' );
		'@phan-var stdClass $dataObject'; // @var stdClass $dataObject
		if ( isset( $dataObject->storedId ) ) {
			PHPUtils::unreachable(
				'Trying to fetch node data without loading! ' .
				// If this node's data-object id is different from storedId,
				// it will indicate that the data-parsoid object was shared
				// between nodes without getting cloned. Useful for debugging.
				'Node id: ' . $node->getAttribute( self::DATA_OBJECT_ATTR_NAME ) .
				'; Stored data: ' . PHPUtils::jsonEncode( $dataObject )
			);
		}
		return $dataObject;
	}

	/**
	 * Set node data.
	 *
	 * Stashes $data in the owner document's bag and records the returned id
	 * in the node's DATA_OBJECT_ATTR_NAME attribute.
	 *
	 * @param Element $node node
	 * @param stdClass $data data
	 */
	public static function setNodeData( Element $node, stdClass $data ): void {
		$docId = self::stashObjectInDoc( $node->ownerDocument, $data );
		$node->setAttribute( self::DATA_OBJECT_ATTR_NAME, (string)$docId );
	}

	/**
	 * Get data parsoid info from a node.
	 *
	 * The `parsoid` object and its `tmp` sub-object are created on demand,
	 * so the returned object always has a `tmp` property.
	 *
	 * The documented return type is intentionally narrower than the native
	 * one (see the note in storeDataAttribs()).
	 *
	 * @param Element $node node
	 * @return DataParsoid
	 */
	public static function getDataParsoid( Element $node ): stdClass {
		$data = self::getNodeData( $node );
		if ( !isset( $data->parsoid ) ) {
			$data->parsoid = new stdClass;
		}
		if ( !isset( $data->parsoid->tmp ) ) {
			$data->parsoid->tmp = new stdClass;
		}
		return $data->parsoid;
	}

	/**
	 * Set data parsoid info on a node.
	 *
	 * @param Element $node node
	 * @param stdClass $dp data-parsoid
	 */
	public static function setDataParsoid( Element $node, stdClass $dp ): void {
		$data = self::getNodeData( $node );
		$data->parsoid = $dp;
	}

	/**
	 * Get data diff info from a node.
	 *
	 * @param Element $node node
	 * @return ?stdClass Null when no diff info has been set on the node.
	 */
	public static function getDataParsoidDiff( Element $node ): ?stdClass {
		$data = self::getNodeData( $node );
		// We won't set a default value for this property
		return $data->parsoid_diff ?? null;
	}

	/**
	 * Set data diff info on a node.
	 *
	 * @param Element $node node
	 * @param ?stdClass $diffObj data-parsoid-diff object
	 */
	public static function setDataParsoidDiff( Element $node, ?stdClass $diffObj ): void {
		$data = self::getNodeData( $node );
		$data->parsoid_diff = $diffObj;
	}

	/**
	 * Get data meta wiki info from a node.
	 *
	 * An empty object is created and stored on the node's data when none is
	 * set (including when it was explicitly set to null).
	 *
	 * @param Element $node node
	 * @return stdClass
	 */
	public static function getDataMw( Element $node ): stdClass {
		$data = self::getNodeData( $node );
		if ( !isset( $data->mw ) ) {
			$data->mw = new stdClass;
		}
		return $data->mw;
	}

	/**
	 * Set data meta wiki info on a node.
	 *
	 * @param Element $node node
	 * @param ?stdClass $dmw data-mw
	 */
	public static function setDataMw( Element $node, ?stdClass $dmw ): void {
		$data = self::getNodeData( $node );
		$data->mw = $dmw;
	}

	/**
	 * Check if there is meta wiki info in a node.
	 *
	 * @param Element $node node
	 * @return bool True if the node's data-mw object has at least one property.
	 */
	public static function validDataMw( Element $node ): bool {
		return (array)self::getDataMw( $node ) !== [];
	}

	/**
	 * Get an object from a JSON-encoded XML attribute on a node.
	 *
	 * JSON objects are decoded as objects rather than associative arrays.
	 * If the attribute is absent, $defaultVal is returned. If decoding
	 * yields null, an error is logged and $defaultVal is returned.
	 *
	 * @param Element $node node
	 * @param string $name name
	 * @param mixed $defaultVal
	 * @return mixed
	 */
	public static function getJSONAttribute( Element $node, string $name, $defaultVal ) {
		if ( !$node->hasAttribute( $name ) ) {
			return $defaultVal;
		}
		$attVal = $node->getAttribute( $name );
		$decoded = PHPUtils::jsonDecode( $attVal, false );
		if ( $decoded !== null ) {
			return $decoded;
		} else {
			error_log( 'ERROR: Could not decode attribute-val ' . $attVal .
				' for ' . $name . ' on node ' . DOMCompat::nodeName( $node ) );
			return $defaultVal;
		}
	}

	/**
	 * Set an attribute on a node with a JSON-encoded object.
	 *
	 * @param Element $node node
	 * @param string $name Name of the attribute.
	 * @param mixed $obj Value to encode; an empty array is written as '{}'.
	 */
	public static function setJSONAttribute( Element $node, string $name, $obj ): void {
		// An empty PHP array is special-cased so it is emitted as an
		// (empty) JSON object.
		$val = $obj === [] ? '{}' : PHPUtils::jsonEncode( $obj );
		$node->setAttribute( $name, $val );
	}

	/**
	 * Set shadow info on a node; similar to the method on tokens.
	 * Records a key = value pair in data-parsoid['a'] property.
	 *
	 * This is effectively a call of 'setShadowInfoIfModified' except
	 * there is no original value, so by definition, $val is modified.
	 *
	 * @param Element $node node
	 * @param string $name Name of the attribute.
	 * @param mixed $val val
	 */
	public static function setShadowInfo( Element $node, string $name, $val ): void {
		$dp = self::getDataParsoid( $node );
		if ( !isset( $dp->a ) ) {
			$dp->a = [];
		}
		if ( !isset( $dp->sa ) ) {
			$dp->sa = [];
		}
		$dp->a[$name] = $val;
	}

	/**
	 * Set shadow info on a node; similar to the method on tokens.
	 *
	 * If the new value ($val) for the key ($name) is different from the
	 * original value ($origVal):
	 * - the new value is recorded in data-parsoid->a and
	 * - the original value is recorded in data-parsoid->sa
	 *
	 * Unless $skipOrig is set, nothing is recorded when $origVal is null
	 * or identical to $val. When $skipOrig is set, $val is always recorded
	 * in data-parsoid->a and data-parsoid->sa is left untouched.
	 *
	 * @param Element $node node
	 * @param string $name Name of the attribute.
	 * @param mixed $val val
	 * @param mixed $origVal original value (null is a valid value)
	 * @param bool $skipOrig
	 */
	public static function setShadowInfoIfModified(
		Element $node, string $name, $val, $origVal, bool $skipOrig = false
	): void {
		if ( !$skipOrig && ( $val === $origVal || $origVal === null ) ) {
			return;
		}
		$dp = self::getDataParsoid( $node );
		if ( !isset( $dp->a ) ) {
			$dp->a = [];
		}
		if ( !isset( $dp->sa ) ) {
			$dp->sa = [];
		}
		// FIXME: This is a hack to not overwrite already shadowed info.
		// We should either fix the call site that depends on this
		// behaviour to do an explicit check, or double down on this
		// by porting it to the token method as well.
		if ( !$skipOrig && !array_key_exists( $name, $dp->a ) ) {
			$dp->sa[$name] = $origVal;
		}
		$dp->a[$name] = $val;
	}

	/**
	 * Set an attribute and shadow info to a node.
	 * Similar to the method on tokens
	 *
	 * The 'id' attribute is set through DOMCompat::setIdAttribute(); every
	 * other attribute is set directly on the node.
	 *
	 * @param Element $node node
	 * @param string $name Name of the attribute.
	 * @param mixed $val value
	 * @param mixed $origVal original value
	 * @param bool $skipOrig
	 */
	public static function addNormalizedAttribute(
		Element $node, string $name, $val, $origVal, bool $skipOrig = false
	): void {
		if ( $name === 'id' ) {
			DOMCompat::setIdAttribute( $node, $val );
		} else {
			$node->setAttribute( $name, $val );
		}
		self::setShadowInfoIfModified( $node, $name, $val, $origVal, $skipOrig );
	}

	/**
	 * Get this document's pagebundle object
	 * @param Document $doc
	 * @return stdClass
	 */
	public static function getPageBundle( Document $doc ): stdClass {
		return self::getBag( $doc )->getPageBundle();
	}

	/**
	 * Record a node's data in the document's page bundle, keyed by the
	 * node's id attribute.
	 *
	 * A user defined id is kept when possible. If the node has no id, or
	 * its id is already a key in the page bundle, a unique id with the
	 * following format is generated and set on the node:
	 * ```
	 * mw<base64-encoded counter>
	 * ```
	 * Generated ids that are present in $idIndex are skipped.
	 *
	 * @param Element $node node
	 * @param Env $env environment
	 * @param stdClass $data data; `$data->parsoid` is stored under the
	 *   page bundle's parsoid ids and, if set, `$data->mw` under its mw ids
	 * @param array $idIndex Index of used id attributes in the DOM
	 */
	public static function storeInPageBundle(
		Element $node, Env $env, stdClass $data, array $idIndex
	): void {
		$uid = $node->getAttribute( 'id' ) ?? '';
		$document = $node->ownerDocument;
		$pb = self::getPageBundle( $document );
		$docDp = $pb->parsoid;
		// Compare against '' explicitly instead of relying on truthiness:
		// "0" is a legitimate id but is falsy in PHP, and would otherwise
		// be replaced by a generated id without being shadowed.
		$origId = $uid !== '' ? $uid : null;
		if ( array_key_exists( $uid, $docDp->ids ) ) {
			$uid = '';
			// FIXME: Protect mw ids while tokenizing to avoid false positives.
			$env->log( 'info', 'Wikitext for this page has duplicate ids: ' . $origId );
		}
		if ( $uid === '' ) {
			do {
				$docDp->counter += 1;
				// PORT-FIXME: NOTE that we aren't updating the idIndex here because
				// we are generating unique ids that will not conflict. In any case,
				// the idIndex is a workaround for the PHP DOM's issues and we might
				// switch out of this in the future anyway.
				$uid = 'mw' . PHPUtils::counterToBase64( $docDp->counter );
			} while ( isset( $idIndex[$uid] ) );
			self::addNormalizedAttribute( $node, 'id', $uid, $origId );
		}
		// $data may carry only `mw` (see storeDataAttribs()). The id is
		// still registered so that later duplicate-id checks see it.
		$docDp->ids[$uid] = $data->parsoid ?? null;
		if ( isset( $data->mw ) ) {
			$pb->mw->ids[$uid] = $data->mw;
		}
	}

	/**
	 * Serialize $obj as JSON into a
	 * `<script id="mw-pagebundle" type="application/x-mw-pagebundle">`
	 * element appended to the document's head.
	 *
	 * @param Document $doc doc
	 * @param stdClass $obj object
	 */
	public static function injectPageBundle( Document $doc, stdClass $obj ): void {
		$pb = PHPUtils::jsonEncode( $obj );
		$script = $doc->createElement( 'script' );
		DOMCompat::setIdAttribute( $script, 'mw-pagebundle' );
		$script->setAttribute( 'type', 'application/x-mw-pagebundle' );
		$script->appendChild( $doc->createTextNode( $pb ) );
		DOMCompat::getHead( $doc )->appendChild( $script );
	}

	/**
	 * Remove the element with id 'mw-pagebundle' from the document, if
	 * present, and return its JSON-decoded text content.
	 *
	 * @param Document $doc doc
	 * @return stdClass|null Null when the document has no such element.
	 */
	public static function extractPageBundle( Document $doc ): ?stdClass {
		$pb = null;
		$dpScriptElt = DOMCompat::getElementById( $doc, 'mw-pagebundle' );
		if ( $dpScriptElt ) {
			$dpScriptElt->parentNode->removeChild( $dpScriptElt );
			$pb = PHPUtils::jsonDecode( $dpScriptElt->textContent, false );
		}
		return $pb;
	}

	/**
	 * Walk DOM from node downward calling loadDataAttribs
	 *
	 * @param Node $node node
	 * @param array $options options
	 */
	public static function visitAndLoadDataAttribs( Node $node, array $options = [] ): void {
		DOMUtils::visitDOM( $node, [ self::class, 'loadDataAttribs' ], $options );
	}

	/**
	 * Massage the data parsoid object loaded from a node attribute
	 * into expected shape. When we create a first-class object for
	 * data-parsoid, this will move into the constructor.
	 *
	 * $dp is modified in place:
	 * - `sa`, `a` and each entry of `optList` are cast to arrays;
	 * - `dsr` and `extTagOffsets` are converted with DomSourceRange::fromArray();
	 * - `tsr` and `extLinkContentOffsets` are converted with SourceRange::fromArray();
	 * - with the `markNew` option, `tmp` is converted to an object and
	 *   `tmp->isNew` records whether $node lacks a data-parsoid attribute.
	 *
	 * @param stdClass $dp
	 * @param array $options
	 * @param ?Element $node Required when the `markNew` option is set.
	 * @throws \InvalidArgumentException if `markNew` is set without a $node
	 */
	public static function massageLoadedDataParsoid(
		stdClass $dp, array $options = [], ?Element $node = null
	): void {
		$markNew = !empty( $options['markNew'] );
		// Validate before touching $dp so a bad call leaves it unmodified
		// and fails with a clear message rather than a null dereference.
		if ( $markNew && $node === null ) {
			throw new \InvalidArgumentException(
				'The markNew option requires the node the data-parsoid was loaded from.'
			);
		}
		if ( isset( $dp->sa ) ) {
			$dp->sa = (array)$dp->sa;
		}
		if ( isset( $dp->a ) ) {
			$dp->a = (array)$dp->a;
		}
		if ( isset( $dp->dsr ) ) {
			$dp->dsr = DomSourceRange::fromArray( $dp->dsr );
		}
		if ( isset( $dp->tsr ) ) {
			// tsr is generally for tokens, not DOM trees.
			$dp->tsr = SourceRange::fromArray( $dp->tsr );
		}
		if ( isset( $dp->extTagOffsets ) ) {
			$dp->extTagOffsets = DomSourceRange::fromArray( $dp->extTagOffsets );
		}
		if ( isset( $dp->extLinkContentOffsets ) ) {
			$dp->extLinkContentOffsets =
				SourceRange::fromArray( $dp->extLinkContentOffsets );
		}
		if ( $markNew ) {
			$dp->tmp = PHPUtils::arrayToObject( $dp->tmp ?? [] );
			$dp->tmp->isNew = !$node->hasAttribute( 'data-parsoid' );
		}
		if ( isset( $dp->optList ) ) {
			foreach ( $dp->optList as &$item ) {
				$item = (array)$item;
			}
			// Break the reference left behind by the by-reference loop.
			unset( $item );
		}
	}

	/**
	 * These are intended be used on a document after post-processing, so that
	 * the underlying .dataobject is transparently applied (in the store case)
	 * and reloaded (in the load case), rather than worrying about keeping
	 * the attributes up-to-date throughout that phase.  For the most part,
	 * using this.ppTo* should be sufficient and using these directly should be
	 * avoided.
	 *
	 * For an Element, the data-parsoid, data-mw and data-parsoid-diff
	 * attributes are decoded into a fresh node data object and then removed
	 * from the node. Non-element nodes are ignored.
	 *
	 * @param Node $node node
	 * @param array $options options, passed on to massageLoadedDataParsoid()
	 */
	public static function loadDataAttribs( Node $node, array $options ): void {
		if ( !( $node instanceof Element ) ) {
			return;
		}
		// Reset the node data object's stored state, since we're reloading it
		self::setNodeData( $node, new stdClass );
		$dp = self::getJSONAttribute( $node, 'data-parsoid', new stdClass );
		self::massageLoadedDataParsoid( $dp, $options, $node );
		self::setDataParsoid( $node, $dp );
		$node->removeAttribute( 'data-parsoid' );
		$dmw = self::getJSONAttribute( $node, 'data-mw', null );
		self::setDataMw( $node, $dmw );
		$node->removeAttribute( 'data-mw' );
		$dpd = self::getJSONAttribute( $node, 'data-parsoid-diff', null );
		self::setDataParsoidDiff( $node, $dpd );
		$node->removeAttribute( 'data-parsoid-diff' );
	}

	/**
	 * Builds an index of id attributes seen in the DOM
	 *
	 * The whole body of $node's owner document is visited, not just the
	 * subtree rooted at $node.
	 *
	 * @param Node $node
	 * @return array Map with the seen ids as keys and `true` as values
	 */
	public static function usedIdIndex( Node $node ): array {
		$index = [];
		DOMUtils::visitDOM( DOMCompat::getBody( $node->ownerDocument ),
			static function ( Node $n, ?array $options = null ) use ( &$index ) {
				if ( $n instanceof Element && $n->hasAttribute( 'id' ) ) {
					$index[$n->getAttribute( 'id' )] = true;
				}
			},
			[]
		);
		return $index;
	}

	/**
	 * Walk DOM from node downward calling storeDataAttribs
	 *
	 * When the `storeInPageBundle` option is set, an `idIndex` option is
	 * added (see usedIdIndex()) before visiting.
	 *
	 * @param Node $node node
	 * @param array $options options
	 */
	public static function visitAndStoreDataAttribs( Node $node, array $options = [] ): void {
		// PORT-FIXME: storeDataAttribs calls storeInPageBundle which calls getElementById.
		// PHP's `getElementById` implementation is broken, and we work around that by
		// using Zest which uses XPath. So, getElementById call can be O(n) and calling it
		// on every element of the DOM via visitDOM here makes it O(n^2) instead of O(n).
		// So, we work around that by building an index and avoiding getElementById entirely
		// in storeInPageBundle.
		if ( !empty( $options['storeInPageBundle'] ) ) {
			$options['idIndex'] = self::usedIdIndex( $node );
		}
		DOMUtils::visitDOM( $node, [ self::class, 'storeDataAttribs' ], $options );
	}

	/**
	 * Write an element's in-memory data back out and detach the data object
	 * from the node. Non-element nodes are ignored.
	 *
	 * Options read by this method:
	 * - `discardDataParsoid`: do not write out data-parsoid. Must not be
	 *   combined with `keepTmp`.
	 * - `keepTmp`: keep data-parsoid's `tmp` property (minus
	 *   `tmp->tplRanges`) instead of dropping it.
	 * - `storeInPageBundle`: record data-parsoid in the page bundle instead
	 *   of the data-parsoid attribute; requires `env` and `idIndex`.
	 * - `storeDiffMark`: also write a data-parsoid-diff attribute when the
	 *   node has diff info.
	 * - `env`: the Env; with `storeInPageBundle`, non-empty data-mw also
	 *   goes to the page bundle if the output content version satisfies
	 *   ^999.0.0, otherwise it is written to the data-mw attribute.
	 * - `idIndex`: index of ids in use, passed to storeInPageBundle().
	 *
	 * data-parsoid is also discarded when `tmp->isNew` is set on it.
	 *
	 * Afterwards the node data object is flagged with `storedId` and the
	 * DATA_OBJECT_ATTR_NAME attribute is removed from the node.
	 *
	 * @param Node $node node
	 * @param ?array $options options
	 */
	public static function storeDataAttribs( Node $node, ?array $options = null ): void {
		$options = $options ?? [];
		if ( !( $node instanceof Element ) ) {
			return;
		}
		Assert::invariant( empty( $options['discardDataParsoid'] ) || empty( $options['keepTmp'] ),
			'Conflicting options: discardDataParsoid and keepTmp are both enabled.' );
		$dp = self::getDataParsoid( $node );
		// $dp will be a DataParsoid object once but currently it is an stdClass
		// with a fake type hint. Unfake it to prevent phan complaining about unset().
		'@phan-var stdClass $dp';
		// @phan-suppress-next-line PhanRedundantCondition
		$discardDataParsoid = !empty( $options['discardDataParsoid'] );
		if ( !empty( $dp->tmp->isNew ) ) {
			// Only necessary to support the cite extension's getById,
			// that's already been loaded once.
			//
			// This is basically a hack to ensure that DOMUtils.isNewElt
			// continues to work since we effectively rely on the absence
			// of data-parsoid to identify new elements. But, loadDataAttribs
			// creates an empty {} if one doesn't exist. So, this hack
			// ensures that a loadDataAttribs + storeDataAttribs pair don't
			// dirty the node by introducing an empty data-parsoid attribute
			// where one didn't exist before.
			//
			// Ideally, we'll find a better solution for this edge case later.
			$discardDataParsoid = true;
		}
		$data = null;
		if ( !$discardDataParsoid ) {
			// @phan-suppress-next-line PhanRedundantCondition
			if ( !empty( $options['keepTmp'] ) ) {
				if ( isset( $dp->tmp->tplRanges ) ) {
					unset( $dp->tmp->tplRanges );
				}
			} else {
				unset( $dp->tmp );
			}

			if ( !empty( $options['storeInPageBundle'] ) ) {
				$data = (object)[ 'parsoid' => $dp ];
			} else {
				self::setJSONAttribute( $node, 'data-parsoid', $dp );
			}
		}
		// We need to serialize diffs only under special circumstances.
		// So, do it on demand.
		if ( !empty( $options['storeDiffMark'] ) ) {
			$dpDiff = self::getDataParsoidDiff( $node );
			if ( $dpDiff ) {
				self::setJSONAttribute( $node, 'data-parsoid-diff', $dpDiff );
			}
		}
		// Strip invalid data-mw attributes
		if ( self::validDataMw( $node ) ) {
			if (
				!empty( $options['storeInPageBundle'] ) && isset( $options['env'] ) &&
				// The pagebundle didn't have data-mw before 999.x
				Semver::satisfies( $options['env']->getOutputContentVersion(), '^999.0.0' )
			) {
				// $data is still null here if data-parsoid was discarded
				$data = $data ?? new stdClass;
				$data->mw = self::getDataMw( $node );
			} else {
				self::setJSONAttribute( $node, 'data-mw', self::getDataMw( $node ) );
			}
		}
		// Store pagebundle
		if ( $data !== null ) {
			self::storeInPageBundle( $node, $options['env'], $data, $options['idIndex'] );
		}

		// Indicate that this node's data has been stored so that if we try
		// to access it after the fact we're aware and remove the attribute
		// since it's no longer needed.
		$nd = self::getNodeData( $node );
		$nd->storedId = $node->getAttribute( self::DATA_OBJECT_ATTR_NAME );
		$node->removeAttribute( self::DATA_OBJECT_ATTR_NAME );
	}
}
596