1<?php
2
3/**
4 * A bidirectional Language Converter, capable of round-tripping variant
5 * conversion.
6 *
 * Language conversion is a DOMPostProcessor pass, run over the
8 * Parsoid-format HTML output, which may have embedded language converter
9 * rules.  We first assign a (guessed) source variant to each DOM node,
10 * which will be used when round-tripping the result back to the original
11 * source variant.  Then for each applicable text node in the DOM, we
12 * first "bracket" the text, splitting it into cleanly round-trippable
13 * segments and lossy/unclean segments.  For the lossy segments we add
14 * additional metadata to the output to record the original source variant
15 * text to allow round-tripping (and variant-aware editing).
16 *
17 * Note that different wikis have different policies for source variant:
18 * in some wikis all articles are authored in one particular variant, by
19 * convention.  In others, it's a "first author gets to choose the variant"
20 * situation.  In both cases, a constant/per-article "source variant" may
21 * be specified via some as-of-yet-unimplemented mechanism; either part of
22 * the site configuration, or per-article metadata like pageLanguage.
23 * In other wikis (like zhwiki) the text is a random mix of variants; in
24 * these cases the "source variant" will be null/unspecified, and we'll
25 * dynamically pick the most likely source variant for each subtree.
26 *
27 * Each individual language has a dynamically-loaded subclass of `Language`,
28 * which may also have a `LanguageConverter` subclass to load appropriate
29 * `ReplacementMachine`s and do other language-specific customizations.
30 */
31
32namespace Wikimedia\Parsoid\Language;
33
34use DOMDocument;
35use DOMNode;
36use Wikimedia\LangConv\ReplacementMachine;
37use Wikimedia\Parsoid\Config\Env;
38use Wikimedia\Parsoid\Core\ClientError;
39use Wikimedia\Parsoid\Utils\DOMCompat;
40use Wikimedia\Parsoid\Utils\DOMDataUtils;
41use Wikimedia\Parsoid\Utils\DOMUtils;
42use Wikimedia\Parsoid\Utils\Timing;
43
/**
 * Base class for language variant conversion.
 *
 * This base class does not load any conversion tables itself:
 * loadDefaultTables() is a no-op here and the ReplacementMachine stays
 * null until setMachine() is called.  Language-specific subclasses are
 * presumably expected to override loadDefaultTables() and install a
 * machine there -- confirm against the concrete converters.
 */
class LanguageConverter {

	/** @var Language The language object this converter was created for */
	private $language;

	/** @var string The main language code of this language */
	private $langCode;

	/** @var string[] The supported variants of this language */
	private $variants;

	/** @var ?array The fallback language of each variant */
	private $variantFallbacks;

	/** @var ?ReplacementMachine Null until setMachine() has been called */
	private $machine;

	/**
	 * Store the language description and eagerly load conversion tables.
	 *
	 * Note that `$flags` and `$manualLevel` are accepted for signature
	 * compatibility but are not used by this base implementation.
	 *
	 * @param Language $language
	 * @param string $langCode The main language code of this language
	 * @param string[] $variants The supported variants of this language
	 * @param ?array $variantfallbacks The fallback language of each variant
	 * @param ?array $flags Defining the custom strings that maps to the flags
	 * @param ?array $manualLevel Limit for supported variants
	 */
	public function __construct(
		Language $language, string $langCode, array $variants,
		?array $variantfallbacks = null, ?array $flags = null,
		?array $manualLevel = null
	) {
		$this->language = $language;
		$this->langCode = $langCode;
		$this->variants = $variants; // XXX subtract disabled variants
		$this->variantFallbacks = $variantfallbacks;
		// this.mVariantNames = Language.// XXX

		// Eagerly load conversion tables.
		// XXX we could defer loading in the future, or cache more
		// aggressively
		$this->loadDefaultTables();
	}

	/**
	 * Hook invoked from the constructor to load conversion tables.
	 * The base implementation intentionally does nothing.
	 */
	public function loadDefaultTables() {
	}

	/**
	 * Return the {@link ReplacementMachine} powering this conversion.
	 * @return ?ReplacementMachine Null if no machine has been set.
	 */
	public function getMachine(): ?ReplacementMachine {
		return $this->machine;
	}

	/**
	 * Install the {@link ReplacementMachine} used for conversion.
	 * @param ReplacementMachine $machine
	 */
	public function setMachine( ReplacementMachine $machine ): void {
		$this->machine = $machine;
	}

	/**
	 * Try to return a classname from a given code.
	 *
	 * The first character of the code is upper-cased, hyphens become
	 * underscores, and slashes / leading dots are stripped, so `sr-ec`
	 * maps to `...\LanguageSr_ec`.  When going through fallback, `en` maps
	 * to the base `Language` class.
	 *
	 * @param string $code
	 * @param bool $fallback Whether we're going through language fallback
	 * @return class-string Name of the language class (if one were to exist)
	 */
	public static function classFromCode( string $code, bool $fallback ): string {
		if ( $fallback && $code === 'en' ) {
			return '\Wikimedia\Parsoid\Language\Language';
		}
		// Upper-case the leading word character, if there is one.
		$code = preg_replace_callback( '/^\w/', static function ( $matches ) {
			return strtoupper( $matches[0] );
		}, $code, 1 );
		// Plain literal replacement; no regex needed.
		$code = str_replace( '-', '_', $code );
		$code = preg_replace( '#/|^\.+#', '', $code ); // avoid path attacks
		return "\Wikimedia\Parsoid\Language\Language{$code}";
	}

	/**
	 * Instantiate the Language object for the given language code.
	 *
	 * If the code is invalid, or instantiating the computed class raises
	 * an `\Error` (e.g. the class does not exist), this logs at 'info'
	 * level and returns a plain `Language` instead.
	 *
	 * @param Env $env
	 * @param string $lang
	 * @param bool $fallback
	 * @return Language
	 */
	public static function loadLanguage( Env $env, string $lang, bool $fallback = false ): Language {
		try {
			if ( Language::isValidCode( $lang ) ) {
				$languageClass = self::classFromCode( $lang, $fallback );
				return new $languageClass();
			}
		} catch ( \Error $e ) {
			/* fall through */
		}
		// Interpolating a bool directly would render false as an empty
		// string, so spell the flag out.
		$fallbackText = $fallback ? 'true' : 'false';
		$env->log( 'info', "Couldn't load language: {$lang} fallback={$fallbackText}" );
		return new Language();
	}

	/**
	 * Unimplemented: returns its `$link` and `$nt` arguments unchanged.
	 *
	 * @param mixed $link
	 * @param mixed $nt
	 * @param mixed $ignoreOtherCond Currently ignored.
	 * @return array Array with keys 'nt' and 'link'.
	 */
	public function findVariantLink( $link, $nt, $ignoreOtherCond ) {
		// XXX unimplemented
		return [ 'nt' => $nt, 'link' => $link ];
	}

	/**
	 * Unimplemented in the base class; does nothing and returns nothing.
	 *
	 * @param string $fromVariant
	 * @param string $text
	 * @param string $toVariant
	 * @suppress PhanEmptyPublicMethod
	 */
	public function translate( $fromVariant, $text, $toVariant ) {
		// XXX unimplemented
	}

	/**
	 * Base implementation never guesses; it always returns false.
	 *
	 * @param string $text
	 * @param string $variant
	 * @return bool
	 */
	public function guessVariant( $text, $variant ) {
		return false;
	}

	/**
	 * Convert the given document into $targetVariant, if:
	 *  1) language converter is enabled on this wiki, and
	 *  2) the targetVariant is specified, and it is a known variant (not a
	 *     base language code)
	 *
	 * The `$sourceVariant`, if provided is expected to be per-wiki or
	 * per-article metadata which specifies a standard "authoring variant"
	 * for this article or wiki.  For example, all articles are authored in
	 * Cyrillic by convention.  It should be left blank if there is no
	 * consistent convention on the wiki (as for zhwiki, for instance).
	 *
	 * The target variant is recorded on the page config even when the
	 * document carries a `mw:PageProp/nocontentconvert` meta tag, in which
	 * case the actual content conversion is skipped.
	 *
	 * @param Env $env
	 * @param DOMDocument $doc The input document.
	 * @param ?string $targetVariant The desired output variant.
	 * @param ?string $sourceVariant The variant used by convention when
	 *   authoring pages, if there is one; otherwise left null.
	 */
	public static function maybeConvert(
		Env $env, DOMDocument $doc, ?string $targetVariant,
		?string $sourceVariant
	): void {
		// language converter must be enabled for the pagelanguage
		if ( !$env->langConverterEnabled() ) {
			return;
		}
		$variants = $env->getSiteConfig()->variants();

		// targetVariant must be specified, and a language-with-variants
		if ( $targetVariant === null || !array_key_exists( $targetVariant, $variants ) ) {
			return;
		}

		// targetVariant must not be a base language code
		if ( $variants[$targetVariant]['base'] === $targetVariant ) {
			// XXX in the future we probably want to go ahead and expand
			// empty <span>s left by -{...}- constructs, etc.
			return;
		}

		// Record the fact that we've done conversion to targetVariant
		$env->getPageConfig()->setVariant( $targetVariant );

		// But don't actually do the conversion if __NOCONTENTCONVERT__
		if ( DOMCompat::querySelector( $doc, 'meta[property="mw:PageProp/nocontentconvert"]' ) ) {
			return;
		}

		// OK, convert!
		self::baseToVariant( $env, DOMCompat::getBody( $doc ), $targetVariant, $sourceVariant );
	}

	/**
	 * Convert a text in the "base variant" to a specific variant, given by `targetVariant`.  If
	 * `sourceVariant` is given, assume that the input wikitext is in `sourceVariant` to
	 * construct round-trip metadata, instead of using a heuristic to guess the best variant
	 * for each DOM subtree of wikitext.
	 *
	 * If the page language has no converter, no machine, or the machine does
	 * not list `$targetVariant` among its codes, this logs at 'info' level
	 * and returns without converting.
	 *
	 * @param Env $env
	 * @param DOMNode $rootNode The root node of a fragment to convert.
	 * @param string $targetVariant The variant to be used for the output DOM.
	 * @param ?string $sourceVariant An optional variant assumed for the
	 *  input DOM in order to create roundtrip metadata.
	 * @throws ClientError If `$sourceVariant` is given but is not one of
	 *  the machine's codes.
	 */
	public static function baseToVariant(
		Env $env, DOMNode $rootNode, string $targetVariant,
		?string $sourceVariant
	): void {
		// PageConfig guarantees getPageLanguage() never returns null.
		$pageLangCode = $env->getPageConfig()->getPageLanguage();

		$metrics = $env->getSiteConfig()->metrics();
		$loadTiming = Timing::start( $metrics );
		// loadLanguage() already returns a constructed Language object, so
		// use it directly rather than instantiating its class a second time.
		$lang = self::loadLanguage( $env, $pageLangCode );
		$langconv = $lang->getConverter();
		// XXX we might want to lazily-load conversion tables here.
		$loadTiming->end( "langconv.{$targetVariant}.init" );
		$loadTiming->end( 'langconv.init' );

		// Check that the target variant is valid (and implemented!)
		$machine = $langconv !== null ? $langconv->getMachine() : null;
		$validTarget = $machine !== null
			&& array_key_exists( $targetVariant, $machine->getCodes() );
		if ( !$validTarget ) {
			// XXX create a warning header? (T197949)
			$env->log( 'info', "Unimplemented variant: {$targetVariant}" );
			return; /* no conversion */
		}
		// Check that the source variant is valid.
		$validSource = $sourceVariant === null ||
			array_key_exists( $sourceVariant, $machine->getCodes() );
		if ( !$validSource ) {
			throw new ClientError( "Invalid source variant: $sourceVariant for target $targetVariant" );
		}

		$timing = Timing::start( $metrics );
		if ( $metrics ) {
			$metrics->increment( 'langconv.count' );
			$metrics->increment( "langconv.{$targetVariant}.count" );
		}

		// XXX Eventually we'll want to consult some wiki configuration to
		// decide whether a ConstantLanguageGuesser is more appropriate.
		if ( $sourceVariant !== null ) {
			$guesser = new ConstantLanguageGuesser( $sourceVariant );
		} else {
			$guesser = new MachineLanguageGuesser(
				$machine, $rootNode, $targetVariant
			);
		}

		$ct = new ConversionTraverser( $targetVariant, $guesser, $machine );
		$ct->traverse( $env, $rootNode, [], true );

		// HACK: to avoid data-parsoid="{}" in the output, set the isNew flag
		// on synthetic spans
		DOMUtils::assertElt( $rootNode );
		foreach ( DOMCompat::querySelectorAll(
			$rootNode, 'span[typeof="mw:LanguageVariant"][data-mw-variant]'
		) as $span ) {
			$dmwv = DOMDataUtils::getJSONAttribute( $span, 'data-mw-variant', null );
			// Only spans whose data-mw-variant carries a truthy `rt`
			// member are flagged.
			if ( $dmwv->rt ?? false ) {
				$dp = DOMDataUtils::getDataParsoid( $span );
				$dp->tmp->isNew = true;
			}
		}

		$timing->end( 'langconv.total' );
		$timing->end( "langconv.{$targetVariant}.total" );
		$loadTiming->end( 'langconv.totalWithInit' );
	}
}
302