1<?php
2/**
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License as published by
5 * the Free Software Foundation; either version 2 of the License, or
6 * (at your option) any later version.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License along
14 * with this program; if not, write to the Free Software Foundation, Inc.,
15 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
16 * http://www.gnu.org/copyleft/gpl.html
17 *
18 * @file
19 * @author Zhengzhu Feng <zhengzhu@gmail.com>
20 * @author fdcn <fdcn64@gmail.com>
21 * @author shinjiman <shinjiman@gmail.com>
22 * @author PhiLiP <philip.npc@gmail.com>
23 */
24use MediaWiki\Linker\LinkTarget;
25use MediaWiki\Logger\LoggerFactory;
26use MediaWiki\MediaWikiServices;
27use MediaWiki\Revision\RevisionRecord;
28use MediaWiki\Revision\SlotRecord;
29
30/**
31 * Base class for multi-variant language conversion.
32 *
33 * @ingroup Language
34 */
35abstract class LanguageConverter implements ILanguageConverter {
36	use DeprecationHelper;
37
38	/**
39	 * languages supporting variants
40	 * @since 1.20
41	 * @var array
42	 */
	public static $languagesWithVariants = [
		'ban',
		'en',
		'crh',
		'gan',
		'iu',
		'kk',
		'ku',
		'shi',
		'sr',
		'tg',
		'uz',
		'zh',
	];

	// NOTE(review): presumably flipped by loadTables() once the tables are built;
	// that method is outside this view -- confirm.
	private $mTablesLoaded = false;

	/**
	 * Per-variant conversion tables, indexed by variant code. translate() calls
	 * replace() on an entry; applyManualConv() calls setPair()/removeArray().
	 *
	 * @var ReplacementArray[]|bool[]
	 */
	protected $mTables;

	/**
	 * Language object handed to the constructor.
	 *
	 * @var Language
	 */
	private $mLangObj;

	private $mUcfirst = false;
	// Title supplied by the most recent manual rule that had one (see
	// applyManualConv()); convertTo() resets it to false at the start of a run.
	private $mConvRuleTitle = false;
	// Memoized result of getURLVariant().
	private $mURLVariant;
	// Last value computed by getUserVariant().
	private $mUserVariant;
	// Memoized result of getHeaderVariant().
	private $mHeaderVariant;
	// Maximum nesting depth of -{ ... }- rules accepted by recursiveConvertRule().
	private $mMaxDepth = 10;
	private $mVarSeparatorPattern;

	private const CACHE_VERSION_KEY = 'VERSION 7';
79
80	/**
81	 * @param Language $langobj
82	 */
83	public function __construct( $langobj ) {
84		$this->deprecatePublicProperty( 'mUcfirst', '1.35', __CLASS__ );
85		$this->deprecatePublicProperty( 'mConvRuleTitle', '1.35', __CLASS__ );
86		$this->deprecatePublicProperty( 'mUserVariant', '1.35', __CLASS__ );
87		$this->deprecatePublicProperty( 'mHeaderVariant', '1.35', __CLASS__ );
88		$this->deprecatePublicProperty( 'mMaxDepth = 10', '1.35', __CLASS__ );
89		$this->deprecatePublicProperty( 'mVarSeparatorPattern', '1.35', __CLASS__ );
90		$this->deprecatePublicProperty( 'mLangObj', '1.35', __CLASS__ );
91		$this->deprecatePublicProperty( 'mVariantFallbacks', '1.35', __CLASS__ );
92		$this->deprecatePublicProperty( 'mTablesLoaded', '1.35', __CLASS__ );
93		$this->deprecatePublicProperty( 'mTables', '1.35', __CLASS__ );
94
95		$this->mLangObj = $langobj;
96
97		$this->deprecatePublicPropertyFallback( 'mVariants', '1.36', function () {
98			return $this->getVariants();
99		} );
100
101		$this->deprecatePublicPropertyFallback( 'mMainLanguageCode', '1.36', function () {
102			return $this->getMainCode();
103		} );
104
105		$this->deprecatePublicPropertyFallback( 'mVariantFallbacks', '1.36', function () {
106			return $this->getVariantsFallbacks();
107		} );
108
109		$this->deprecatePublicPropertyFallback( 'mFlags', '1.36', function () {
110			return $this->getFlags();
111		} );
112
113		$this->deprecatePublicPropertyFallback( 'mVariantNames', '1.36', function () {
114			return $this->getVariantNames();
115		} );
116
117		$this->deprecatePublicPropertyFallback( 'mDescCodeSep', '1.36', function () {
118			return $this->getDescCodeSeparator();
119		} );
120
121		$this->deprecatePublicPropertyFallback( 'mDescVarSep', '1.36', function () {
122			return $this->getDescVarSeparator();
123		} );
124	}
125
126	/**
127	 * Get main language code.
128	 * @since 1.36
129	 *
130	 * @return string
131	 */
	abstract public function getMainCode(): string;

	/**
	 * Get supported variants of the language.
	 *
	 * getVariants() removes the codes listed in $wgDisabledVariants from this
	 * list, so implementations return the complete set of variant codes.
	 * @since 1.36
	 *
	 * @return array
	 */
	abstract protected function getLanguageVariants(): array;

	/**
	 * Get language variants fallbacks.
	 *
	 * getVariantFallbacks() looks entries up by variant code and uses
	 * getMainCode() for codes that have no entry here.
	 * @since 1.36
	 *
	 * @return array
	 */
	abstract public function getVariantsFallbacks(): array;
149
150	/**
151	 * Get strings that maps to the flags.
152	 * @since 1.36
153	 *
154	 * @return array
155	 */
156	final public function getFlags(): array {
157		$defaultflags = [
158			// 'S' show converted text
159			// '+' add rules for alltext
160			// 'E' the gave flags is error
161			// these flags above are reserved for program
162			'A' => 'A',   // add rule for convert code (all text convert)
163			'T' => 'T',   // title convert
164			'R' => 'R',   // raw content
165			'D' => 'D',   // convert description (subclass implement)
166			'-' => '-',   // remove convert (not implement)
167			'H' => 'H',   // add rule for convert code (but no display in placed code)
168			'N' => 'N',   // current variant name
169		];
170		$flags = array_merge( $defaultflags, $this->getAdditionalFlags() );
171		foreach ( $this->getVariants() as $v ) {
172			$flags[$v] = $v;
173		}
174		return $flags;
175	}
176
177	/**
	 * Provides additional flags for the converter. By default it returns an empty
	 * array; it is typically overridden by the converter implementation.
180	 *
181	 * @return array
182	 */
183	protected function getAdditionalFlags(): array {
184		return [];
185	}
186
187	/**
188	 * Get manual level limit for supported variants.
189	 * @since 1.36
190	 *
191	 * @return array
192	 */
193	final public function getManualLevel() {
194		$manualLevel  = $this->getAdditionalManualLevel();
195		$result = [];
196		foreach ( $this->getVariants() as $v ) {
197			if ( array_key_exists( $v, $manualLevel ) ) {
198				$result[$v] = $manualLevel[$v];
199			} else {
200				$result[$v] = 'bidirectional';
201			}
202		}
203		return $result;
204	}
205
206	/**
	 * Provides per-variant manual level overrides for the converter. By default it
	 * returns an empty array; it is typically overridden by the converter implementation.
209	 * @since 1.36
210	 *
211	 * @return array
212	 */
213	protected function getAdditionalManualLevel(): array {
214		return [];
215	}
216
217	/**
218	 * Get desc code separator. By default returns ":", can be overridden by
219	 * implementation of converter.
220	 * @since 1.36
221	 *
222	 * @return string
223	 */
224	public function getDescCodeSeparator(): string {
225		return ':';
226	}
227
228	/**
229	 * Get desc var separator. By default returns ";", can be overridden by
230	 * implementation of converter.
231	 * @since 1.36
232	 *
233	 * @return string
234	 */
235	public function getDescVarSeparator(): string {
236		return ';';
237	}
238
239	/**
240	 * Get variant names.
241	 *
242	 * @return array
243	 */
244	public function getVariantNames(): array {
245		return MediaWikiServices::getInstance()
246			->getLanguageNameUtils()
247			->getLanguageNames();
248	}
249
250	/**
	 * Get all valid variants for the current converter. It uses the abstract
	 * getLanguageVariants() and removes the codes listed in $wgDisabledVariants.
252	 *
253	 * @return string[] Contains all valid variants
254	 */
255	final public function getVariants() {
256		global $wgDisabledVariants;
257		return array_diff( $this->getLanguageVariants(), $wgDisabledVariants );
258	}
259
260	/**
261	 * In case some variant is not defined in the markup, we need
262	 * to have some fallback. For example, in zh, normally people
263	 * will define zh-hans and zh-hant, but less so for zh-sg or zh-hk.
264	 * when zh-sg is preferred but not defined, we will pick zh-hans
265	 * in this case. Right now this is only used by zh.
266	 *
267	 * @param string $variant The language code of the variant
268	 * @return string|array The code of the fallback language or the
269	 *   main code if there is no fallback
270	 */
271	public function getVariantFallbacks( $variant ) {
272		return $this->getVariantsFallbacks()[$variant] ?? $this->getMainCode();
273	}
274
275	/**
276	 * Get the title produced by the conversion rule.
277	 * @return string The converted title text
278	 */
279	public function getConvRuleTitle() {
280		return $this->mConvRuleTitle;
281	}
282
283	/**
284	 * Get preferred language variant.
285	 * @return string The preferred language code
286	 */
287	public function getPreferredVariant() {
288		global $wgDefaultLanguageVariant, $wgUser;
289
290		$req = $this->getURLVariant();
291
292		Hooks::runner()->onGetLangPreferredVariant( $req );
293
294		// NOTE: For calls from Setup.php, wgUser or the session might not be set yet (T235360)
295		// Use case: During autocreation, User::isUsableName is called which uses interface
296		// messages for reserved usernames.
297		if ( $wgUser && $wgUser->isSafeToLoad() && $wgUser->isRegistered() && !$req ) {
298			$req = $this->getUserVariant( $wgUser );
299		} elseif ( !$req ) {
300			$req = $this->getHeaderVariant();
301		}
302
303		if ( $wgDefaultLanguageVariant && !$req ) {
304			$req = $this->validateVariant( $wgDefaultLanguageVariant );
305		}
306
307		$req = $this->validateVariant( $req );
308
309		// This function, unlike the other get*Variant functions, is
310		// not memoized (i.e. there return value is not cached) since
311		// new information might appear during processing after this
312		// is first called.
313		if ( $req ) {
314			return $req;
315		}
316		return $this->getMainCode();
317	}
318
319	/**
320	 * This function would not be affected by user's settings
321	 * @return string The default variant code
322	 */
323	public function getDefaultVariant() {
324		global $wgDefaultLanguageVariant;
325
326		$req = $this->getURLVariant();
327
328		if ( !$req ) {
329			$req = $this->getHeaderVariant();
330		}
331
332		if ( $wgDefaultLanguageVariant && !$req ) {
333			$req = $this->validateVariant( $wgDefaultLanguageVariant );
334		}
335
336		if ( $req ) {
337			return $req;
338		}
339		return $this->getMainCode();
340	}
341
342	/**
343	 * Validate the variant and return an appropriate strict internal
344	 * variant code if one exists.  Compare to Language::hasVariant()
345	 * which does a strict test.
346	 *
347	 * @param string|null $variant The variant to validate
348	 * @return mixed Returns an equivalent valid variant code if possible,
349	 *   null otherwise
350	 */
351	public function validateVariant( $variant = null ) {
352		if ( $variant === null ) {
353			return null;
354		}
355		// Our internal variants are always lower-case; the variant we
356		// are validating may have mixed case.
357		$variant = LanguageCode::replaceDeprecatedCodes( strtolower( $variant ) );
358		if ( in_array( $variant, $this->getVariants() ) ) {
359			return $variant;
360		}
361		// Browsers are supposed to use BCP 47 standard in the
362		// Accept-Language header, but not all of our internal
363		// mediawiki variant codes are BCP 47.  Map BCP 47 code
364		// to our internal code.
365		foreach ( $this->getVariants() as $v ) {
366			// Case-insensitive match (BCP 47 is mixed case)
367			if ( strtolower( LanguageCode::bcp47( $v ) ) === $variant ) {
368				return $v;
369			}
370		}
371		return null;
372	}
373
374	/**
375	 * Get the variant specified in the URL
376	 *
377	 * @return mixed Variant if one found, null otherwise
378	 */
379	public function getURLVariant() {
380		global $wgRequest;
381
382		if ( $this->mURLVariant ) {
383			return $this->mURLVariant;
384		}
385
386		// see if the preference is set in the request
387		$ret = $wgRequest->getText( 'variant' );
388
389		if ( !$ret ) {
390			$ret = $wgRequest->getVal( 'uselang' );
391		}
392
393		$this->mURLVariant = $this->validateVariant( $ret );
394		return $this->mURLVariant;
395	}
396
397	/**
398	 * Determine if the user has a variant set.
399	 *
400	 * @param User $user
401	 * @return mixed Variant if one found, null otherwise
402	 */
403	protected function getUserVariant( User $user ) {
404		// This should only be called within the class after the user is known to be
405		// safe to load and logged in, but check just in case.
406		if ( !$user->isSafeToLoad() ) {
407			return false;
408		}
409
410		if ( $user->isRegistered() ) {
411			// Get language variant preference from logged in users
412			if (
413				$this->getMainCode() ==
414				MediaWikiServices::getInstance()->getContentLanguage()->getCode()
415			) {
416				$ret = $user->getOption( 'variant' );
417			} else {
418				$ret = $user->getOption( 'variant-' . $this->getMainCode() );
419			}
420		} else {
421			// figure out user lang without constructing wgLang to avoid
422			// infinite recursion
423			$ret = $user->getOption( 'language' );
424		}
425
426		$this->mUserVariant = $this->validateVariant( $ret );
427		return $this->mUserVariant;
428	}
429
430	/**
431	 * Determine the language variant from the Accept-Language header.
432	 *
433	 * @return mixed Variant if one found, null otherwise
434	 */
435	protected function getHeaderVariant() {
436		global $wgRequest;
437
438		if ( $this->mHeaderVariant ) {
439			return $this->mHeaderVariant;
440		}
441
442		// See if some supported language variant is set in the
443		// HTTP header.
444		$languages = array_keys( $wgRequest->getAcceptLang() );
445		if ( empty( $languages ) ) {
446			return null;
447		}
448
449		$fallbackLanguages = [];
450		foreach ( $languages as $language ) {
451			$this->mHeaderVariant = $this->validateVariant( $language );
452			if ( $this->mHeaderVariant ) {
453				break;
454			}
455
456			// To see if there are fallbacks of current language.
457			// We record these fallback variants, and process
458			// them later.
459			$fallbacks = $this->getVariantFallbacks( $language );
460			if ( is_string( $fallbacks ) && $fallbacks !== $this->getMainCode() ) {
461				$fallbackLanguages[] = $fallbacks;
462			} elseif ( is_array( $fallbacks ) ) {
463				$fallbackLanguages =
464					array_merge( $fallbackLanguages, $fallbacks );
465			}
466		}
467
468		if ( !$this->mHeaderVariant ) {
469			// process fallback languages now
470			$fallback_languages = array_unique( $fallbackLanguages );
471			foreach ( $fallback_languages as $language ) {
472				$this->mHeaderVariant = $this->validateVariant( $language );
473				if ( $this->mHeaderVariant ) {
474					break;
475				}
476			}
477		}
478
479		return $this->mHeaderVariant;
480	}
481
482	/**
483	 * Dictionary-based conversion.
484	 * This function would not parse the conversion rules.
485	 * If you want to parse rules, try to use convert() or
486	 * convertTo().
487	 *
488	 * @param string $text The text to be converted
489	 * @param bool|string $toVariant The target language code
490	 * @return string The converted text
491	 */
	public function autoConvert( $text, $toVariant = false ) {
		// Overview: split $text into alternating "convertible text" and "literal
		// markup" pieces, translate all convertible pieces in one batch via
		// translate(), then interleave the results with the untouched literals.
		$this->loadTables();

		if ( !$toVariant ) {
			$toVariant = $this->getPreferredVariant();
			if ( !$toVariant ) {
				return $text;
			}
		}

		// Nothing to do when the text already appears to be in the target variant.
		if ( $this->guessVariant( $text, $toVariant ) ) {
			return $text;
		}
		/* we convert everything except:
		   1. HTML markups (anything between < and >)
		   2. HTML entities
		   3. placeholders created by the parser
		   IMPORTANT: Beware of failure from pcre.backtrack_limit (T124404).
		   Minimize use of backtracking where possible.
		*/
		// The pattern never changes, so it is assembled once and kept in a
		// static local.
		static $reg;
		if ( $reg === null ) {
			$marker = '|' . Parser::MARKER_PREFIX . '[^\x7f]++\x7f';

			// this one is needed when the text is inside an HTML markup
			$htmlfix = '|<[^>\004]++(?=\004$)|^[^<>]*+>';

			// Optimize for the common case where these tags have
			// few or no children. Thus try and possessively get as much as
			// possible, and only engage in backtracking when we hit a '<'.

			// disable convert to variants between <code> tags
			$codefix = '<code>[^<]*+(?:(?:(?!<\/code>).)[^<]*+)*+<\/code>|';
			// disable conversion of <script> tags
			$scriptfix = '<script[^>]*+>[^<]*+(?:(?:(?!<\/script>).)[^<]*+)*+<\/script>|';
			// disable conversion of <pre> tags
			$prefix = '<pre[^>]*+>[^<]*+(?:(?:(?!<\/pre>).)[^<]*+)*+<\/pre>|';
			// The "|.*+)" at the end, is in case we missed some part of html syntax,
			// we will fail securely (hopefully) by matching the rest of the string.
			$htmlFullTag = '<(?:[^>=]*+(?>[^>=]*+=\s*+(?:"[^"]*"|\'[^\']*\'|[^\'">\s]*+))*+[^>=]*+>|.*+)|';

			$reg = '/' . $codefix . $scriptfix . $prefix . $htmlFullTag .
				 '&[a-zA-Z#][a-z0-9]++;' . $marker . $htmlfix . '|\004$/s';
		}
		$startPos = 0;
		// Convertible fragments and literal elements, each terminated by "\000"
		// so the two blobs can be split and re-interleaved after translation.
		$sourceBlob = '';
		$literalBlob = '';

		// Guard against delimiter nulls in the input
		// (should never happen: see T159174)
		$text = str_replace( "\000", '', $text );
		$text = str_replace( "\004", '', $text );

		$markupMatches = null;
		$elementMatches = null;

		// We add a marker (\004) at the end of text, to ensure we always match the
		// entire text (Otherwise, pcre.backtrack_limit might cause silent failure)
		$textWithMarker = $text . "\004";
		while ( $startPos < strlen( $text ) ) {
			if ( preg_match( $reg, $textWithMarker, $markupMatches, PREG_OFFSET_CAPTURE, $startPos ) ) {
				$elementPos = $markupMatches[0][1];
				$element = $markupMatches[0][0];
				if ( $element === "\004" ) {
					// We hit the end.
					$elementPos = strlen( $text );
					$element = '';
				} elseif ( substr( $element, -1 ) === "\004" ) {
					// This can sometimes happen if we have
					// unclosed html tags (For example
					// when converting a title attribute
					// during a recursive call that contains
					// a &lt; e.g. <div title="&lt;">.
					$element = substr( $element, 0, -1 );
				}
			} else {
				// If we hit here, then Language Converter could be tricked
				// into doing an XSS, so we refuse to translate.
				// If non-crazy input manages to reach this code path,
				// we should consider it a bug.
				$log = LoggerFactory::getInstance( 'languageconverter' );
				$log->error( "Hit pcre.backtrack_limit in " . __METHOD__
					. ". Disabling language conversion for this page.",
					[
						"method" => __METHOD__,
						"variant" => $toVariant,
						"startOfText" => substr( $text, 0, 500 )
					]
				);
				return $text;
			}
			// Queue the part before the markup for translation in a batch
			$sourceBlob .= substr( $text, $startPos, $elementPos - $startPos ) . "\000";

			// Advance to the next position
			$startPos = $elementPos + strlen( $element );

			// Translate any alt or title attributes inside the matched element
			// (capture groups: 1 = "<tagname", 2 = attribute text, 3 = the rest).
			if ( $element !== ''
				&& preg_match( '/^(<[^>\s]*+)\s([^>]*+)(.*+)$/', $element, $elementMatches )
			) {
				// FIXME, this decodes entities, so if you have something
				// like <div title="foo&lt;bar"> the bar won't get
				// translated since after entity decoding it looks like
				// unclosed html and we call this method recursively
				// on attributes.
				$attrs = Sanitizer::decodeTagAttributes( $elementMatches[2] );
				// Ensure self-closing tags stay self-closing.
				$close = substr( $elementMatches[2], -1 ) === '/' ? ' /' : '';
				$changed = false;
				foreach ( [ 'title', 'alt' ] as $attrName ) {
					if ( !isset( $attrs[$attrName] ) ) {
						continue;
					}
					$attr = $attrs[$attrName];
					// Don't convert URLs
					// NOTE(review): strpos() returns 0 for a value that *starts*
					// with '://', which this check treats the same as "not found".
					if ( !strpos( $attr, '://' ) ) {
						$attr = $this->recursiveConvertTopLevel( $attr, $toVariant );
					}

					if ( $attr !== $attrs[$attrName] ) {
						$attrs[$attrName] = $attr;
						$changed = true;
					}
				}
				// Only rebuild the tag when an attribute actually changed, so
				// untouched elements are emitted exactly as they were matched.
				if ( $changed ) {
					// @phan-suppress-next-line SecurityCheck-DoubleEscaped Explained above with decodeTagAttributes
					$element = $elementMatches[1] . Html::expandAttributes( $attrs ) .
						$close . $elementMatches[3];
				}
			}
			$literalBlob .= $element . "\000";
		}

		// Do the main translation batch
		$translatedBlob = $this->translate( $sourceBlob, $toVariant );

		// Put the output back together
		// Both blobs were built in lockstep (one "\000"-terminated piece per loop
		// iteration), so walking them in parallel restores the original order.
		$translatedIter = StringUtils::explode( "\000", $translatedBlob );
		$literalIter = StringUtils::explode( "\000", $literalBlob );
		$output = '';
		while ( $translatedIter->valid() && $literalIter->valid() ) {
			$output .= $translatedIter->current();
			$output .= $literalIter->current();
			$translatedIter->next();
			$literalIter->next();
		}

		return $output;
	}
642
643	/**
644	 * Translate a string to a variant.
645	 * Doesn't parse rules or do any of that other stuff, for that use
646	 * convert() or convertTo().
647	 *
648	 * @param string $text Text to convert
649	 * @param string $variant Variant language code
650	 * @return string Translated text
651	 */
652	public function translate( $text, $variant ) {
653		// If $text is empty or only includes spaces, do nothing
654		// Otherwise translate it
655		if ( trim( $text ) ) {
656			$this->loadTables();
657			$text = $this->mTables[$variant]->replace( $text );
658		}
659		return $text;
660	}
661
662	/**
663	 * Call translate() to convert text to all valid variants.
664	 *
665	 * @param string $text The text to be converted
666	 * @return array Variant => converted text
667	 */
668	public function autoConvertToAllVariants( $text ) {
669		$this->loadTables();
670
671		$ret = [];
672		foreach ( $this->getVariants() as $variant ) {
673			$ret[$variant] = $this->translate( $text, $variant );
674		}
675
676		return $ret;
677	}
678
679	/**
680	 * Apply manual conversion rules.
681	 *
682	 * @param ConverterRule $convRule
683	 */
684	protected function applyManualConv( ConverterRule $convRule ) {
685		// Use syntax -{T|zh-cn:TitleCN; zh-tw:TitleTw}- to custom
686		// title conversion.
687		// T26072: $mConvRuleTitle was overwritten by other manual
688		// rule(s) not for title, this breaks the title conversion.
689		$newConvRuleTitle = $convRule->getTitle();
690		if ( $newConvRuleTitle ) {
691			// So I add an empty check for getTitle()
692			$this->mConvRuleTitle = $newConvRuleTitle;
693		}
694
695		// merge/remove manual conversion rules to/from global table
696		$convTable = $convRule->getConvTable();
697		$action = $convRule->getRulesAction();
698		foreach ( $convTable as $variant => $pair ) {
699			$v = $this->validateVariant( $variant );
700			if ( !$v ) {
701				continue;
702			}
703
704			if ( $action == 'add' ) {
705				// More efficient than array_merge(), about 2.5 times.
706				foreach ( $pair as $from => $to ) {
707					$this->mTables[$v]->setPair( $from, $to );
708				}
709			} elseif ( $action == 'remove' ) {
710				$this->mTables[$v]->removeArray( $pair );
711			}
712		}
713	}
714
715	/**
716	 * Auto convert a LinkTarget object to a readable string in the
717	 * preferred variant.
718	 *
719	 * @param LinkTarget $linkTarget
720	 * @return string Converted title text
721	 */
722	public function convertTitle( LinkTarget $linkTarget ) {
723		$variant = $this->getPreferredVariant();
724		$index = $linkTarget->getNamespace();
725		if ( $index !== NS_MAIN ) {
726			$text = $this->convertNamespace( $index, $variant ) . ':';
727		} else {
728			$text = '';
729		}
730		$text .= $this->translate( $linkTarget->getText(), $variant );
731
732		return $text;
733	}
734
735	/**
736	 * Get the namespace display name in the preferred variant.
737	 *
738	 * @param int $index Namespace id
739	 * @param string|null $variant Variant code or null for preferred variant
740	 * @return string Namespace name for display
741	 */
742	public function convertNamespace( $index, $variant = null ) {
743		if ( $index === NS_MAIN ) {
744			return '';
745		}
746
747		if ( $variant === null ) {
748			$variant = $this->getPreferredVariant();
749		}
750
751		$cache = MediaWikiServices::getInstance()->getLocalServerObjectCache();
752		$key = $cache->makeKey( 'languageconverter', 'namespace-text', $index, $variant );
753		$nsVariantText = $cache->get( $key );
754		if ( $nsVariantText !== false ) {
755			return $nsVariantText;
756		}
757
758		// First check if a message gives a converted name in the target variant.
759		$nsConvMsg = wfMessage( 'conversion-ns' . $index )->inLanguage( $variant );
760		if ( $nsConvMsg->exists() ) {
761			$nsVariantText = $nsConvMsg->plain();
762		}
763
764		// Then check if a message gives a converted name in content language
765		// which needs extra translation to the target variant.
766		if ( $nsVariantText === false ) {
767			$nsConvMsg = wfMessage( 'conversion-ns' . $index )->inContentLanguage();
768			if ( $nsConvMsg->exists() ) {
769				$nsVariantText = $this->translate( $nsConvMsg->plain(), $variant );
770			}
771		}
772
773		if ( $nsVariantText === false ) {
774			// No message exists, retrieve it from the target variant's namespace names.
775			$mLangObj = MediaWikiServices::getInstance()
776				->getLanguageFactory()
777				->getLanguage( $variant );
778			$nsVariantText = $mLangObj->getFormattedNsText( $index );
779		}
780
781		$cache->set( $key, $nsVariantText, 60 );
782
783		return $nsVariantText;
784	}
785
786	/**
787	 * Convert text to different variants of a language. The automatic
788	 * conversion is done in autoConvert(). Here we parse the text
789	 * marked with -{}-, which specifies special conversions of the
790	 * text that can not be accomplished in autoConvert().
791	 *
792	 * Syntax of the markup:
793	 * -{code1:text1;code2:text2;...}-  or
794	 * -{flags|code1:text1;code2:text2;...}-  or
795	 * -{text}- in which case no conversion should take place for text
796	 *
797	 * @warning Glossary state is maintained between calls. Never feed this
798	 *   method input that hasn't properly been escaped as it may result in
799	 *   an XSS in subsequent calls, even if those subsequent calls properly
800	 *   escape things.
801	 * @param string $text Text to be converted, already html escaped.
802	 * @return string Converted text (html)
803	 */
804	public function convert( $text ) {
805		$variant = $this->getPreferredVariant();
806		return $this->convertTo( $text, $variant );
807	}
808
809	/**
810	 * Same as convert() except a extra parameter to custom variant.
811	 *
812	 * @param string $text Text to be converted, already html escaped
813	 * @param-taint $text exec_html
814	 * @param string $variant The target variant code
815	 * @return string Converted text
816	 * @return-taint escaped
817	 */
818	public function convertTo( $text, $variant ) {
819		$languageConverterFactory = MediaWikiServices::getInstance()->getLanguageConverterFactory();
820		if ( $languageConverterFactory->isConversionDisabled() ) {
821			return $text;
822		}
823		// Reset converter state for a new converter run.
824		$this->mConvRuleTitle = false;
825		return $this->recursiveConvertTopLevel( $text, $variant );
826	}
827
828	/**
829	 * Recursively convert text on the outside. Allow to use nested
830	 * markups to custom rules.
831	 *
832	 * @param string $text Text to be converted
833	 * @param string $variant The target variant code
834	 * @param int $depth Depth of recursion
835	 * @return string Converted text
836	 */
837	protected function recursiveConvertTopLevel( $text, $variant, $depth = 0 ) {
838		$startPos = 0;
839		$out = '';
840		$length = strlen( $text );
841		$shouldConvert = !$this->guessVariant( $text, $variant );
842		$continue = true;
843
844		$noScript = '<script.*?>.*?<\/script>(*SKIP)(*FAIL)';
845		$noStyle = '<style.*?>.*?<\/style>(*SKIP)(*FAIL)';
846		// phpcs:ignore Generic.Files.LineLength
847		$noHtml = '<(?:[^>=]*+(?>[^>=]*+=\s*+(?:"[^"]*"|\'[^\']*\'|[^\'">\s]*+))*+[^>=]*+>|.*+)(*SKIP)(*FAIL)';
848		while ( $startPos < $length && $continue ) {
849			$continue = preg_match(
850				// Only match -{ outside of html.
851				"/$noScript|$noStyle|$noHtml|-\{/",
852				$text,
853				$m,
854				PREG_OFFSET_CAPTURE,
855				$startPos
856			);
857
858			if ( !$continue ) {
859				// No more markup, append final segment
860				$fragment = substr( $text, $startPos );
861				$out .= $shouldConvert ? $this->autoConvert( $fragment, $variant ) : $fragment;
862				return $out;
863			}
864
865			// Offset of the match of the regex pattern.
866			$pos = $m[0][1];
867
868			// Append initial segment
869			$fragment = substr( $text, $startPos, $pos - $startPos );
870			$out .= $shouldConvert ? $this->autoConvert( $fragment, $variant ) : $fragment;
871			// -{ marker found, not in attribute
872			// Advance position up to -{ marker.
873			$startPos = $pos;
874			// Do recursive conversion
875			// Note: This passes $startPos by reference, and advances it.
876			$out .= $this->recursiveConvertRule( $text, $variant, $startPos, $depth + 1 );
877		}
878		return $out;
879	}
880
881	/**
882	 * Recursively convert text on the inside.
883	 *
884	 * @param string $text Text to be converted
885	 * @param string $variant The target variant code
886	 * @param int &$startPos
887	 * @param int $depth Depth of recursion
888	 *
889	 * @throws MWException
890	 * @return string Converted text
891	 */
	protected function recursiveConvertRule( $text, $variant, &$startPos, $depth = 0 ) {
		// Parses one -{ ... }- rule starting at $startPos, recursing for nested
		// rules. $startPos is advanced as a side effect: on return it points just
		// past the closing }-, or at the end of $text if the rule was never closed.

		// Quick sanity check (no function calls)
		if ( $text[$startPos] !== '-' || $text[$startPos + 1] !== '{' ) {
			throw new MWException( __METHOD__ . ': invalid input string' );
		}

		// Skip the opening -{
		$startPos += 2;
		// Accumulates the rule's content, with nested rules already resolved.
		$inner = '';
		// Ensures the depth warning is emitted at most once per rule.
		$warningDone = false;
		$length = strlen( $text );

		while ( $startPos < $length ) {
			$m = false;
			// Find the next opening or closing marker, whichever comes first.
			preg_match( '/-\{|\}-/', $text, $m, PREG_OFFSET_CAPTURE, $startPos );
			if ( !$m ) {
				// Unclosed rule
				break;
			}

			$token = $m[0][0];
			$pos = $m[0][1];

			// Markup found
			// Append initial segment
			$inner .= substr( $text, $startPos, $pos - $startPos );

			// Advance position
			$startPos = $pos;

			switch ( $token ) {
				case '-{':
					// Check max depth
					if ( $depth >= $this->mMaxDepth ) {
						// Too deep: keep the marker as literal text instead of
						// recursing, and warn once.
						$inner .= '-{';
						if ( !$warningDone ) {
							$inner .= '<span class="error">' .
								wfMessage( 'language-converter-depth-warning' )
									->numParams( $this->mMaxDepth )->inContentLanguage()->text() .
								'</span>';
							$warningDone = true;
						}
						$startPos += 2;
						break;
					}
					// Recursively parse another rule
					// (the recursive call advances $startPos past the nested rule)
					$inner .= $this->recursiveConvertRule( $text, $variant, $startPos, $depth + 1 );
					break;
				case '}-':
					// Apply the rule
					$startPos += 2;
					$rule = new ConverterRule( $inner, $this );
					$rule->parse( $variant );
					// Registers the rule's title and conversion pairs on this converter.
					$this->applyManualConv( $rule );
					return $rule->getDisplay();
				default:
					throw new MWException( __METHOD__ . ': invalid regex match' );
			}
		}

		// Unclosed rule
		// Emit the opening marker literally and auto-convert everything after it.
		if ( $startPos < $length ) {
			$inner .= substr( $text, $startPos );
		}
		$startPos = $length;
		return '-{' . $this->autoConvert( $inner, $variant );
	}
958
959	/**
960	 * If a language supports multiple variants, it is possible that
961	 * non-existing link in one variant actually exists in another variant.
962	 * This function tries to find it. See e.g. LanguageZh.php
963	 * The input parameters may be modified upon return
964	 *
965	 * @param string &$link The name of the link
966	 * @param Title &$nt The title object of the link
967	 * @param bool $ignoreOtherCond To disable other conditions when
968	 *   we need to transclude a template or update a category's link
969	 */
970	public function findVariantLink( &$link, &$nt, $ignoreOtherCond = false ) {
971		# If the article has already existed, there is no need to
972		# check it again, otherwise it may cause a fault.
973		if ( is_object( $nt ) && $nt->exists() ) {
974			return;
975		}
976
977		global $wgRequest;
978		$isredir = $wgRequest->getText( 'redirect', 'yes' );
979		$action = $wgRequest->getText( 'action' );
980		if ( $action == 'edit' && $wgRequest->getBool( 'redlink' ) ) {
981			$action = 'view';
982		}
983		$linkconvert = $wgRequest->getText( 'linkconvert', 'yes' );
984		$disableLinkConversion =
985			MediaWikiServices::getInstance()->getLanguageConverterFactory()
986			->isLinkConversionDisabled();
987		$linkBatchFactory = MediaWikiServices::getInstance()->getLinkBatchFactory();
988		$linkBatch = $linkBatchFactory->newLinkBatch();
989
990		$ns = NS_MAIN;
991
992		if ( $disableLinkConversion ||
993			( !$ignoreOtherCond &&
994				( $isredir == 'no'
995					|| $action == 'edit'
996					|| $action == 'submit'
997					|| $linkconvert == 'no' ) ) ) {
998			return;
999		}
1000
1001		if ( is_object( $nt ) ) {
1002			$ns = $nt->getNamespace();
1003		}
1004
1005		$variants = $this->autoConvertToAllVariants( $link );
1006		if ( !$variants ) { // give up
1007			return;
1008		}
1009
1010		$titles = [];
1011
1012		foreach ( $variants as $v ) {
1013			if ( $v != $link ) {
1014				$varnt = Title::newFromText( $v, $ns );
1015				if ( $varnt !== null ) {
1016					$linkBatch->addObj( $varnt );
1017					$titles[] = $varnt;
1018				}
1019			}
1020		}
1021
1022		// fetch all variants in single query
1023		$linkBatch->execute();
1024
1025		foreach ( $titles as $varnt ) {
1026			if ( $varnt->getArticleID() > 0 ) {
1027				$nt = $varnt;
1028				$link = $varnt->getText();
1029				break;
1030			}
1031		}
1032	}
1033
1034	/**
1035	 * Returns language specific hash options.
1036	 *
1037	 * @return string
1038	 */
1039	public function getExtraHashOptions() {
1040		$variant = $this->getPreferredVariant();
1041
1042		return '!' . $variant;
1043	}
1044
1045	/**
1046	 * Guess if a text is written in a variant. This should be implemented in subclasses.
1047	 *
1048	 * @param string $text The text to be checked
1049	 * @param string $variant Language code of the variant to be checked for
1050	 * @return bool True if $text appears to be written in $variant, false if not
1051	 *
1052	 * @author Nikola Smolenski <smolensk@eunet.rs>
1053	 * @since 1.19
1054	 */
1055	public function guessVariant( $text, $variant ) {
1056		return false;
1057	}
1058
1059	/**
1060	 * Load default conversion tables.
	 * This method must be implemented in derived classes; the base
	 * implementation always throws.
1062	 *
1063	 * @throws MWException
1064	 */
1065	protected function loadDefaultTables() {
1066		$class = static::class;
1067		throw new MWException( "Must implement loadDefaultTables() method in class $class" );
1068	}
1069
1070	/**
1071	 * Load conversion tables either from the cache or the disk.
1072	 * @private
1073	 * @param bool $fromCache Load from memcached? Defaults to true.
1074	 */
1075	protected function loadTables( $fromCache = true ) {
1076		global $wgLanguageConverterCacheType;
1077
1078		if ( $this->mTablesLoaded ) {
1079			return;
1080		}
1081
1082		$this->mTablesLoaded = true;
1083		// Do not use null as starting value, as that would confuse phan a lot.
1084		$this->mTables = [];
1085		$cache = ObjectCache::getInstance( $wgLanguageConverterCacheType );
1086		$cacheKey = $cache->makeKey( 'conversiontables', $this->getMainCode() );
1087		if ( $fromCache ) {
1088			$this->mTables = $cache->get( $cacheKey );
1089		}
1090		if ( !$this->mTables || !array_key_exists( self::CACHE_VERSION_KEY, $this->mTables ) ) {
1091			// not in cache, or we need a fresh reload.
1092			// We will first load the default tables
1093			// then update them using things in MediaWiki:Conversiontable/*
1094			$this->loadDefaultTables();
1095			foreach ( $this->getVariants() as $var ) {
1096				$cached = $this->parseCachedTable( $var );
1097				// @phan-suppress-next-next-line PhanTypeArraySuspiciousNullable
1098				// FIXME: $this->mTables could theoretically be null here
1099				$this->mTables[$var]->mergeArray( $cached );
1100			}
1101
1102			$this->postLoadTables();
1103			$this->mTables[self::CACHE_VERSION_KEY] = true;
1104
1105			$cache->set( $cacheKey, $this->mTables, 43200 );
1106		}
1107	}
1108
1109	/**
1110	 * Hook for post processing after conversion tables are loaded.
1111	 */
	protected function postLoadTables() {
		// Intentionally empty in the base class. loadTables() calls this after
		// merging the parsed tables into the defaults and before it marks and
		// caches the result.
	}
1114
1115	/**
1116	 * Reload the conversion tables.
1117	 *
1118	 * Also used by test suites which need to reset the converter state.
1119	 *
1120	 * @private
1121	 */
1122	private function reloadTables() {
1123		if ( $this->mTables ) {
1124			// @phan-suppress-next-line PhanTypeObjectUnsetDeclaredProperty
1125			unset( $this->mTables );
1126		}
1127
1128		$this->mTablesLoaded = false;
1129		$this->loadTables( false );
1130	}
1131
1132	/**
	 * Parse a conversion table page from the MediaWiki namespace
	 * (MediaWiki:Conversiontable/<code>, optionally followed by /<subpage>).
1134	 *
1135	 * The tables should be in blocks of the following form:
1136	 * 		-{
1137	 * 			word => word ;
1138	 * 			word => word ;
1139	 * 			...
1140	 * 		}-
1141	 *
1142	 * To make the tables more manageable, subpages are allowed
1143	 * and will be parsed recursively if $recursive == true.
1144	 *
1145	 * @param string $code Language code
1146	 * @param string $subpage Subpage name
1147	 * @param bool $recursive Parse subpages recursively? Defaults to true.
1148	 *
1149	 * @return array
1150	 */
1151	private function parseCachedTable( $code, $subpage = '', $recursive = true ) {
1152		static $parsed = [];
1153
1154		$key = 'Conversiontable/' . $code;
1155		if ( $subpage ) {
1156			$key .= '/' . $subpage;
1157		}
1158		if ( array_key_exists( $key, $parsed ) ) {
1159			return [];
1160		}
1161
1162		$parsed[$key] = true;
1163
1164		if ( $subpage === '' ) {
1165			$messageCache = MediaWikiServices::getInstance()->getMessageCache();
1166			$txt = $messageCache->getMsgFromNamespace( $key, $code );
1167		} else {
1168			$txt = false;
1169			$title = Title::makeTitleSafe( NS_MEDIAWIKI, $key );
1170			if ( $title && $title->exists() ) {
1171				$revision = MediaWikiServices::getInstance()
1172					->getRevisionLookup()
1173					->getRevisionByTitle( $title );
1174				if ( $revision ) {
1175					$model = $revision->getSlot(
1176						SlotRecord::MAIN,
1177						RevisionRecord::RAW
1178					)->getModel();
1179					if ( $model == CONTENT_MODEL_WIKITEXT ) {
1180						// @phan-suppress-next-line PhanUndeclaredMethod
1181						$txt = $revision->getContent(
1182							SlotRecord::MAIN,
1183							RevisionRecord::RAW
1184						)->getText();
1185					}
1186
1187					// @todo in the future, use a specialized content model, perhaps based on json!
1188				}
1189			}
1190		}
1191
1192		# Nothing to parse if there's no text
1193		if ( $txt === false || $txt === null || $txt === '' ) {
1194			return [];
1195		}
1196
1197		// get all subpage links of the form
1198		// [[MediaWiki:Conversiontable/zh-xx/...|...]]
1199		$linkhead = $this->mLangObj->getNsText( NS_MEDIAWIKI ) .
1200			':Conversiontable';
1201		$subs = StringUtils::explode( '[[', $txt );
1202		$sublinks = [];
1203		foreach ( $subs as $sub ) {
1204			$link = explode( ']]', $sub, 2 );
1205			if ( count( $link ) != 2 ) {
1206				continue;
1207			}
1208			$b = explode( '|', $link[0], 2 );
1209			$b = explode( '/', trim( $b[0] ), 3 );
1210			if ( count( $b ) == 3 ) {
1211				$sublink = $b[2];
1212			} else {
1213				$sublink = '';
1214			}
1215
1216			if ( $b[0] == $linkhead && $b[1] == $code ) {
1217				$sublinks[] = $sublink;
1218			}
1219		}
1220
1221		// parse the mappings in this page
1222		$blocks = StringUtils::explode( '-{', $txt );
1223		$ret = [];
1224		$first = true;
1225		foreach ( $blocks as $block ) {
1226			if ( $first ) {
1227				// Skip the part before the first -{
1228				$first = false;
1229				continue;
1230			}
1231			$mappings = explode( '}-', $block, 2 )[0];
1232			$stripped = str_replace( [ "'", '"', '*', '#' ], '', $mappings );
1233			$table = StringUtils::explode( ';', $stripped );
1234			foreach ( $table as $t ) {
1235				$m = explode( '=>', $t, 3 );
1236				if ( count( $m ) != 2 ) {
1237					continue;
1238				}
1239				// trim any trailling comments starting with '//'
1240				$tt = explode( '//', $m[1], 2 );
1241				$ret[trim( $m[0] )] = trim( $tt[0] );
1242			}
1243		}
1244
1245		// recursively parse the subpages
1246		if ( $recursive ) {
1247			foreach ( $sublinks as $link ) {
1248				$s = $this->parseCachedTable( $code, $link, $recursive );
1249				$ret = $s + $ret;
1250			}
1251		}
1252
1253		if ( $this->mUcfirst ) {
1254			foreach ( $ret as $k => $v ) {
1255				$ret[$this->mLangObj->ucfirst( $k )] = $this->mLangObj->ucfirst( $v );
1256			}
1257		}
1258		return $ret;
1259	}
1260
1261	/**
1262	 * Enclose a string with the "no conversion" tag. This is used by
1263	 * various functions in the Parser.
1264	 *
1265	 * @param string $text Text to be tagged for no conversion
1266	 * @param bool $noParse Unused
1267	 * @return string The tagged text
1268	 */
1269	public function markNoConversion( $text, $noParse = false ) {
1270		# don't mark if already marked
1271		if ( strpos( $text, '-{' ) || strpos( $text, '}-' ) ) {
1272			return $text;
1273		}
1274
1275		$ret = "-{R|$text}-";
1276		return $ret;
1277	}
1278
1279	/**
1280	 * Convert the sorting key for category links. This should make different
1281	 * keys that are variants of each other map to the same key.
1282	 *
1283	 * @param string $key
1284	 *
1285	 * @return string
1286	 */
1287	public function convertCategoryKey( $key ) {
1288		return $key;
1289	}
1290
1291	/**
1292	 * Refresh the cache of conversion tables when
1293	 * MediaWiki:Conversiontable* is updated.
1294	 *
1295	 * @param LinkTarget $linkTarget The LinkTarget of the page being updated
1296	 */
1297	public function updateConversionTable( LinkTarget $linkTarget ) {
1298		if ( $linkTarget->getNamespace() === NS_MEDIAWIKI ) {
1299			$t = explode( '/', $linkTarget->getDBkey(), 3 );
1300			$c = count( $t );
1301			if ( $c > 1 && $t[0] == 'Conversiontable' ) {
1302				if ( $this->validateVariant( $t[1] ) ) {
1303					$this->reloadTables();
1304				}
1305			}
1306		}
1307	}
1308
1309	/**
1310	 * Get the cached separator pattern for ConverterRule::parseRules()
1311	 * @return string
1312	 */
1313	public function getVarSeparatorPattern() {
1314		if ( $this->mVarSeparatorPattern === null ) {
1315			// varsep_pattern for preg_split:
1316			// text should be splited by ";" only if a valid variant
1317			// name exist after the markup, for example:
1318			//  -{zh-hans:<span style="font-size:120%;">xxx</span>;zh-hant:\
1319			//  <span style="font-size:120%;">yyy</span>;}-
1320			// we should split it as:
1321			//  [
1322			//    [0] => 'zh-hans:<span style="font-size:120%;">xxx</span>'
1323			//    [1] => 'zh-hant:<span style="font-size:120%;">yyy</span>'
1324			//    [2] => ''
1325			//  ]
1326			$expandedVariants = [];
1327			foreach ( $this->getVariants() as $variant ) {
1328				$expandedVariants[ $variant ] = 1;
1329				// Accept standard BCP 47 names for variants as well.
1330				$expandedVariants[ LanguageCode::bcp47( $variant ) ] = 1;
1331			}
1332			// Accept old deprecated names for variants
1333			foreach ( LanguageCode::getDeprecatedCodeMapping() as $old => $new ) {
1334				if ( isset( $expandedVariants[ $new ] ) ) {
1335					$expandedVariants[ $old ] = 1;
1336				}
1337			}
1338
1339			$pat = '/;\s*(?=';
1340			foreach ( $expandedVariants as $variant => $ignore ) {
1341				// zh-hans:xxx;zh-hant:yyy
1342				$pat .= $variant . '\s*:|';
1343				// xxx=>zh-hans:yyy; xxx=>zh-hant:zzz
1344				$pat .= '[^;]*?=>\s*' . $variant . '\s*:|';
1345			}
1346			$pat .= '\s*$)/';
1347			$this->mVarSeparatorPattern = $pat;
1348		}
1349		return $this->mVarSeparatorPattern;
1350	}
1351
1352	/**
1353	 * Check if this is a language with variants
1354	 *
1355	 * @since 1.35
1356	 *
1357	 * @return bool
1358	 */
1359	public function hasVariants() {
1360		return count( $this->getVariants() ) > 1;
1361	}
1362
1363	/**
1364	 * Strict check if the language has the specific variant.
1365	 *
1366	 * Compare to LanguageConverter::validateVariant() which does a more
1367	 * lenient check and attempts to coerce the given code to a valid one.
1368	 *
1369	 * @since 1.35
1370	 * @param string $variant
1371	 * @return bool
1372	 */
1373	public function hasVariant( $variant ) {
1374		return $variant && ( $variant === $this->validateVariant( $variant ) );
1375	}
1376
1377	/**
1378	 * Perform output conversion on a string, and encode for safe HTML output.
1379	 *
1380	 * @since 1.35
1381	 *
1382	 * @param string $text Text to be converted
1383	 * @return string
1384	 */
1385	public function convertHtml( $text ) {
1386		// @phan-suppress-next-line SecurityCheck-DoubleEscaped convert() is documented to return html
1387		return htmlspecialchars( $this->convert( $text ) );
1388	}
1389}
1390