<?php

/**
 * A bidirectional Language Converter, capable of round-tripping variant
 * conversion.
 *
 * Language conversion is a DOMPostProcessor pass, run over the
 * Parsoid-format HTML output, which may have embedded language converter
 * rules.  We first assign a (guessed) source variant to each DOM node,
 * which will be used when round-tripping the result back to the original
 * source variant.  Then for each applicable text node in the DOM, we
 * first "bracket" the text, splitting it into cleanly round-trippable
 * segments and lossy/unclean segments.  For the lossy segments we add
 * additional metadata to the output to record the original source variant
 * text to allow round-tripping (and variant-aware editing).
 *
 * Note that different wikis have different policies for source variant:
 * in some wikis all articles are authored in one particular variant, by
 * convention.  In others, it's a "first author gets to choose the variant"
 * situation.  In both cases, a constant/per-article "source variant" may
 * be specified via some as-of-yet-unimplemented mechanism; either part of
 * the site configuration, or per-article metadata like pageLanguage.
 * In other wikis (like zhwiki) the text is a random mix of variants; in
 * these cases the "source variant" will be null/unspecified, and we'll
 * dynamically pick the most likely source variant for each subtree.
 *
 * Each individual language has a dynamically-loaded subclass of `Language`,
 * which may also have a `LanguageConverter` subclass to load appropriate
 * `ReplacementMachine`s and do other language-specific customizations.
 */

namespace Wikimedia\Parsoid\Language;

use DOMDocument;
use DOMNode;
use Wikimedia\LangConv\ReplacementMachine;
use Wikimedia\Parsoid\Config\Env;
use Wikimedia\Parsoid\Core\ClientError;
use Wikimedia\Parsoid\Utils\DOMCompat;
use Wikimedia\Parsoid\Utils\DOMDataUtils;
use Wikimedia\Parsoid\Utils\DOMUtils;
use Wikimedia\Parsoid\Utils\Timing;

/**
 * Base class for language variant conversion.
 */
class LanguageConverter {

	/** @var Language */
	private $language;

	/** @var string */
	private $langCode;

	/** @var string[] */
	private $variants;

	/** @var ?array */
	private $variantFallbacks;

	/** @var ?ReplacementMachine */
	private $machine;

	/**
	 * @param Language $language
	 * @param string $langCode The main language code of this language
	 * @param string[] $variants The supported variants of this language
	 * @param ?array $variantfallbacks The fallback language of each variant
	 * @param ?array $flags Defining the custom strings that maps to the flags
	 *   (currently unused by this base class)
	 * @param ?array $manualLevel Limit for supported variants
	 *   (currently unused by this base class)
	 */
	public function __construct(
		Language $language, string $langCode, array $variants,
		?array $variantfallbacks = null, ?array $flags = null,
		?array $manualLevel = null
	) {
		$this->language = $language;
		$this->langCode = $langCode;
		$this->variants = $variants; // XXX subtract disabled variants
		$this->variantFallbacks = $variantfallbacks;
		// this.mVariantNames = Language.// XXX

		// Eagerly load conversion tables.
		// XXX we could defer loading in the future, or cache more
		// aggressively
		$this->loadDefaultTables();
	}

	/**
	 * Load the conversion tables for this converter.
	 *
	 * Invoked from the constructor.  The base implementation is
	 * intentionally empty.
	 */
	public function loadDefaultTables() {
	}

	/**
	 * Return the {@link ReplacementMachine} powering this conversion.
	 * @return ?ReplacementMachine Null until one is installed via setMachine().
	 */
	public function getMachine(): ?ReplacementMachine {
		return $this->machine;
	}

	/**
	 * Install the {@link ReplacementMachine} to use for this conversion.
	 * @param ReplacementMachine $machine
	 */
	public function setMachine( ReplacementMachine $machine ): void {
		$this->machine = $machine;
	}

	/**
	 * Try to return a classname from a given code.
	 * @param string $code
	 * @param bool $fallback Whether we're going through language fallback
	 * @return class-string Name of the language class (if one were to exist)
	 */
	public static function classFromCode( string $code, bool $fallback ): string {
		if ( $fallback && $code === 'en' ) {
			return '\Wikimedia\Parsoid\Language\Language';
		}
		// Upper-case the first character: "zh" => "Zh"
		$code = preg_replace_callback( '/^\w/', static function ( $matches ) {
			return strtoupper( $matches[0] );
		}, $code, 1 );
		// Hyphens are not legal in class names: "sr-ec" => "Sr_ec"
		$code = preg_replace( '/-/', '_', $code );
		$code = preg_replace( '#/|^\.+#', '', $code ); // avoid path attacks
		return "\Wikimedia\Parsoid\Language\Language{$code}";
	}

	/**
	 * Instantiate the Language object for a language code.
	 *
	 * If the code is not valid, or instantiating its language class raises
	 * an \Error (for example because no such class exists), this is logged
	 * and a plain `Language` object is returned instead.
	 *
	 * @param Env $env
	 * @param string $lang
	 * @param bool $fallback
	 * @return Language
	 */
	public static function loadLanguage( Env $env, string $lang, bool $fallback = false ): Language {
		try {
			if ( Language::isValidCode( $lang ) ) {
				$languageClass = self::classFromCode( $lang, $fallback );
				return new $languageClass();
			}
		} catch ( \Error $e ) {
			/* fall through */
		}
		// A bool interpolates as "1" or "" in PHP; spell it out so the
		// `false` case is visible in the log.
		$fallbackStr = $fallback ? 'true' : 'false';
		$env->log( 'info', "Couldn't load language: {$lang} fallback={$fallbackStr}" );
		return new Language();
	}

	/**
	 * Unimplemented: currently returns its `$nt` and `$link` arguments
	 * unchanged, and ignores `$ignoreOtherCond`.
	 *
	 * @param mixed $link
	 * @param mixed $nt
	 * @param mixed $ignoreOtherCond
	 * @return array An array with keys 'nt' and 'link'
	 */
	public function findVariantLink( $link, $nt, $ignoreOtherCond ) {
		// XXX unimplemented
		return [ 'nt' => $nt, 'link' => $link ];
	}

	/**
	 * Unimplemented in the base class: does nothing and returns nothing.
	 *
	 * @param string $fromVariant
	 * @param string $text
	 * @param string $toVariant
	 * @suppress PhanEmptyPublicMethod
	 */
	public function translate( $fromVariant, $text, $toVariant ) {
		// XXX unimplemented
	}

	/**
	 * The base implementation always returns false.
	 *
	 * @param string $text
	 * @param string $variant
	 * @return bool
	 */
	public function guessVariant( $text, $variant ) {
		return false;
	}

	/**
	 * Convert the given document into $targetVariant, if:
	 *  1) language converter is enabled on this wiki, and
	 *  2) the targetVariant is specified, and it is a known variant (not a
	 *     base language code)
	 *
	 * The `$sourceVariant`, if provided is expected to be per-wiki or
	 * per-article metadata which specifies a standard "authoring variant"
	 * for this article or wiki.  For example, all articles are authored in
	 * Cyrillic by convention.  It should be left blank if there is no
	 * consistent convention on the wiki (as for zhwiki, for instance).
	 *
	 * @param Env $env
	 * @param DOMDocument $doc The input document.
	 * @param ?string $targetVariant The desired output variant.
	 * @param ?string $sourceVariant The variant used by convention when
	 *   authoring pages, if there is one; otherwise left null.
	 */
	public static function maybeConvert(
		Env $env, DOMDocument $doc, ?string $targetVariant,
		?string $sourceVariant
	): void {
		// language converter must be enabled for the pagelanguage
		if ( !$env->langConverterEnabled() ) {
			return;
		}
		$variants = $env->getSiteConfig()->variants();

		// targetVariant must be specified, and a language-with-variants
		if ( !( $targetVariant && array_key_exists( $targetVariant, $variants ) ) ) {
			return;
		}

		// targetVariant must not be a base language code
		if ( $variants[$targetVariant]['base'] === $targetVariant ) {
			// XXX in the future we probably want to go ahead and expand
			// empty <span>s left by -{...}- constructs, etc.
			return;
		}

		// Record the fact that we've done conversion to targetVariant
		$env->getPageConfig()->setVariant( $targetVariant );

		// But don't actually do the conversion if __NOCONTENTCONVERT__
		if ( DOMCompat::querySelector( $doc, 'meta[property="mw:PageProp/nocontentconvert"]' ) ) {
			return;
		}

		// OK, convert!
		self::baseToVariant( $env, DOMCompat::getBody( $doc ), $targetVariant, $sourceVariant );
	}

	/**
	 * Convert a text in the "base variant" to a specific variant, given by `targetVariant`.  If
	 * `sourceVariant` is given, assume that the input wikitext is in `sourceVariant` to
	 * construct round-trip metadata, instead of using a heuristic to guess the best variant
	 * for each DOM subtree of wikitext.
	 *
	 * If the target variant is not implemented by the page language's
	 * converter, this is logged and the DOM is left unconverted.
	 *
	 * @param Env $env
	 * @param DOMNode $rootNode The root node of a fragment to convert.
	 * @param string $targetVariant The variant to be used for the output DOM.
	 * @param ?string $sourceVariant An optional variant assumed for the
	 *  input DOM in order to create roundtrip metadata.
	 * @throws ClientError if $sourceVariant is given but is not one of the
	 *  converter's codes.
	 */
	public static function baseToVariant(
		Env $env, DOMNode $rootNode, string $targetVariant,
		?string $sourceVariant
	): void {
		// PageConfig guarantees getPageLanguage() never returns null.
		$pageLangCode = $env->getPageConfig()->getPageLanguage();

		$metrics = $env->getSiteConfig()->metrics();
		$loadTiming = Timing::start( $metrics );
		// loadLanguage() already returns a constructed Language object, so
		// use it directly rather than instantiating its class a second time.
		$lang = self::loadLanguage( $env, $pageLangCode );
		$langconv = $lang->getConverter();
		// XXX we might want to lazily-load conversion tables here.
		$loadTiming->end( "langconv.{$targetVariant}.init" );
		$loadTiming->end( 'langconv.init' );

		// Check that the target variant is valid (and implemented!)
		$machine = $langconv !== null ? $langconv->getMachine() : null;
		$validTarget = $machine !== null
			&& array_key_exists( $targetVariant, $machine->getCodes() );
		if ( !$validTarget ) {
			// XXX create a warning header? (T197949)
			$env->log( 'info', "Unimplemented variant: {$targetVariant}" );
			return; /* no conversion */
		}
		// Check that the source variant is valid.
		$validSource = $sourceVariant === null ||
			array_key_exists( $sourceVariant, $machine->getCodes() );
		if ( !$validSource ) {
			throw new ClientError( "Invalid source variant: $sourceVariant for target $targetVariant" );
		}

		$timing = Timing::start( $metrics );
		if ( $metrics ) {
			$metrics->increment( 'langconv.count' );
			$metrics->increment( "langconv.{$targetVariant}.count" );
		}

		// XXX Eventually we'll want to consult some wiki configuration to
		// decide whether a ConstantLanguageGuesser is more appropriate.
		// At this point $sourceVariant is either null or a validated code.
		if ( $sourceVariant !== null ) {
			$guesser = new ConstantLanguageGuesser( $sourceVariant );
		} else {
			$guesser = new MachineLanguageGuesser(
				$machine, $rootNode, $targetVariant
			);
		}

		$ct = new ConversionTraverser( $targetVariant, $guesser, $machine );
		$ct->traverse( $env, $rootNode, [], true );

		// HACK: to avoid data-parsoid="{}" in the output, set the isNew flag
		// on synthetic spans
		DOMUtils::assertElt( $rootNode );
		foreach ( DOMCompat::querySelectorAll(
			$rootNode, 'span[typeof="mw:LanguageVariant"][data-mw-variant]'
		) as $span ) {
			$dmwv = DOMDataUtils::getJSONAttribute( $span, 'data-mw-variant', null );
			// `??` also covers the case where the attribute did not decode
			if ( $dmwv->rt ?? false ) {
				$dp = DOMDataUtils::getDataParsoid( $span );
				$dp->tmp->isNew = true;
			}
		}

		$timing->end( 'langconv.total' );
		$timing->end( "langconv.{$targetVariant}.total" );
		$loadTiming->end( 'langconv.totalWithInit' );
	}
}