1 /*
2 * This file is part of Poedit (http://poedit.net)
3 *
4 * Copyright (C) 2013-2015 Vaclav Slavik
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22 * DEALINGS IN THE SOFTWARE.
23 *
24 */
25
26 #include "language.h"
27
28 #include <cctype>
29 #include <algorithm>
30 #include <unordered_map>
31 #include <mutex>
32 #include <memory>
33
34 #include <unicode/uvernum.h>
35 #include <unicode/locid.h>
36 #include <unicode/coll.h>
37 #include <unicode/utypes.h>
38
39 #include <wx/filename.h>
40
41 #include "str_helpers.h"
42
43 #ifdef HAVE_CLD2
44 #ifdef HAVE_CLD2_PUBLIC_COMPACT_LANG_DET_H
45 #include <cld2/public/compact_lang_det.h>
46 #include <cld2/public/encodings.h>
47 #else
48 #include "public/compact_lang_det.h"
49 #include "public/encodings.h"
50 #endif
51 #endif
52
53 // GCC's libstdc++ didn't have functional std::regex implementation until 4.9
54 #if (defined(__GNUC__) && !defined(__clang__) && !wxCHECK_GCC_VERSION(4,9))
55 #include <boost/regex.hpp>
56 using boost::wregex;
57 using boost::regex_match;
58 #else
59 #include <regex>
60 using std::wregex;
61 using std::regex_match;
62 #endif
63
64 namespace
65 {
66
// Strict form of a gettext language code: two or three lowercase letters,
// optionally followed by "_CC" (two uppercase letters) or "_NNN" (three
// digits), optionally followed by "@variant" in lowercase letters.
// See http://www.gnu.org/software/gettext/manual/html_node/Header-Entry.html
// for description of permitted formats.
const wregex RE_LANG_CODE = wregex(L"([a-z]){2,3}(_([A-Z]{2}|[0-9]{3}))?(@[a-z]+)?");

// Relaxed form of the same pattern: any letter case is accepted and '-' may
// be used instead of '_'. These are the inputs TryNormalize() can repair.
const wregex RE_LANG_CODE_PERMISSIVE = wregex(L"([a-zA-Z]){2,3}([_-]([a-zA-Z]{2}|[0-9]{3}))?(@[a-zA-Z]+)?");
73
74 // try some normalizations: s/-/_/, case adjustments
TryNormalize(std::wstring & s)75 void TryNormalize(std::wstring& s)
76 {
77 auto begin = s.begin();
78 auto end = s.end();
79
80 size_t pos = s.rfind('@');
81 if (pos != std::wstring::npos)
82 {
83 for (auto x = begin + pos; x != end; ++x)
84 *x = std::tolower(*x);
85 end = begin + pos;
86 }
87
88 bool upper = false;
89 for (auto x = begin; x != end; ++x)
90 {
91 if (*x == '-')
92 *x = '_';
93 if (*x == '_')
94 upper = true;
95 else if (std::isupper(*x) && !upper)
96 *x = std::tolower(*x);
97 else if (std::islower(*x) && upper)
98 *x = std::toupper(*x);
99 }
100 }
101
IsISOLanguage(const std::string & s)102 bool IsISOLanguage(const std::string& s)
103 {
104 const char *test = s.c_str();
105 for (const char * const* i = icu::Locale::getISOLanguages(); *i != nullptr; ++i)
106 {
107 if (strcmp(test, *i) == 0)
108 return true;
109 }
110 return false;
111 }
112
IsISOCountry(const std::string & s)113 bool IsISOCountry(const std::string& s)
114 {
115 const char *test = s.c_str();
116 for (const char * const* i = icu::Locale::getISOCountries(); *i != nullptr; ++i)
117 {
118 if (strcmp(test, *i) == 0)
119 return true;
120 }
121 return false;
122 }
123
// Lookup tables translating human-readable locale names to language codes.
struct DisplayNamesData
{
    using Map = std::unordered_map<std::wstring, std::string>;

    // case-folded display name -> code
    Map names;
    // case-folded English display name -> code
    Map namesEng;
    // display names in their original case, sorted for presentation
    std::vector<std::wstring> sortedNames;
};
131
132 std::once_flag of_namesList;
133
GetDisplayNamesData()134 const DisplayNamesData& GetDisplayNamesData()
135 {
136 static DisplayNamesData data;
137
138 std::call_once(of_namesList, [=]{
139 auto locEng = icu::Locale::getEnglish();
140 std::vector<icu::UnicodeString> names;
141
142 int32_t count;
143 const icu::Locale *loc = icu::Locale::getAvailableLocales(count);
144 names.reserve(count);
145 for (int i = 0; i < count; i++, loc++)
146 {
147 auto language = loc->getLanguage();
148 auto script = loc->getScript();
149 auto country = loc->getCountry();
150 auto variant = loc->getVariant();
151
152 // TODO: for now, ignore variants here and in FormatForRoundtrip(),
153 // because translating them between gettext and ICU is nontrivial
154 if (variant != nullptr && *variant != '\0')
155 continue;
156
157 icu::UnicodeString s;
158 loc->getDisplayName(s);
159 names.push_back(s);
160
161 if (strcmp(language, "zh") == 0 && *country == '\0')
162 {
163 if (strcmp(script, "Hans") == 0)
164 country = "CN";
165 else if (strcmp(script, "Hant") == 0)
166 country = "TW";
167 }
168
169 std::string code(language);
170 if (*country != '\0')
171 {
172 code += '_';
173 code += country;
174 }
175 if (*script != '\0')
176 {
177 if (strcmp(script, "Latn") == 0)
178 code += "@latin";
179 }
180
181 s.foldCase();
182 data.names[str::to_wstring(s)] = code;
183
184 loc->getDisplayName(locEng, s);
185 s.foldCase();
186 data.namesEng[str::to_wstring(s)] = code;
187 }
188
189 // sort the names alphabetically for data.sortedNames:
190 UErrorCode err = U_ZERO_ERROR;
191 std::unique_ptr<icu::Collator> coll(icu::Collator::createInstance(err));
192 if (coll)
193 {
194 coll->setStrength(icu::Collator::SECONDARY); // case insensitive
195
196 std::sort(names.begin(), names.end(),
197 [&coll](const icu::UnicodeString& a, const icu::UnicodeString& b){
198 UErrorCode e = U_ZERO_ERROR;
199 return coll->compare(a, b, e) == UCOL_LESS;
200 });
201 }
202 else
203 {
204 std::sort(names.begin(), names.end());
205 }
206
207 // convert into std::wstring
208 data.sortedNames.reserve(names.size());
209 for (auto s: names)
210 data.sortedNames.push_back(str::to_wstring(s));
211 });
212
213 return data;
214 }
215
216 } // anonymous namespace
217
218
IsValidCode(const std::wstring & s)219 bool Language::IsValidCode(const std::wstring& s)
220 {
221 return regex_match(s, RE_LANG_CODE);
222 }
223
Lang() const224 std::string Language::Lang() const
225 {
226 return m_code.substr(0, m_code.find_first_of("_@"));
227 }
228
Country() const229 std::string Language::Country() const
230 {
231 const size_t pos = m_code.find('_');
232 if (pos == std::string::npos)
233 return std::string();
234 else
235 return m_code.substr(pos+1, m_code.rfind('@'));
236 }
237
LangAndCountry() const238 std::string Language::LangAndCountry() const
239 {
240 return m_code.substr(0, m_code.rfind('@'));
241 }
242
Variant() const243 std::string Language::Variant() const
244 {
245 const size_t pos = m_code.rfind('@');
246 if (pos == std::string::npos)
247 return std::string();
248 else
249 return m_code.substr(0, pos);
250 }
251
RFC3066() const252 std::string Language::RFC3066() const
253 {
254 auto c = Country();
255 auto l = Lang();
256 if (c.empty())
257 return l;
258 else
259 return l + "-" + c;
260 }
261
262
TryParse(const std::wstring & s)263 Language Language::TryParse(const std::wstring& s)
264 {
265 if (IsValidCode(s))
266 return Language(s);
267
268 // Is it a standard language code?
269 if (regex_match(s, RE_LANG_CODE_PERMISSIVE))
270 {
271 std::wstring s2(s);
272 TryNormalize(s2);
273 if (IsValidCode(s2))
274 return Language(s2);
275 }
276
277 // If not, perhaps it's a human-readable name (perhaps coming from the language control)?
278 auto names = GetDisplayNamesData();
279 icu::UnicodeString s_icu = str::to_icu(s);
280 s_icu.foldCase();
281 std::wstring folded = str::to_wstring(s_icu);
282 auto i = names.names.find(folded);
283 if (i != names.names.end())
284 return Language(i->second);
285
286 // Maybe it was in English?
287 i = names.namesEng.find(folded);
288 if (i != names.namesEng.end())
289 return Language(i->second);
290
291 return Language(); // invalid
292 }
293
294
TryParseWithValidation(const std::wstring & s)295 Language Language::TryParseWithValidation(const std::wstring& s)
296 {
297 Language lang = Language::TryParse(s);
298 if (!lang.IsValid())
299 return Language(); // invalid
300
301 if (!IsISOLanguage(lang.Lang()))
302 return Language(); // invalid
303
304 auto country = lang.Country();
305 if (!country.empty() && !IsISOCountry(country))
306 return Language(); // invalid
307
308 return lang;
309 }
310
311
FromLegacyNames(const std::string & lang,const std::string & country)312 Language Language::FromLegacyNames(const std::string& lang, const std::string& country)
313 {
314 if (lang.empty())
315 return Language(); // invalid
316
317 #include "language_impl_legacy.h"
318
319 std::string code;
320 auto i = isoLanguages.find(lang);
321 if ( i != isoLanguages.end() )
322 code = i->second;
323 else
324 return Language(); // invalid
325
326 if (!country.empty())
327 {
328 auto iC = isoCountries.find(country);
329 if ( iC != isoCountries.end() )
330 code += "_" + iC->second;
331 }
332
333 return Language(code);
334 }
335
336
DefaultPluralFormsExpr() const337 std::string Language::DefaultPluralFormsExpr() const
338 {
339 if (!IsValid())
340 return std::string();
341
342 static const std::unordered_map<std::string, std::string> forms = {
343 #include "language_impl_plurals.h"
344 };
345
346 auto i = forms.find(m_code);
347 if ( i != forms.end() )
348 return i->second;
349
350 i = forms.find(LangAndCountry());
351 if ( i != forms.end() )
352 return i->second;
353
354 i = forms.find(Lang());
355 if ( i != forms.end() )
356 return i->second;
357
358 return std::string();
359 }
360
361
IsRTL() const362 bool Language::IsRTL() const
363 {
364 if (!IsValid())
365 return false; // fallback
366
367 #if U_ICU_VERSION_MAJOR_NUM >= 51
368 auto locale = IcuLocaleName();
369
370 UErrorCode err = U_ZERO_ERROR;
371 UScriptCode codes[10]= {USCRIPT_INVALID_CODE};
372 if (uscript_getCode(locale.c_str(), codes, 10, &err) == 0 || err != U_ZERO_ERROR)
373 return false; // fallback
374 return uscript_isRightToLeft(codes[0]);
375 #else
376 return false;
377 #endif
378 }
379
380
ToIcu() const381 icu::Locale Language::ToIcu() const
382 {
383 if (!IsValid())
384 return icu::Locale::getEnglish();
385
386 return icu::Locale(IcuLocaleName().c_str());
387 }
388
389
DisplayName() const390 wxString Language::DisplayName() const
391 {
392 icu::UnicodeString s;
393 ToIcu().getDisplayName(s);
394 return str::to_wx(s);
395 }
396
LanguageDisplayName() const397 wxString Language::LanguageDisplayName() const
398 {
399 icu::UnicodeString s;
400 ToIcu().getDisplayLanguage(s);
401 return str::to_wx(s);
402 }
403
DisplayNameInItself() const404 wxString Language::DisplayNameInItself() const
405 {
406 auto loc = ToIcu();
407 icu::UnicodeString s;
408 loc.getDisplayName(loc, s);
409 return str::to_wx(s);
410 }
411
FormatForRoundtrip() const412 wxString Language::FormatForRoundtrip() const
413 {
414 // TODO: Can't show variants nicely yet, not standardized
415 if (!Variant().empty())
416 return m_code;
417
418 wxString disp = DisplayName();
419 // ICU isn't 100% reliable, some of the display names it produces
420 // (e.g. "Chinese (China)" aren't in the list of known locale names
421 // (here because zh-Trans is preferred to zh_CN). So make sure it can
422 // be parsed back first.
423 if (TryParse(disp.ToStdWstring()).IsValid())
424 return disp;
425 else
426 return m_code;
427 }
428
429
AllFormattedNames()430 const std::vector<std::wstring>& Language::AllFormattedNames()
431 {
432 return GetDisplayNamesData().sortedNames;
433 }
434
435
TryGuessFromFilename(const wxString & filename)436 Language Language::TryGuessFromFilename(const wxString& filename)
437 {
438 wxFileName fn(filename);
439 fn.MakeAbsolute();
440
441 // Try matching the filename first:
442 // - entire name
443 // - suffix (foo.cs_CZ.po, wordpressTheme-cs_CZ.po)
444 // - directory name (cs_CZ, cs.lproj, cs/LC_MESSAGES)
445 std::wstring name = fn.GetName().ToStdWstring();
446 Language lang = Language::TryParseWithValidation(name);
447 if (lang.IsValid())
448 return lang;
449
450 size_t pos = name.find_first_of(L".-_");
451 while (pos != wxString::npos)
452 {
453 auto part = name.substr(pos+1);
454 lang = Language::TryParseWithValidation(part);
455 if (lang.IsValid())
456 return lang;
457 pos = name.find_first_of(L".-_", pos+1);
458 }
459
460 auto dirs = fn.GetDirs();
461 if (!dirs.empty())
462 {
463 auto d = dirs.rbegin();
464 if (d->IsSameAs("LC_MESSAGES", /*caseSensitive=*/false))
465 {
466 if (++d == dirs.rend())
467 return Language(); // failed to match
468 }
469 wxString rest;
470 if (d->EndsWith(".lproj", &rest))
471 return Language::TryParseWithValidation(rest.ToStdWstring());
472 else
473 return Language::TryParseWithValidation(d->ToStdWstring());
474 }
475
476 return Language(); // failed to match
477 }
478
479
TryDetectFromText(const char * buffer,size_t len,Language probableLanguage)480 Language Language::TryDetectFromText(const char *buffer, size_t len, Language probableLanguage)
481 {
482 #ifdef HAVE_CLD2
483 using namespace CLD2;
484
485 CLDHints hints = {NULL, NULL, UNKNOWN_ENCODING, UNKNOWN_LANGUAGE};
486 if (probableLanguage.IsValid())
487 {
488 if (probableLanguage.Lang() == "en")
489 hints.language_hint = ENGLISH;
490 else
491 hints.language_hint = GetLanguageFromName(probableLanguage.RFC3066().c_str());
492 }
493
494 // three best guesses; we don't care, but they must be passed in
495 CLD2::Language language3[3];
496 int percent3[3];
497 double normalized_score3[3];
498 // more result info:
499 int text_bytes;
500 bool is_reliable;
501
502 auto lang = CLD2::ExtDetectLanguageSummary(
503 buffer, (int)len,
504 /*is_plain_text=*/true, // any embedded HTML markup should be insignificant
505 &hints,
506 /*flags=*/0,
507 language3, percent3, normalized_score3,
508 /*resultchunkvector=*/nullptr,
509 &text_bytes,
510 &is_reliable);
511
512 if (lang == UNKNOWN_LANGUAGE || !is_reliable)
513 return Language();
514
515 // CLD2 penalizes English in bilingual content in some cases as "boilerplate"
516 // because it is tailored for the web. So e.g. 66% English, 33% Italian is
517 // tagged as Italian.
518 //
519 // Poedit's bias is the opposite: English is almost always the correct answer
520 // for PO source language. Fix this up manually.
521 if (lang != language3[0] && language3[0] == CLD2::ENGLISH && language3[1] == lang)
522 lang = language3[0];
523
524 return Language::TryParse(LanguageCode(lang));
525 #else
526 (void)buffer;
527 (void)len;
528 return probableLanguage;
529 #endif
530 }
531