1 /*
2  *  This file is part of Poedit (http://poedit.net)
3  *
4  *  Copyright (C) 2013-2015 Vaclav Slavik
5  *
6  *  Permission is hereby granted, free of charge, to any person obtaining a
7  *  copy of this software and associated documentation files (the "Software"),
8  *  to deal in the Software without restriction, including without limitation
9  *  the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  *  and/or sell copies of the Software, and to permit persons to whom the
11  *  Software is furnished to do so, subject to the following conditions:
12  *
13  *  The above copyright notice and this permission notice shall be included in
14  *  all copies or substantial portions of the Software.
15  *
16  *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  *  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21  *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22  *  DEALINGS IN THE SOFTWARE.
23  *
24  */
25 
26 #include "language.h"
27 
28 #include <cctype>
29 #include <algorithm>
30 #include <unordered_map>
31 #include <mutex>
32 #include <memory>
33 
34 #include <unicode/uvernum.h>
35 #include <unicode/locid.h>
36 #include <unicode/coll.h>
37 #include <unicode/utypes.h>
38 
39 #include <wx/filename.h>
40 
41 #include "str_helpers.h"
42 
43 #ifdef HAVE_CLD2
44     #ifdef HAVE_CLD2_PUBLIC_COMPACT_LANG_DET_H
45         #include <cld2/public/compact_lang_det.h>
46         #include <cld2/public/encodings.h>
47     #else
48         #include "public/compact_lang_det.h"
49         #include "public/encodings.h"
50     #endif
51 #endif
52 
// GCC's libstdc++ didn't have a functional std::regex implementation until 4.9
54 #if (defined(__GNUC__) && !defined(__clang__) && !wxCHECK_GCC_VERSION(4,9))
55     #include <boost/regex.hpp>
56     using boost::wregex;
57     using boost::regex_match;
58 #else
59     #include <regex>
60     using std::wregex;
61     using std::regex_match;
62 #endif
63 
64 namespace
65 {
66 
// see http://www.gnu.org/software/gettext/manual/html_node/Header-Entry.html
// for description of permitted formats
//
// Strict form: 2-3 lowercase letters, optionally followed by "_" and either
// two uppercase letters or three digits, optionally followed by "@" and a
// lowercase variant.
const wregex RE_LANG_CODE(L"([a-z]){2,3}(_([A-Z]{2}|[0-9]{3}))?(@[a-z]+)?");

// a more permissive variant of the same that TryNormalize() would fix:
// letters may be of any case and "-" is accepted in place of "_"
const wregex RE_LANG_CODE_PERMISSIVE(L"([a-zA-Z]){2,3}([_-]([a-zA-Z]{2}|[0-9]{3}))?(@[a-zA-Z]+)?");
73 
// Try some normalizations of a language code, in place: s/-/_/ and case
// adjustments (everything before the first '_' is lowercased, everything
// after it is uppercased, and the trailing "@variant" part is lowercased).
//
// Only ASCII letters are converted. The <cctype> classification/conversion
// functions are deliberately avoided: their argument must be representable
// as unsigned char, so calling them with arbitrary wchar_t values is
// undefined behavior, and their results depend on the current C locale.
void TryNormalize(std::wstring& s)
{
    const auto asciiLower = [](wchar_t c) -> wchar_t {
        return (c >= L'A' && c <= L'Z') ? static_cast<wchar_t>(c - L'A' + L'a') : c;
    };
    const auto asciiUpper = [](wchar_t c) -> wchar_t {
        return (c >= L'a' && c <= L'z') ? static_cast<wchar_t>(c - L'a' + L'A') : c;
    };

    auto end = s.end();

    // The variant (from the last '@' onwards) is always lowercase and is
    // excluded from the language/country pass below.
    const size_t pos = s.rfind(L'@');
    if (pos != std::wstring::npos)
    {
        end = s.begin() + pos;
        for (auto x = end; x != s.end(); ++x)
            *x = asciiLower(*x);
    }

    // Language part is lowercase; once a separator was seen, we are in the
    // country part, which is uppercase.
    bool inCountry = false;
    for (auto x = s.begin(); x != end; ++x)
    {
        if (*x == L'-')
            *x = L'_';

        if (*x == L'_')
            inCountry = true;
        else
            *x = inCountry ? asciiUpper(*x) : asciiLower(*x);
    }
}
101 
IsISOLanguage(const std::string & s)102 bool IsISOLanguage(const std::string& s)
103 {
104     const char *test = s.c_str();
105     for (const char * const* i = icu::Locale::getISOLanguages(); *i != nullptr; ++i)
106     {
107         if (strcmp(test, *i) == 0)
108             return true;
109     }
110     return false;
111 }
112 
IsISOCountry(const std::string & s)113 bool IsISOCountry(const std::string& s)
114 {
115     const char *test = s.c_str();
116     for (const char * const* i = icu::Locale::getISOCountries(); *i != nullptr; ++i)
117     {
118         if (strcmp(test, *i) == 0)
119             return true;
120     }
121     return false;
122 }
123 
// Mapping of names to their respective ISO codes.
struct DisplayNamesData
{
    // display name -> language code
    using Map = std::unordered_map<std::wstring, std::string>;

    // Lookup tables keyed by display name.
    Map names;
    Map namesEng;

    // List of display names, kept separately from the lookup tables.
    std::vector<std::wstring> sortedNames;
};
131 
132 std::once_flag of_namesList;
133 
GetDisplayNamesData()134 const DisplayNamesData& GetDisplayNamesData()
135 {
136     static DisplayNamesData data;
137 
138     std::call_once(of_namesList, [=]{
139         auto locEng = icu::Locale::getEnglish();
140         std::vector<icu::UnicodeString> names;
141 
142         int32_t count;
143         const icu::Locale *loc = icu::Locale::getAvailableLocales(count);
144         names.reserve(count);
145         for (int i = 0; i < count; i++, loc++)
146         {
147             auto language = loc->getLanguage();
148             auto script = loc->getScript();
149             auto country = loc->getCountry();
150             auto variant = loc->getVariant();
151 
152             // TODO: for now, ignore variants here and in FormatForRoundtrip(),
153             //       because translating them between gettext and ICU is nontrivial
154             if (variant != nullptr && *variant != '\0')
155                 continue;
156 
157             icu::UnicodeString s;
158             loc->getDisplayName(s);
159             names.push_back(s);
160 
161             if (strcmp(language, "zh") == 0 && *country == '\0')
162             {
163                 if (strcmp(script, "Hans") == 0)
164                     country = "CN";
165                 else if (strcmp(script, "Hant") == 0)
166                     country = "TW";
167             }
168 
169             std::string code(language);
170             if (*country != '\0')
171             {
172                 code += '_';
173                 code += country;
174             }
175             if (*script != '\0')
176             {
177                 if (strcmp(script, "Latn") == 0)
178                     code += "@latin";
179             }
180 
181             s.foldCase();
182             data.names[str::to_wstring(s)] = code;
183 
184             loc->getDisplayName(locEng, s);
185             s.foldCase();
186             data.namesEng[str::to_wstring(s)] = code;
187         }
188 
189         // sort the names alphabetically for data.sortedNames:
190         UErrorCode err = U_ZERO_ERROR;
191         std::unique_ptr<icu::Collator> coll(icu::Collator::createInstance(err));
192         if (coll)
193         {
194             coll->setStrength(icu::Collator::SECONDARY); // case insensitive
195 
196             std::sort(names.begin(), names.end(),
197                       [&coll](const icu::UnicodeString& a, const icu::UnicodeString& b){
198                           UErrorCode e = U_ZERO_ERROR;
199                           return coll->compare(a, b, e) == UCOL_LESS;
200                       });
201         }
202         else
203         {
204             std::sort(names.begin(), names.end());
205         }
206 
207         // convert into std::wstring
208         data.sortedNames.reserve(names.size());
209         for (auto s: names)
210             data.sortedNames.push_back(str::to_wstring(s));
211     });
212 
213     return data;
214 }
215 
216 } // anonymous namespace
217 
218 
IsValidCode(const std::wstring & s)219 bool Language::IsValidCode(const std::wstring& s)
220 {
221     return regex_match(s, RE_LANG_CODE);
222 }
223 
Lang() const224 std::string Language::Lang() const
225 {
226     return m_code.substr(0, m_code.find_first_of("_@"));
227 }
228 
Country() const229 std::string Language::Country() const
230 {
231     const size_t pos = m_code.find('_');
232     if (pos == std::string::npos)
233         return std::string();
234     else
235         return m_code.substr(pos+1, m_code.rfind('@'));
236 }
237 
LangAndCountry() const238 std::string Language::LangAndCountry() const
239 {
240     return m_code.substr(0, m_code.rfind('@'));
241 }
242 
Variant() const243 std::string Language::Variant() const
244 {
245     const size_t pos = m_code.rfind('@');
246     if (pos == std::string::npos)
247         return std::string();
248     else
249         return m_code.substr(0, pos);
250 }
251 
RFC3066() const252 std::string Language::RFC3066() const
253 {
254     auto c = Country();
255     auto l = Lang();
256     if (c.empty())
257         return l;
258     else
259         return l + "-" + c;
260 }
261 
262 
TryParse(const std::wstring & s)263 Language Language::TryParse(const std::wstring& s)
264 {
265     if (IsValidCode(s))
266         return Language(s);
267 
268     // Is it a standard language code?
269     if (regex_match(s, RE_LANG_CODE_PERMISSIVE))
270     {
271         std::wstring s2(s);
272         TryNormalize(s2);
273         if (IsValidCode(s2))
274             return Language(s2);
275     }
276 
277     // If not, perhaps it's a human-readable name (perhaps coming from the language control)?
278     auto names = GetDisplayNamesData();
279     icu::UnicodeString s_icu = str::to_icu(s);
280     s_icu.foldCase();
281     std::wstring folded = str::to_wstring(s_icu);
282     auto i = names.names.find(folded);
283     if (i != names.names.end())
284         return Language(i->second);
285 
286     // Maybe it was in English?
287     i = names.namesEng.find(folded);
288     if (i != names.namesEng.end())
289         return Language(i->second);
290 
291     return Language(); // invalid
292 }
293 
294 
TryParseWithValidation(const std::wstring & s)295 Language Language::TryParseWithValidation(const std::wstring& s)
296 {
297     Language lang = Language::TryParse(s);
298     if (!lang.IsValid())
299         return Language(); // invalid
300 
301     if (!IsISOLanguage(lang.Lang()))
302         return Language(); // invalid
303 
304     auto country = lang.Country();
305     if (!country.empty() && !IsISOCountry(country))
306         return Language(); // invalid
307 
308     return lang;
309 }
310 
311 
FromLegacyNames(const std::string & lang,const std::string & country)312 Language Language::FromLegacyNames(const std::string& lang, const std::string& country)
313 {
314     if (lang.empty())
315         return Language(); // invalid
316 
317     #include "language_impl_legacy.h"
318 
319     std::string code;
320     auto i = isoLanguages.find(lang);
321     if ( i != isoLanguages.end() )
322         code = i->second;
323     else
324         return Language(); // invalid
325 
326     if (!country.empty())
327     {
328         auto iC = isoCountries.find(country);
329         if ( iC != isoCountries.end() )
330             code += "_" + iC->second;
331     }
332 
333     return Language(code);
334 }
335 
336 
DefaultPluralFormsExpr() const337 std::string Language::DefaultPluralFormsExpr() const
338 {
339     if (!IsValid())
340         return std::string();
341 
342     static const std::unordered_map<std::string, std::string> forms = {
343         #include "language_impl_plurals.h"
344     };
345 
346     auto i = forms.find(m_code);
347     if ( i != forms.end() )
348         return i->second;
349 
350     i = forms.find(LangAndCountry());
351     if ( i != forms.end() )
352         return i->second;
353 
354     i = forms.find(Lang());
355     if ( i != forms.end() )
356         return i->second;
357 
358     return std::string();
359 }
360 
361 
IsRTL() const362 bool Language::IsRTL() const
363 {
364     if (!IsValid())
365         return false; // fallback
366 
367 #if U_ICU_VERSION_MAJOR_NUM >= 51
368     auto locale = IcuLocaleName();
369 
370     UErrorCode err = U_ZERO_ERROR;
371     UScriptCode codes[10]= {USCRIPT_INVALID_CODE};
372     if (uscript_getCode(locale.c_str(), codes, 10, &err) == 0 || err != U_ZERO_ERROR)
373         return false; // fallback
374     return uscript_isRightToLeft(codes[0]);
375 #else
376     return false;
377 #endif
378 }
379 
380 
ToIcu() const381 icu::Locale Language::ToIcu() const
382 {
383     if (!IsValid())
384         return icu::Locale::getEnglish();
385 
386     return icu::Locale(IcuLocaleName().c_str());
387 }
388 
389 
DisplayName() const390 wxString Language::DisplayName() const
391 {
392     icu::UnicodeString s;
393     ToIcu().getDisplayName(s);
394     return str::to_wx(s);
395 }
396 
LanguageDisplayName() const397 wxString Language::LanguageDisplayName() const
398 {
399     icu::UnicodeString s;
400     ToIcu().getDisplayLanguage(s);
401     return str::to_wx(s);
402 }
403 
DisplayNameInItself() const404 wxString Language::DisplayNameInItself() const
405 {
406     auto loc = ToIcu();
407     icu::UnicodeString s;
408     loc.getDisplayName(loc, s);
409     return str::to_wx(s);
410 }
411 
FormatForRoundtrip() const412 wxString Language::FormatForRoundtrip() const
413 {
414     // TODO: Can't show variants nicely yet, not standardized
415     if (!Variant().empty())
416         return m_code;
417 
418     wxString disp = DisplayName();
419     // ICU isn't 100% reliable, some of the display names it produces
420     // (e.g. "Chinese (China)" aren't in the list of known locale names
421     // (here because zh-Trans is preferred to zh_CN). So make sure it can
422     // be parsed back first.
423     if (TryParse(disp.ToStdWstring()).IsValid())
424         return disp;
425     else
426         return m_code;
427 }
428 
429 
AllFormattedNames()430 const std::vector<std::wstring>& Language::AllFormattedNames()
431 {
432     return GetDisplayNamesData().sortedNames;
433 }
434 
435 
TryGuessFromFilename(const wxString & filename)436 Language Language::TryGuessFromFilename(const wxString& filename)
437 {
438     wxFileName fn(filename);
439     fn.MakeAbsolute();
440 
441     // Try matching the filename first:
442     //  - entire name
443     //  - suffix (foo.cs_CZ.po, wordpressTheme-cs_CZ.po)
444     //  - directory name (cs_CZ, cs.lproj, cs/LC_MESSAGES)
445     std::wstring name = fn.GetName().ToStdWstring();
446     Language lang = Language::TryParseWithValidation(name);
447             if (lang.IsValid())
448                 return lang;
449 
450     size_t pos = name.find_first_of(L".-_");
451     while (pos != wxString::npos)
452     {
453         auto part = name.substr(pos+1);
454         lang = Language::TryParseWithValidation(part);
455         if (lang.IsValid())
456             return lang;
457          pos = name.find_first_of(L".-_",  pos+1);
458     }
459 
460     auto dirs = fn.GetDirs();
461     if (!dirs.empty())
462     {
463         auto d = dirs.rbegin();
464         if (d->IsSameAs("LC_MESSAGES", /*caseSensitive=*/false))
465         {
466             if (++d == dirs.rend())
467                 return Language(); // failed to match
468         }
469         wxString rest;
470         if (d->EndsWith(".lproj", &rest))
471             return Language::TryParseWithValidation(rest.ToStdWstring());
472         else
473             return Language::TryParseWithValidation(d->ToStdWstring());
474     }
475 
476     return Language(); // failed to match
477 }
478 
479 
TryDetectFromText(const char * buffer,size_t len,Language probableLanguage)480 Language Language::TryDetectFromText(const char *buffer, size_t len, Language probableLanguage)
481 {
482 #ifdef HAVE_CLD2
483     using namespace CLD2;
484 
485     CLDHints hints = {NULL, NULL, UNKNOWN_ENCODING, UNKNOWN_LANGUAGE};
486     if (probableLanguage.IsValid())
487     {
488         if (probableLanguage.Lang() == "en")
489             hints.language_hint = ENGLISH;
490         else
491             hints.language_hint = GetLanguageFromName(probableLanguage.RFC3066().c_str());
492     }
493 
494     // three best guesses; we don't care, but they must be passed in
495     CLD2::Language language3[3];
496     int percent3[3];
497     double normalized_score3[3];
498     // more result info:
499     int text_bytes;
500     bool is_reliable;
501 
502     auto lang = CLD2::ExtDetectLanguageSummary(
503                         buffer, (int)len,
504                         /*is_plain_text=*/true, // any embedded HTML markup should be insignificant
505                         &hints,
506                         /*flags=*/0,
507                         language3, percent3, normalized_score3,
508                         /*resultchunkvector=*/nullptr,
509                         &text_bytes,
510                         &is_reliable);
511 
512     if (lang == UNKNOWN_LANGUAGE || !is_reliable)
513         return Language();
514 
515     // CLD2 penalizes English in bilingual content in some cases as "boilerplate"
516     // because it is tailored for the web. So e.g. 66% English, 33% Italian is
517     // tagged as Italian.
518     //
519     // Poedit's bias is the opposite: English is almost always the correct answer
520     // for PO source language. Fix this up manually.
521     if (lang != language3[0] && language3[0] == CLD2::ENGLISH && language3[1] == lang)
522         lang = language3[0];
523 
524     return Language::TryParse(LanguageCode(lang));
525 #else
526     (void)buffer;
527     (void)len;
528     return probableLanguage;
529 #endif
530 }
531