1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3  * License, v. 2.0. If a copy of the MPL was not distributed with this
4  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5 
6 #include "nsLanguageAtomService.h"
7 #include "nsUConvPropertySearch.h"
8 #include "nsUnicharUtils.h"
9 #include "nsAtom.h"
10 #include "nsGkAtoms.h"
11 #include "mozilla/ArrayUtils.h"
12 #include "mozilla/ClearOnShutdown.h"
13 #include "mozilla/Encoding.h"
14 #include "mozilla/intl/Locale.h"
15 #include "mozilla/intl/OSPreferences.h"
16 #include "mozilla/ServoBindings.h"
17 #include "mozilla/ServoUtils.h"
18 
19 using namespace mozilla;
20 using mozilla::intl::OSPreferences;
21 
// Table searched by LookupCharSet() (through
// nsUConvPropertySearch::SearchPropertyValue) to translate an encoding name
// into the name of its language group. The entries themselves are pulled in
// from encodingsgroups.properties.h.
static constexpr nsUConvProp encodingsGroups[] = {
#include "encodingsgroups.properties.h"
};
25 
// List of mozilla internal x-* tags that map to themselves (see bug 256257).
// GetUncachedLanguageGroup() walks this table when a language tag starts with
// "x-" and returns the matching entry, if any.
static constexpr nsStaticAtom* kLangGroups[] = {
    // This list must be sorted!
    // NOTE(review): the only lookup visible in this file is a linear scan, so
    // the ordering is presumably kept for readability -- confirm before
    // relying on it.
    nsGkAtoms::x_armn,  nsGkAtoms::x_cyrillic, nsGkAtoms::x_devanagari,
    nsGkAtoms::x_geor,  nsGkAtoms::x_math,     nsGkAtoms::x_tamil,
    nsGkAtoms::Unicode, nsGkAtoms::x_western
    // These self-mappings are not necessary unless somebody uses them to
    // specify lang in (X)HTML/XML documents, which they shouldn't. (see bug
    // 256257)
    // x-beng=x-beng
    // x-cans=x-cans
    // x-ethi=x-ethi
    // x-guru=x-guru
    // x-gujr=x-gujr
    // x-khmr=x-khmr
    // x-mlym=x-mlym
};
42 
// Map ISO 15924 script codes from BCP47 lang tag to mozilla's langGroups.
//
// mTag is the script code as a C string and mAtom is the langGroup atom that
// GetUncachedLanguageGroup() returns for that script. The table is searched
// with BinarySearchIf(), so the entries have to stay in ascending mTag order.
static constexpr struct {
  const char* mTag;
  nsStaticAtom* mAtom;
} kScriptLangGroup[] = {
    // This list must be sorted by script code!
    {"Arab", nsGkAtoms::ar},
    {"Armn", nsGkAtoms::x_armn},
    {"Beng", nsGkAtoms::x_beng},
    {"Cans", nsGkAtoms::x_cans},
    {"Cyrl", nsGkAtoms::x_cyrillic},
    {"Deva", nsGkAtoms::x_devanagari},
    {"Ethi", nsGkAtoms::x_ethi},
    {"Geok", nsGkAtoms::x_geor},
    {"Geor", nsGkAtoms::x_geor},
    {"Grek", nsGkAtoms::el},
    {"Gujr", nsGkAtoms::x_gujr},
    {"Guru", nsGkAtoms::x_guru},
    {"Hang", nsGkAtoms::ko},
    // Hani is not mapped to a specific langGroup; we prefer to look at the
    // primary language subtag in this case.
    {"Hans", nsGkAtoms::Chinese},
    // Hant is special-cased in code (GetUncachedLanguageGroup() inspects the
    // region subtag), so it has no entry here:
    // Hant=zh-HK
    // Hant=zh-TW
    {"Hebr", nsGkAtoms::he},
    {"Hira", nsGkAtoms::Japanese},
    {"Jpan", nsGkAtoms::Japanese},
    {"Kana", nsGkAtoms::Japanese},
    {"Khmr", nsGkAtoms::x_khmr},
    {"Knda", nsGkAtoms::x_knda},
    {"Kore", nsGkAtoms::ko},
    {"Latn", nsGkAtoms::x_western},
    {"Mlym", nsGkAtoms::x_mlym},
    {"Orya", nsGkAtoms::x_orya},
    {"Sinh", nsGkAtoms::x_sinh},
    {"Taml", nsGkAtoms::x_tamil},
    {"Telu", nsGkAtoms::x_telu},
    {"Thai", nsGkAtoms::th},
    {"Tibt", nsGkAtoms::x_tibt}};
83 
// The single nsLanguageAtomService instance for this file. It is created on
// demand by nsLanguageAtomService::GetService() and released again by
// nsLanguageAtomService::Shutdown().
static UniquePtr<nsLanguageAtomService> gLangAtomService;
85 
86 // static
GetService()87 nsLanguageAtomService* nsLanguageAtomService::GetService() {
88   if (!gLangAtomService) {
89     gLangAtomService = MakeUnique<nsLanguageAtomService>();
90   }
91   return gLangAtomService.get();
92 }
93 
94 // static
Shutdown()95 void nsLanguageAtomService::Shutdown() { gLangAtomService = nullptr; }
96 
LookupLanguage(const nsACString & aLanguage)97 nsStaticAtom* nsLanguageAtomService::LookupLanguage(
98     const nsACString& aLanguage) {
99   nsAutoCString lowered(aLanguage);
100   ToLowerCase(lowered);
101 
102   RefPtr<nsAtom> lang = NS_Atomize(lowered);
103   return GetLanguageGroup(lang);
104 }
105 
LookupCharSet(NotNull<const Encoding * > aEncoding)106 already_AddRefed<nsAtom> nsLanguageAtomService::LookupCharSet(
107     NotNull<const Encoding*> aEncoding) {
108   nsAutoCString charset;
109   aEncoding->Name(charset);
110   nsAutoCString group;
111   if (NS_FAILED(nsUConvPropertySearch::SearchPropertyValue(
112           encodingsGroups, ArrayLength(encodingsGroups), charset, group))) {
113     return RefPtr<nsAtom>(nsGkAtoms::Unicode).forget();
114   }
115   return NS_Atomize(group);
116 }
117 
GetLocaleLanguage()118 nsAtom* nsLanguageAtomService::GetLocaleLanguage() {
119   do {
120     if (!mLocaleLanguage) {
121       AutoTArray<nsCString, 10> regionalPrefsLocales;
122       if (NS_SUCCEEDED(OSPreferences::GetInstance()->GetRegionalPrefsLocales(
123               regionalPrefsLocales))) {
124         // use lowercase for all language atoms
125         ToLowerCase(regionalPrefsLocales[0]);
126         mLocaleLanguage = NS_Atomize(regionalPrefsLocales[0]);
127       } else {
128         nsAutoCString locale;
129         OSPreferences::GetInstance()->GetSystemLocale(locale);
130 
131         ToLowerCase(locale);  // use lowercase for all language atoms
132         mLocaleLanguage = NS_Atomize(locale);
133       }
134     }
135   } while (0);
136 
137   return mLocaleLanguage;
138 }
139 
GetLanguageGroup(nsAtom * aLanguage,bool * aNeedsToCache)140 nsStaticAtom* nsLanguageAtomService::GetLanguageGroup(nsAtom* aLanguage,
141                                                       bool* aNeedsToCache) {
142   if (aNeedsToCache) {
143     if (nsStaticAtom* atom = mLangToGroup.Get(aLanguage)) {
144       return atom;
145     }
146     *aNeedsToCache = true;
147     return nullptr;
148   }
149 
150   return mLangToGroup.LookupOrInsertWith(aLanguage, [&] {
151     AssertIsMainThreadOrServoFontMetricsLocked();
152     return GetUncachedLanguageGroup(aLanguage);
153   });
154 }
155 
GetUncachedLanguageGroup(nsAtom * aLanguage) const156 nsStaticAtom* nsLanguageAtomService::GetUncachedLanguageGroup(
157     nsAtom* aLanguage) const {
158   nsAutoCString langStr;
159   aLanguage->ToUTF8String(langStr);
160   ToLowerCase(langStr);
161 
162   if (langStr[0] == 'x' && langStr[1] == '-') {
163     // Internal x-* langGroup codes map to themselves (see bug 256257)
164     for (nsStaticAtom* langGroup : kLangGroups) {
165       if (langGroup == aLanguage) {
166         return langGroup;
167       }
168       if (aLanguage->IsAsciiLowercase()) {
169         continue;
170       }
171       // Do the slow ascii-case-insensitive comparison just if needed.
172       nsDependentAtomString string(langGroup);
173       if (string.EqualsASCII(langStr.get(), langStr.Length())) {
174         return langGroup;
175       }
176     }
177   } else {
178     // If the lang code can be parsed as BCP47, look up its (likely) script.
179 
180     // https://bugzilla.mozilla.org/show_bug.cgi?id=1618034:
181     // First strip any private subtags that would cause Locale to reject the
182     // tag as non-wellformed.
183     nsACString::const_iterator start, end;
184     langStr.BeginReading(start);
185     langStr.EndReading(end);
186     if (FindInReadable("-x-"_ns, start, end)) {
187       // The substring we want ends at the beginning of the "-x-" subtag.
188       langStr.Truncate(start.get() - langStr.BeginReading());
189     }
190 
191     Locale loc;
192     auto result = LocaleParser::TryParse(langStr, loc);
193     if (!result.isOk()) {
194       // Did the author (wrongly) use '_' instead of '-' to separate subtags?
195       // If so, fix it up and re-try parsing.
196       if (langStr.Contains('_')) {
197         langStr.ReplaceChar('_', '-');
198         result = LocaleParser::TryParse(langStr, loc);
199       }
200     }
201     if (result.isOk() && loc.Canonicalize().isOk()) {
202       // Fill in script subtag if not present.
203       if (loc.Script().Missing()) {
204         if (loc.AddLikelySubtags().isErr()) {
205           // Fall back to x-unicode if no match was found
206           return nsGkAtoms::Unicode;
207         }
208       }
209       // Traditional Chinese has separate prefs for Hong Kong / Taiwan;
210       // check the region subtag.
211       if (loc.Script().EqualTo("Hant")) {
212         if (loc.Region().EqualTo("HK")) {
213           return nsGkAtoms::HongKongChinese;
214         }
215         return nsGkAtoms::Taiwanese;
216       }
217       // Search list of known script subtags that map to langGroup codes.
218       size_t foundIndex;
219       Span<const char> scriptAsSpan = loc.Script().Span();
220       nsDependentCSubstring script(scriptAsSpan.data(), scriptAsSpan.size());
221       if (BinarySearchIf(
222               kScriptLangGroup, 0, ArrayLength(kScriptLangGroup),
223               [script](const auto& entry) -> int {
224                 return script.Compare(entry.mTag);
225               },
226               &foundIndex)) {
227         return kScriptLangGroup[foundIndex].mAtom;
228       }
229       // Script subtag was not recognized (includes "Hani"); check the language
230       // subtag for CJK possibilities so that we'll prefer the appropriate font
231       // rather than falling back to the browser's hardcoded preference.
232       if (loc.Language().EqualTo("zh")) {
233         if (loc.Region().EqualTo("HK")) {
234           return nsGkAtoms::HongKongChinese;
235         }
236         if (loc.Region().EqualTo("TW")) {
237           return nsGkAtoms::Taiwanese;
238         }
239         return nsGkAtoms::Chinese;
240       }
241       if (loc.Language().EqualTo("ja")) {
242         return nsGkAtoms::Japanese;
243       }
244       if (loc.Language().EqualTo("ko")) {
245         return nsGkAtoms::ko;
246       }
247     }
248   }
249 
250   // Fall back to x-unicode if no match was found
251   return nsGkAtoms::Unicode;
252 }
253