1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5
6 #include "nsLanguageAtomService.h"
7 #include "nsUConvPropertySearch.h"
8 #include "nsUnicharUtils.h"
9 #include "nsAtom.h"
10 #include "nsGkAtoms.h"
11 #include "mozilla/ArrayUtils.h"
12 #include "mozilla/ClearOnShutdown.h"
13 #include "mozilla/Encoding.h"
14 #include "mozilla/intl/Locale.h"
15 #include "mozilla/intl/OSPreferences.h"
16 #include "mozilla/ServoBindings.h"
17 #include "mozilla/ServoUtils.h"
18
19 using namespace mozilla;
20 using mozilla::intl::OSPreferences;
21
// Property table whose entries are pulled in from
// encodingsgroups.properties.h. LookupCharSet() searches it with an encoding
// name as the key and uses the matching value as a language-group name.
static constexpr nsUConvProp encodingsGroups[] = {
#include "encodingsgroups.properties.h"
};
25
// List of mozilla internal x-* tags that map to themselves (see bug 256257).
// GetUncachedLanguageGroup() walks this list for any tag that begins with
// "x-" and returns the matching entry unchanged.
static constexpr nsStaticAtom* kLangGroups[] = {
    // This list must be sorted!
    nsGkAtoms::x_armn, nsGkAtoms::x_cyrillic, nsGkAtoms::x_devanagari,
    nsGkAtoms::x_geor, nsGkAtoms::x_math, nsGkAtoms::x_tamil,
    nsGkAtoms::Unicode, nsGkAtoms::x_western
    // These self-mappings are not necessary unless somebody uses them to
    // specify lang in (X)HTML/XML documents, which they shouldn't.
    // (see bug 256257)
    // x-beng=x-beng
    // x-cans=x-cans
    // x-ethi=x-ethi
    // x-guru=x-guru
    // x-gujr=x-gujr
    // x-khmr=x-khmr
    // x-mlym=x-mlym
};
42
// Map ISO 15924 script codes from BCP47 lang tag to mozilla's langGroups.
// GetUncachedLanguageGroup() runs a binary search over this table, comparing
// the locale's script subtag against mTag, so the ordering below is
// load-bearing.
static constexpr struct {
  // Script code used as the search key.
  const char* mTag;
  // langGroup atom returned when the script matches mTag.
  nsStaticAtom* mAtom;
} kScriptLangGroup[] = {
    // This list must be sorted by script code!
    {"Arab", nsGkAtoms::ar},
    {"Armn", nsGkAtoms::x_armn},
    {"Beng", nsGkAtoms::x_beng},
    {"Cans", nsGkAtoms::x_cans},
    {"Cyrl", nsGkAtoms::x_cyrillic},
    {"Deva", nsGkAtoms::x_devanagari},
    {"Ethi", nsGkAtoms::x_ethi},
    {"Geok", nsGkAtoms::x_geor},
    {"Geor", nsGkAtoms::x_geor},
    {"Grek", nsGkAtoms::el},
    {"Gujr", nsGkAtoms::x_gujr},
    {"Guru", nsGkAtoms::x_guru},
    {"Hang", nsGkAtoms::ko},
    // Hani is not mapped to a specific langGroup, we prefer to look at the
    // primary language subtag in this case
    {"Hans", nsGkAtoms::Chinese},
    // Hant is special-cased in code
    // Hant=zh-HK
    // Hant=zh-TW
    {"Hebr", nsGkAtoms::he},
    {"Hira", nsGkAtoms::Japanese},
    {"Jpan", nsGkAtoms::Japanese},
    {"Kana", nsGkAtoms::Japanese},
    {"Khmr", nsGkAtoms::x_khmr},
    {"Knda", nsGkAtoms::x_knda},
    {"Kore", nsGkAtoms::ko},
    {"Latn", nsGkAtoms::x_western},
    {"Mlym", nsGkAtoms::x_mlym},
    {"Orya", nsGkAtoms::x_orya},
    {"Sinh", nsGkAtoms::x_sinh},
    {"Taml", nsGkAtoms::x_tamil},
    {"Telu", nsGkAtoms::x_telu},
    {"Thai", nsGkAtoms::th},
    {"Tibt", nsGkAtoms::x_tibt}};
83
// The single service instance: created on first call to GetService() and
// released by Shutdown().
static UniquePtr<nsLanguageAtomService> gLangAtomService;
85
86 // static
GetService()87 nsLanguageAtomService* nsLanguageAtomService::GetService() {
88 if (!gLangAtomService) {
89 gLangAtomService = MakeUnique<nsLanguageAtomService>();
90 }
91 return gLangAtomService.get();
92 }
93
94 // static
Shutdown()95 void nsLanguageAtomService::Shutdown() { gLangAtomService = nullptr; }
96
LookupLanguage(const nsACString & aLanguage)97 nsStaticAtom* nsLanguageAtomService::LookupLanguage(
98 const nsACString& aLanguage) {
99 nsAutoCString lowered(aLanguage);
100 ToLowerCase(lowered);
101
102 RefPtr<nsAtom> lang = NS_Atomize(lowered);
103 return GetLanguageGroup(lang);
104 }
105
LookupCharSet(NotNull<const Encoding * > aEncoding)106 already_AddRefed<nsAtom> nsLanguageAtomService::LookupCharSet(
107 NotNull<const Encoding*> aEncoding) {
108 nsAutoCString charset;
109 aEncoding->Name(charset);
110 nsAutoCString group;
111 if (NS_FAILED(nsUConvPropertySearch::SearchPropertyValue(
112 encodingsGroups, ArrayLength(encodingsGroups), charset, group))) {
113 return RefPtr<nsAtom>(nsGkAtoms::Unicode).forget();
114 }
115 return NS_Atomize(group);
116 }
117
GetLocaleLanguage()118 nsAtom* nsLanguageAtomService::GetLocaleLanguage() {
119 do {
120 if (!mLocaleLanguage) {
121 AutoTArray<nsCString, 10> regionalPrefsLocales;
122 if (NS_SUCCEEDED(OSPreferences::GetInstance()->GetRegionalPrefsLocales(
123 regionalPrefsLocales))) {
124 // use lowercase for all language atoms
125 ToLowerCase(regionalPrefsLocales[0]);
126 mLocaleLanguage = NS_Atomize(regionalPrefsLocales[0]);
127 } else {
128 nsAutoCString locale;
129 OSPreferences::GetInstance()->GetSystemLocale(locale);
130
131 ToLowerCase(locale); // use lowercase for all language atoms
132 mLocaleLanguage = NS_Atomize(locale);
133 }
134 }
135 } while (0);
136
137 return mLocaleLanguage;
138 }
139
GetLanguageGroup(nsAtom * aLanguage,bool * aNeedsToCache)140 nsStaticAtom* nsLanguageAtomService::GetLanguageGroup(nsAtom* aLanguage,
141 bool* aNeedsToCache) {
142 if (aNeedsToCache) {
143 if (nsStaticAtom* atom = mLangToGroup.Get(aLanguage)) {
144 return atom;
145 }
146 *aNeedsToCache = true;
147 return nullptr;
148 }
149
150 return mLangToGroup.LookupOrInsertWith(aLanguage, [&] {
151 AssertIsMainThreadOrServoFontMetricsLocked();
152 return GetUncachedLanguageGroup(aLanguage);
153 });
154 }
155
GetUncachedLanguageGroup(nsAtom * aLanguage) const156 nsStaticAtom* nsLanguageAtomService::GetUncachedLanguageGroup(
157 nsAtom* aLanguage) const {
158 nsAutoCString langStr;
159 aLanguage->ToUTF8String(langStr);
160 ToLowerCase(langStr);
161
162 if (langStr[0] == 'x' && langStr[1] == '-') {
163 // Internal x-* langGroup codes map to themselves (see bug 256257)
164 for (nsStaticAtom* langGroup : kLangGroups) {
165 if (langGroup == aLanguage) {
166 return langGroup;
167 }
168 if (aLanguage->IsAsciiLowercase()) {
169 continue;
170 }
171 // Do the slow ascii-case-insensitive comparison just if needed.
172 nsDependentAtomString string(langGroup);
173 if (string.EqualsASCII(langStr.get(), langStr.Length())) {
174 return langGroup;
175 }
176 }
177 } else {
178 // If the lang code can be parsed as BCP47, look up its (likely) script.
179
180 // https://bugzilla.mozilla.org/show_bug.cgi?id=1618034:
181 // First strip any private subtags that would cause Locale to reject the
182 // tag as non-wellformed.
183 nsACString::const_iterator start, end;
184 langStr.BeginReading(start);
185 langStr.EndReading(end);
186 if (FindInReadable("-x-"_ns, start, end)) {
187 // The substring we want ends at the beginning of the "-x-" subtag.
188 langStr.Truncate(start.get() - langStr.BeginReading());
189 }
190
191 Locale loc;
192 auto result = LocaleParser::TryParse(langStr, loc);
193 if (!result.isOk()) {
194 // Did the author (wrongly) use '_' instead of '-' to separate subtags?
195 // If so, fix it up and re-try parsing.
196 if (langStr.Contains('_')) {
197 langStr.ReplaceChar('_', '-');
198 result = LocaleParser::TryParse(langStr, loc);
199 }
200 }
201 if (result.isOk() && loc.Canonicalize().isOk()) {
202 // Fill in script subtag if not present.
203 if (loc.Script().Missing()) {
204 if (loc.AddLikelySubtags().isErr()) {
205 // Fall back to x-unicode if no match was found
206 return nsGkAtoms::Unicode;
207 }
208 }
209 // Traditional Chinese has separate prefs for Hong Kong / Taiwan;
210 // check the region subtag.
211 if (loc.Script().EqualTo("Hant")) {
212 if (loc.Region().EqualTo("HK")) {
213 return nsGkAtoms::HongKongChinese;
214 }
215 return nsGkAtoms::Taiwanese;
216 }
217 // Search list of known script subtags that map to langGroup codes.
218 size_t foundIndex;
219 Span<const char> scriptAsSpan = loc.Script().Span();
220 nsDependentCSubstring script(scriptAsSpan.data(), scriptAsSpan.size());
221 if (BinarySearchIf(
222 kScriptLangGroup, 0, ArrayLength(kScriptLangGroup),
223 [script](const auto& entry) -> int {
224 return script.Compare(entry.mTag);
225 },
226 &foundIndex)) {
227 return kScriptLangGroup[foundIndex].mAtom;
228 }
229 // Script subtag was not recognized (includes "Hani"); check the language
230 // subtag for CJK possibilities so that we'll prefer the appropriate font
231 // rather than falling back to the browser's hardcoded preference.
232 if (loc.Language().EqualTo("zh")) {
233 if (loc.Region().EqualTo("HK")) {
234 return nsGkAtoms::HongKongChinese;
235 }
236 if (loc.Region().EqualTo("TW")) {
237 return nsGkAtoms::Taiwanese;
238 }
239 return nsGkAtoms::Chinese;
240 }
241 if (loc.Language().EqualTo("ja")) {
242 return nsGkAtoms::Japanese;
243 }
244 if (loc.Language().EqualTo("ko")) {
245 return nsGkAtoms::ko;
246 }
247 }
248 }
249
250 // Fall back to x-unicode if no match was found
251 return nsGkAtoms::Unicode;
252 }
253