1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "components/spellcheck/common/spellcheck_common.h"
6
7 #include "base/check.h"
8 #include "base/command_line.h"
9 #include "base/files/file_path.h"
10 #include "base/metrics/field_trial.h"
11 #include "base/stl_util.h"
12 #include "base/strings/string_util.h"
13 #include "third_party/icu/source/common/unicode/uloc.h"
14 #include "third_party/icu/source/common/unicode/urename.h"
15 #include "third_party/icu/source/common/unicode/utypes.h"
16
17 namespace spellcheck {
18
// One row of the supported-language table: maps a language code to the
// language-region string that identifies its dictionary.
struct LanguageRegion {
  const char* language;  // The language code used as the lookup key.
  const char* language_region;  // Language & region, used by dictionaries.
};
23
// Associates a dictionary language with a version suffix string. Used for
// dictionaries whose version differs from the default one.
struct LanguageVersion {
  const char* language;  // The language input (compared against the
                         // language-region form of the dictionary name).
  const char* version;  // The corresponding version suffix, e.g. "-4-0".
};
28
// Table of languages supported by the spellchecker. Each row pairs a language
// code with the language-region string of the dictionary that serves it.
// Notes on the contents:
//  - Every Spanish variant (es, es-419, es-AR, es-ES, es-MX, es-US) maps to
//    the single es-ES dictionary.
//  - Some entries (hy, ko, sh, sq, sr) have no region part.
//  - Lookups scan this table linearly from the top and may stop at the first
//    hit, so row order matters when several rows share a |language_region|.
static constexpr LanguageRegion kSupportedSpellCheckerLanguages[] = {
    // Several languages are not to be included in the spellchecker list:
    // th-TH, vi-VI.
    // NOTE(review): the table below does contain {"vi", "vi-VN"}, so the
    // "vi-VI" part of the remark above looks stale - confirm.
    // clang-format off
    {"af", "af-ZA"},
    {"bg", "bg-BG"},
    {"ca", "ca-ES"},
    {"cs", "cs-CZ"},
    {"cy", "cy-GB"},
    {"da", "da-DK"},
    {"de", "de-DE"},
    {"el", "el-GR"},
    {"en-AU", "en-AU"},
    {"en-CA", "en-CA"},
    {"en-GB", "en-GB"},
    {"en-GB-oxendict", "en-GB-oxendict"},
    {"en-US", "en-US"},
    {"es", "es-ES"},
    {"es-419", "es-ES"},
    {"es-AR", "es-ES"},
    {"es-ES", "es-ES"},
    {"es-MX", "es-ES"},
    {"es-US", "es-ES"},
    {"et", "et-EE"},
    {"fa", "fa-IR"},
    {"fo", "fo-FO"},
    {"fr", "fr-FR"},
    {"he", "he-IL"},
    {"hi", "hi-IN"},
    {"hr", "hr-HR"},
    {"hu", "hu-HU"},
    {"hy", "hy"},
    {"id", "id-ID"},
    {"it", "it-IT"},
    {"ko", "ko"},
    {"lt", "lt-LT"},
    {"lv", "lv-LV"},
    {"nb", "nb-NO"},
    {"nl", "nl-NL"},
    {"pl", "pl-PL"},
    {"pt-BR", "pt-BR"},
    {"pt-PT", "pt-PT"},
    {"ro", "ro-RO"},
    {"ru", "ru-RU"},
    {"sh", "sh"},
    {"sk", "sk-SK"},
    {"sl", "sl-SI"},
    {"sq", "sq"},
    {"sr", "sr"},
    {"sv", "sv-SE"},
    {"ta", "ta-IN"},
    {"tg", "tg-TG"},
    {"tr", "tr-TR"},
    {"uk", "uk-UA"},
    {"vi", "vi-VN"},
    // clang-format on
};
86
IsValidRegion(const std::string & region)87 bool IsValidRegion(const std::string& region) {
88 for (const auto& lang_region : kSupportedSpellCheckerLanguages) {
89 if (lang_region.language_region == region)
90 return true;
91 }
92 return false;
93 }
94
95 // This function returns the language-region version of language name.
96 // e.g. returns hi-IN for hi.
GetSpellCheckLanguageRegion(base::StringPiece input_language)97 std::string GetSpellCheckLanguageRegion(base::StringPiece input_language) {
98 for (const auto& lang_region : kSupportedSpellCheckerLanguages) {
99 if (lang_region.language == input_language)
100 return lang_region.language_region;
101 }
102
103 return input_language.as_string();
104 }
105
GetVersionedFileName(base::StringPiece input_language,const base::FilePath & dict_dir)106 base::FilePath GetVersionedFileName(base::StringPiece input_language,
107 const base::FilePath& dict_dir) {
108 // The default dictionary version is 3-0. This version indicates that the bdic
109 // file contains a checksum.
110 static const char kDefaultVersionString[] = "-3-0";
111
112 // Add non-default version strings here. Use the same version for all the
113 // dictionaries that you add at the same time. Increment the major version
114 // number if you're updating either dic or aff files. Increment the minor
115 // version number if you're updating only dic_delta files.
116 static constexpr LanguageVersion kSpecialVersionString[] = {
117 // Jan 9, 2013: Add "FLAG num" to aff to avoid heapcheck crash.
118 {"tr-TR", "-4-0"},
119
120 // Mar 4, 2014: Add Tajik dictionary.
121 {"tg-TG", "-5-0"},
122
123 // Feb 2019: Initial check-in of Welsh.
124 {"cy-GB", "-1-0"},
125
126 // April 2019: Initial check-in of Armenian.
127 {"hy", "-1-0"},
128
129 // November 2019: Update Serbian-Latin and Serbian-Cyrillic
130 {"sh", "-4-0"},
131 {"sr", "-4-0"},
132
133 // January 2020: Update en-* and fa-IR dictionaries from upstream.
134 {"en-AU", "-9-0"},
135 {"en-CA", "-9-0"},
136 {"en-GB", "-9-0"},
137 {"en-US", "-9-0"},
138 {"fa-IR", "-9-0"},
139
140 // March 2020: Update uk-UA dictionary from upstream.
141 {"uk-UA", "-4-0"},
142
143 // June 2020: Add the en-GB-oxendict dictionary.
144 {"en-GB-oxendict", "-9-0"},
145 };
146
147 // Generate the bdict file name using default version string or special
148 // version string, depending on the language.
149 std::string language = GetSpellCheckLanguageRegion(input_language);
150 std::string version = kDefaultVersionString;
151 for (const auto& lang_ver : kSpecialVersionString) {
152 if (language == lang_ver.language) {
153 version = lang_ver.version;
154 break;
155 }
156 }
157 std::string versioned_bdict_file_name(language + version + ".bdic");
158 return dict_dir.AppendASCII(versioned_bdict_file_name);
159 }
160
GetCorrespondingSpellCheckLanguage(base::StringPiece language)161 std::string GetCorrespondingSpellCheckLanguage(base::StringPiece language) {
162 std::string best_match;
163 // Look for exact match in the Spell Check language list.
164 for (const auto& lang_region : kSupportedSpellCheckerLanguages) {
165 // First look for exact match in the language region of the list.
166 if (lang_region.language == language)
167 return language.as_string();
168
169 // Next, look for exact match in the language_region part of the list.
170 if (lang_region.language_region == language) {
171 if (best_match.empty())
172 best_match = lang_region.language;
173 }
174 }
175
176 // No match found - return best match, if any.
177 return best_match;
178 }
179
SpellCheckLanguages()180 std::vector<std::string> SpellCheckLanguages() {
181 std::vector<std::string> languages;
182 for (const auto& lang_region : kSupportedSpellCheckerLanguages)
183 languages.push_back(lang_region.language);
184 return languages;
185 }
186
GetISOLanguageCountryCodeFromLocale(const std::string & locale,std::string * language_code,std::string * country_code)187 void GetISOLanguageCountryCodeFromLocale(const std::string& locale,
188 std::string* language_code,
189 std::string* country_code) {
190 DCHECK(language_code);
191 DCHECK(country_code);
192 char language[ULOC_LANG_CAPACITY] = ULOC_ENGLISH;
193 const char* country = "USA";
194 if (!locale.empty()) {
195 UErrorCode error = U_ZERO_ERROR;
196 char id[ULOC_LANG_CAPACITY + ULOC_SCRIPT_CAPACITY + ULOC_COUNTRY_CAPACITY];
197 uloc_addLikelySubtags(locale.c_str(), id, base::size(id), &error);
198 error = U_ZERO_ERROR;
199 uloc_getLanguage(id, language, base::size(language), &error);
200 country = uloc_getISO3Country(id);
201 }
202 *language_code = std::string(language);
203 *country_code = std::string(country);
204 }
205
FillSuggestions(const std::vector<std::vector<base::string16>> & suggestions_list,std::vector<base::string16> * optional_suggestions)206 void FillSuggestions(
207 const std::vector<std::vector<base::string16>>& suggestions_list,
208 std::vector<base::string16>* optional_suggestions) {
209 DCHECK(optional_suggestions);
210 size_t num_languages = suggestions_list.size();
211
212 // Compute maximum number of suggestions in a single language.
213 size_t max_suggestions = 0;
214 for (const auto& suggestions : suggestions_list)
215 max_suggestions = std::max(max_suggestions, suggestions.size());
216
217 for (size_t count = 0; count < (max_suggestions * num_languages); ++count) {
218 size_t language = count % num_languages;
219 size_t index = count / num_languages;
220
221 if (suggestions_list[language].size() <= index)
222 continue;
223
224 const base::string16& suggestion = suggestions_list[language][index];
225 // Only add the suggestion if it's unique.
226 if (!base::Contains(*optional_suggestions, suggestion)) {
227 optional_suggestions->push_back(suggestion);
228 }
229 if (optional_suggestions->size() >= kMaxSuggestions) {
230 break;
231 }
232 }
233 }
234
235 } // namespace spellcheck
236