1 // Copyright 2016 Google Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 ////////////////////////////////////////////////////////////////////////////////
16
17 #include "util/languages/languages.h"
18
19 #include "util/basictypes.h"
20 #include "util/string_util.h"
21
22
default_language()23 Language default_language() {return ENGLISH;}
24
25
26 // Language names and codes
27
// One row of per-language metadata: a name plus up to three kinds of
// language code. A code field is NULL when the row has no code of that kind.
struct LanguageInfo {
  // Name of the language, e.g. "ENGLISH".
  const char* language_name_;

  // The ISO-639-1 code for the language.
  const char* language_code_639_1_;

  // The ISO-639-2 code for the language.
  const char* language_code_639_2_;

  // Some nonstandard code for the language (e.g. "zh-CN").
  const char* language_code_other_;
};
34
35 static const LanguageInfo kLanguageInfoTable[] = {
36 { "ENGLISH", "en", "eng", NULL},
37 { "DANISH", "da", "dan", NULL},
38 { "DUTCH", "nl", "dut", NULL},
39 { "FINNISH", "fi", "fin", NULL},
40 { "FRENCH", "fr", "fre", NULL},
41 { "GERMAN", "de", "ger", NULL},
42 { "HEBREW", "he", "heb", NULL},
43 { "ITALIAN", "it", "ita", NULL},
44 { "Japanese", "ja", "jpn", NULL},
45 { "Korean", "ko", "kor", NULL},
46 { "NORWEGIAN", "nb", "nor", NULL},
47 { "POLISH", "pl", "pol", NULL},
48 { "PORTUGUESE", "pt", "por", NULL},
49 { "RUSSIAN", "ru", "rus", NULL},
50 { "SPANISH", "es", "spa", NULL},
51 { "SWEDISH", "sv", "swe", NULL},
52 { "Chinese", "zh", "chi", "zh-CN"},
53 { "CZECH", "cs", "cze", NULL},
54 { "GREEK", "el", "gre", NULL},
55 { "ICELANDIC", "is", "ice", NULL},
56 { "LATVIAN", "lv", "lav", NULL},
57 { "LITHUANIAN", "lt", "lit", NULL},
58 { "ROMANIAN", "ro", "rum", NULL},
59 { "HUNGARIAN", "hu", "hun", NULL},
60 { "ESTONIAN", "et", "est", NULL},
61 // TODO: Although Teragram has two output names "TG_UNKNOWN_LANGUAGE"
62 // and "Unknown", they are essentially the same. Need to unify them.
63 // "un" and "ut" are invented by us, not from ISO-639.
64 //
65 { "TG_UNKNOWN_LANGUAGE", NULL, NULL, "ut"},
66 { "Unknown", NULL, NULL, "un"},
67 { "BULGARIAN", "bg", "bul", NULL},
68 { "CROATIAN", "hr", "scr", NULL},
69 { "SERBIAN", "sr", "scc", NULL},
70 { "IRISH", "ga", "gle", NULL},
71 { "GALICIAN", "gl", "glg", NULL},
72 // Impossible to tell Tagalog from Filipino at the moment.
73 // Use ISO 639-2 code for Filipino here.
74 { "TAGALOG", NULL, "fil", NULL},
75 { "TURKISH", "tr", "tur", NULL},
76 { "UKRAINIAN", "uk", "ukr", NULL},
77 { "HINDI", "hi", "hin", NULL},
78 { "MACEDONIAN", "mk", "mac", NULL},
79 { "BENGALI", "bn", "ben", NULL},
80 { "INDONESIAN", "id", "ind", NULL},
81 { "LATIN", "la", "lat", NULL},
82 { "MALAY", "ms", "may", NULL},
83 { "MALAYALAM", "ml", "mal", NULL},
84 { "WELSH", "cy", "wel", NULL},
85 { "NEPALI", "ne", "nep", NULL},
86 { "TELUGU", "te", "tel", NULL},
87 { "ALBANIAN", "sq", "alb", NULL},
88 { "TAMIL", "ta", "tam", NULL},
89 { "BELARUSIAN", "be", "bel", NULL},
90 { "JAVANESE", "jw", "jav", NULL},
91 { "OCCITAN", "oc", "oci", NULL},
92 { "URDU", "ur", "urd", NULL},
93 { "BIHARI", "bh", "bih", NULL},
94 { "GUJARATI", "gu", "guj", NULL},
95 { "THAI", "th", "tha", NULL},
96 { "ARABIC", "ar", "ara", NULL},
97 { "CATALAN", "ca", "cat", NULL},
98 { "ESPERANTO", "eo", "epo", NULL},
99 { "BASQUE", "eu", "baq", NULL},
100 { "INTERLINGUA", "ia", "ina", NULL},
101 { "KANNADA", "kn", "kan", NULL},
102 { "PUNJABI", "pa", "pan", NULL},
103 { "SCOTS_GAELIC", "gd", "gla", NULL},
104 { "SWAHILI", "sw", "swa", NULL},
105 { "SLOVENIAN", "sl", "slv", NULL},
106 { "MARATHI", "mr", "mar", NULL},
107 { "MALTESE", "mt", "mlt", NULL},
108 { "VIETNAMESE", "vi", "vie", NULL},
109 { "FRISIAN", "fy", "fry", NULL},
110 { "SLOVAK", "sk", "slo", NULL},
111 { "ChineseT",
112 NULL, NULL, // We intentionally set these 2 fields to NULL to avoid
113 // confusion between CHINESE_T and CHINESE.
114 "zh-TW"},
115 { "FAROESE", "fo", "fao", NULL},
116 { "SUNDANESE", "su", "sun", NULL},
117 { "UZBEK", "uz", "uzb", NULL},
118 { "AMHARIC", "am", "amh", NULL},
119 { "AZERBAIJANI", "az", "aze", NULL},
120 { "GEORGIAN", "ka", "geo", NULL},
121 { "TIGRINYA", "ti", "tir", NULL},
122 { "PERSIAN", "fa", "per", NULL},
123 { "BOSNIAN", "bs", "bos", NULL},
124 { "SINHALESE", "si", "sin", NULL},
125 { "NORWEGIAN_N", "nn", "nno", NULL},
126 { "PORTUGUESE_P", NULL, NULL, "pt-PT"},
127 { "PORTUGUESE_B", NULL, NULL, "pt-BR"},
128 { "XHOSA", "xh", "xho", NULL},
129 { "ZULU", "zu", "zul", NULL},
130 { "GUARANI", "gn", "grn", NULL},
131 { "SESOTHO", "st", "sot", NULL},
132 { "TURKMEN", "tk", "tuk", NULL},
133 { "KYRGYZ", "ky", "kir", NULL},
134 { "BRETON", "br", "bre", NULL},
135 { "TWI", "tw", "twi", NULL},
136 { "YIDDISH", "yi", "yid", NULL},
137 { "SERBO_CROATIAN", "sh", NULL, NULL},
138 { "SOMALI", "so", "som", NULL},
139 { "UIGHUR", "ug", "uig", NULL},
140 { "KURDISH", "ku", "kur", NULL},
141 { "MONGOLIAN", "mn", "mon", NULL},
142 { "ARMENIAN", "hy", "arm", NULL},
143 { "LAOTHIAN", "lo", "lao", NULL},
144 { "SINDHI", "sd", "snd", NULL},
145 { "RHAETO_ROMANCE", "rm", "roh", NULL},
146 { "AFRIKAANS", "af", "afr", NULL},
147 { "LUXEMBOURGISH", "lb", "ltz", NULL},
148 { "BURMESE", "my", "bur", NULL},
149 // KHMER is known as Cambodian for Google user interfaces.
150 { "KHMER", "km", "khm", NULL},
151 { "TIBETAN", "bo", "tib", NULL},
152 { "DHIVEHI", "dv", "div", NULL},
153 { "CHEROKEE", NULL, "chr", NULL},
154 { "SYRIAC", NULL, "syr", NULL},
155 { "LIMBU", NULL, NULL, "sit-NP"},
156 { "ORIYA", "or", "ori", NULL},
157 { "ASSAMESE", "as", "asm", NULL},
158 { "CORSICAN", "co", "cos", NULL},
159 { "INTERLINGUE", "ie", "ine", NULL},
160 { "KAZAKH", "kk", "kaz", NULL},
161 { "LINGALA", "ln", "lin", NULL},
162 { "MOLDAVIAN", "mo", "mol", NULL},
163 { "PASHTO", "ps", "pus", NULL},
164 { "QUECHUA", "qu", "que", NULL},
165 { "SHONA", "sn", "sna", NULL},
166 { "TAJIK", "tg", "tgk", NULL},
167 { "TATAR", "tt", "tat", NULL},
168 { "TONGA", "to", "tog", NULL},
169 { "YORUBA", "yo", "yor", NULL},
170 { "CREOLES_AND_PIDGINS_ENGLISH_BASED", NULL, "cpe", NULL},
171 { "CREOLES_AND_PIDGINS_FRENCH_BASED", NULL, "cpf", NULL},
172 { "CREOLES_AND_PIDGINS_PORTUGUESE_BASED", NULL, "cpp", NULL},
173 { "CREOLES_AND_PIDGINS_OTHER", NULL, "crp", NULL},
174 { "MAORI", "mi", "mao", NULL},
175 { "WOLOF", "wo", "wol", NULL},
176 { "ABKHAZIAN", "ab", "abk", NULL},
177 { "AFAR", "aa", "aar", NULL},
178 { "AYMARA", "ay", "aym", NULL},
179 { "BASHKIR", "ba", "bak", NULL},
180 { "BISLAMA", "bi", "bis", NULL},
181 { "DZONGKHA", "dz", "dzo", NULL},
182 { "FIJIAN", "fj", "fij", NULL},
183 { "GREENLANDIC", "kl", "kal", NULL},
184 { "HAUSA", "ha", "hau", NULL},
185 { "HAITIAN_CREOLE", "ht", NULL, NULL},
186 { "INUPIAK", "ik", "ipk", NULL},
187 { "INUKTITUT", "iu", "iku", NULL},
188 { "KASHMIRI", "ks", "kas", NULL},
189 { "KINYARWANDA", "rw", "kin", NULL},
190 { "MALAGASY", "mg", "mlg", NULL},
191 { "NAURU", "na", "nau", NULL},
192 { "OROMO", "om", "orm", NULL},
193 { "RUNDI", "rn", "run", NULL},
194 { "SAMOAN", "sm", "smo", NULL},
195 { "SANGO", "sg", "sag", NULL},
196 { "SANSKRIT", "sa", "san", NULL},
197 { "SISWANT", "ss", "ssw", NULL},
198 { "TSONGA", "ts", "tso", NULL},
199 { "TSWANA", "tn", "tsn", NULL},
200 { "VOLAPUK", "vo", "vol", NULL},
201 { "ZHUANG", "za", "zha", NULL},
202 { "KHASI", NULL, "kha", NULL},
203 { "SCOTS", NULL, "sco", NULL},
204 { "GANDA", "lg", "lug", NULL},
205 { "MANX", "gv", "glv", NULL},
206 { "MONTENEGRIN", NULL, NULL, "sr-ME"},
207 { "XX", NULL, NULL, "XX"},
208 };
209
210 COMPILE_ASSERT(arraysize(kLanguageInfoTable) == NUM_LANGUAGES + 1,
211 kLanguageInfoTable_has_incorrect_length);
212
213
214 // LANGUAGE NAMES
215
default_language_name()216 const char* default_language_name() {
217 return kLanguageInfoTable[ENGLISH].language_name_;
218 }
219
// Name reported for any Language value that is not valid.
static const char* const kInvalidLanguageName = "invalid_language";

// Returns the placeholder name used for invalid languages.
const char* invalid_language_name() {
  return kInvalidLanguageName;
}
225
LanguageName(Language lang)226 const char* LanguageName(Language lang) {
227 return IsValidLanguage(lang)
228 ? kLanguageInfoTable[lang].language_name_
229 : kInvalidLanguageName;
230 }
231
232
233
234 // LANGUAGE CODES
235
236
// Code reported when a language is invalid or has no code at all.
// The leading space is intentional: it prevents this string from ever
// matching a two-letter language code.
static const char* const kInvalidLanguageCode = " invalid_language_code";

// Returns the placeholder code used for invalid languages.
const char* invalid_language_code() {
  return kInvalidLanguageCode;
}
245
LanguageCode(Language lang)246 const char * LanguageCode(Language lang) {
247 if (! IsValidLanguage(lang))
248 return kInvalidLanguageCode;
249 const LanguageInfo& info = kLanguageInfoTable[lang];
250 if (info.language_code_639_1_) {
251 return info.language_code_639_1_;
252 } else if (info.language_code_639_2_) {
253 return info.language_code_639_2_;
254 } else if (info.language_code_other_) {
255 return info.language_code_other_;
256 } else {
257 return kInvalidLanguageCode;
258 }
259 }
260
default_language_code()261 const char* default_language_code() {
262 return kLanguageInfoTable[ENGLISH].language_code_639_1_;
263 }
264
LanguageCodeISO639_1(Language lang)265 const char* LanguageCodeISO639_1(Language lang) {
266 if (! IsValidLanguage(lang))
267 return kInvalidLanguageCode;
268 if (const char* code = kLanguageInfoTable[lang].language_code_639_1_)
269 return code;
270 return kInvalidLanguageCode;
271 }
272
LanguageCodeISO639_2(Language lang)273 const char* LanguageCodeISO639_2(Language lang) {
274 if (! IsValidLanguage(lang))
275 return kInvalidLanguageCode;
276 if (const char* code = kLanguageInfoTable[lang].language_code_639_2_)
277 return code;
278 return kInvalidLanguageCode;
279 }
280
LanguageCodeWithDialects(Language lang)281 const char* LanguageCodeWithDialects(Language lang) {
282 if (lang == CHINESE)
283 return "zh-CN";
284 return LanguageCode(lang);
285 }
286
287
288
LanguageFromCode(const char * lang_code,Language * language)289 bool LanguageFromCode(const char* lang_code, Language *language) {
290 *language = UNKNOWN_LANGUAGE;
291 if ( lang_code == NULL ) return false;
292
293 for ( int i = 0 ; i < kNumLanguages ; i++ ) {
294 const LanguageInfo& info = kLanguageInfoTable[i];
295 if ((info.language_code_639_1_ &&
296 !base::strcasecmp(lang_code, info.language_code_639_1_)) ||
297 (info.language_code_639_2_ &&
298 !base::strcasecmp(lang_code, info.language_code_639_2_)) ||
299 (info.language_code_other_ &&
300 !base::strcasecmp(lang_code, info.language_code_other_))) {
301 *language = static_cast<Language>(i);
302 return true;
303 }
304 }
305
306 // For convenience, this function can also parse the non-standard
307 // five-letter language codes "zh-cn" and "zh-tw" which are used by
308 // front-ends such as GWS to distinguish Simplified from Traditional
309 // Chinese.
310 if (!base::strcasecmp(lang_code, "zh-cn") ||
311 !base::strcasecmp(lang_code, "zh_cn")) {
312 *language = CHINESE;
313 return true;
314 }
315 if (!base::strcasecmp(lang_code, "zh-tw") ||
316 !base::strcasecmp(lang_code, "zh_tw")) {
317 *language = CHINESE_T;
318 return true;
319 }
320 if (!base::strcasecmp(lang_code, "sr-me") ||
321 !base::strcasecmp(lang_code, "sr_me")) {
322 *language = MONTENEGRIN;
323 return true;
324 }
325
326 // Process language-code synonyms.
327 if (!base::strcasecmp(lang_code, "he")) {
328 *language = HEBREW; // Use "iw".
329 return true;
330 }
331 if (!base::strcasecmp(lang_code, "in")) {
332 *language = INDONESIAN; // Use "id".
333 return true;
334 }
335 if (!base::strcasecmp(lang_code, "ji")) {
336 *language = YIDDISH; // Use "yi".
337 return true;
338 }
339
340 // Process language-detection synonyms.
341 // These distinct languages cannot be differentiated by our current
342 // language-detection algorithms.
343 if (!base::strcasecmp(lang_code, "fil")) {
344 *language = TAGALOG;
345 return true;
346 }
347
348 return false;
349 }
350