1 // Copyright 2013 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 //
16 // Author: dsites@google.com (Dick Sites)
17 //
18
19 #include "compact_lang_det_hint_code.h"
20
21 #include <stdlib.h> // for abs()
22 #include <stdio.h> // for sprintf()
23 #include <string.h> //
24 #include "lang_script.h"
25 #include "port.h"
26
27 using namespace std;
28
29 namespace CLD2 {
30
// Default prior weights. Weights are on the same scale as the W2..W12
// macros below, where each unit is roughly a 3x change in likelihood.
// NOTE(review): presumably applied to encoding-derived and language-derived
// hints respectively; the call sites are outside this part of the file.
static const int kCLDPriorEncodingWeight = 4; // 100x more likely
static const int kCLDPriorLanguageWeight = 8; // 10000x more likely
33
34
35 // Tables to map lang="..." language code lists to actual languages.
36 // based on scraping and hand-edits, dsites June 2011
37
38 // n = f(string, &a) gives list of n<=4 language pairs: primary, secondary
39
40 // For close pairs like ms/id, more weight on TLD and lang=
41 // Alternately, weaker boost but mark others of set as negative;
42 // makes "neither" an easier result.
43 // lang=en low weight 4
// tld=lu boost lu maybe 4. But lang= always overcomes tld and encoding
45 // (except maybe en)
46
47 // TLD to separate, e.g., burundi from rwanda
48
49 // Encoding lookup: OneLangProb array
50 // TLD lookup: tld OneLangProb pairs
51
52
53 typedef struct {
54 const char* const langtag; // Lowercased, hyphen only lookup key
55 const char* const langcode; // Canonical language codes; two if ambiguous
56 OneCLDLangPrior onelangprior1;
57 OneCLDLangPrior onelangprior2;
58 } LangTagLookup;
59
60 typedef struct {
61 const char* const tld; // Lowercased, hyphen only lookup key
62 OneCLDLangPrior onelangprior1;
63 OneCLDLangPrior onelangprior2;
64 } TLDLookup;
65
66
// Table-local weight increments, already positioned above the low 10 bits
// that hold the language in a packed prior (1024 == 1 << 10). Adding one to a
// Language boosts it; subtracting one demotes it. Each unit of weight is
// roughly a 3x change in likelihood, so the multipliers below are rounded.
#define W2 (2 * 1024)    // 3**2  ~= 10x more likely
#define W4 (4 * 1024)    // 3**4  ~= 100x more likely
#define W6 (6 * 1024)    // 3**6  ~= 1000x more likely
#define W8 (8 * 1024)    // 3**8  ~= 10K x more likely
#define W10 (10 * 1024)  // 3**10 ~= 100K x more likely
#define W12 (12 * 1024)  // 3**12 ~= 1M x more likely
73
74 // TODO: more about ba hr sr sr-ME and sl
75 // Temporary state of affairs:
76 // BOSNIAN CROATIAN MONTENEGRIN SERBIAN detecting just CROATIAN SERBIAN
77 // Eventually, we want to do all four, but it requires a CLD change to handle
78 // up to six languages per quadgram.
79
80
81 // Close pairs boost one of pair, demote other.
82 // Statistically close pairs:
83 // INDONESIAN/MALAY difficult to distinguish -- extra word-based lookups used
84 //
85 // INDONESIAN MALAY coef=0.4698 Problematic w/o extra words
86 // TIBETAN DZONGKHA coef=0.4571
87 // CZECH SLOVAK coef=0.4273
88 // NORWEGIAN NORWEGIAN_N coef=0.4182
89 //
90 // HINDI MARATHI coef=0.3795
91 // ZULU XHOSA coef=0.3716
92 //
93 // DANISH NORWEGIAN coef=0.3672 Usually OK
94 // BIHARI HINDI coef=0.3668 Usually OK
95 // ICELANDIC FAROESE coef=0.3519 Usually OK
96
//
// Table to look up lang= tags longer than three characters
// Overrides table below, which is truncated at first hyphen
// In alphabetical order for binary search
//
// Each row is {langtag, langcode, onelangprior1, onelangprior2}:
//   langtag        lookup key (lowercased, hyphen only)
//   langcode       canonical code(s) for the key; two if ambiguous
//   onelangprior1  Language + weight to boost
//   onelangprior2  0 if unused; otherwise a second Language that is either
//                  boosted (+W) for ambiguous tags or demoted (-W) when it is
//                  a statistically close language of the first
//
// Rows must stay sorted by langtag; the existing rows are in byte-wise order,
// so '-' sorts before letters (e.g. "am-am" before "amharic").
// The array is declared with exactly kCLDTable1Size elements, so keep that
// count in sync whenever a row is added or removed.
static const int kCLDTable1Size = 213;
static const LangTagLookup kCLDLangTagsHintTable1[kCLDTable1Size] = {
  {"abkhazian", "ab", ABKHAZIAN + W10, 0},
  {"afar", "aa", AFAR + W10, 0},
  {"afrikaans", "af", AFRIKAANS + W10, 0},
  {"akan", "ak", AKAN + W10, 0},
  {"albanian", "sq", ALBANIAN + W10, 0},
  {"am-am", "hy", ARMENIAN + W10, 0}, // 1:2 Armenian, not ambiguous
  {"amharic", "am", AMHARIC + W10, 0},
  {"arabic", "ar", ARABIC + W10, 0},
  {"argentina", "es", SPANISH + W10, 0},
  {"armenian", "hy", ARMENIAN + W10, 0},
  {"assamese", "as", ASSAMESE + W10, 0},
  {"aymara", "ay", AYMARA + W10, 0},
  {"azerbaijani", "az", AZERBAIJANI + W10, 0},

  {"bangla", "bn", BENGALI + W10, 0},
  {"bashkir", "ba", BASHKIR + W10, 0},
  {"basque", "eu", BASQUE + W10, 0},
  {"belarusian", "be", BELARUSIAN + W10, 0},
  {"bengali", "bn", BENGALI + W10, 0},
  {"bihari", "bh", BIHARI + W10, HINDI - W4},
  {"bislama", "bi", BISLAMA + W10, 0},
  {"bosnian", "bs", BOSNIAN + W10, 0}, // Bosnian => Bosnian
  {"br-br", "pt", PORTUGUESE + W10, 0}, // 1:2 Portuguese, not ambiguous
  {"br-fr", "br", BRETON + W10, 0}, // 1:2 Breton, not ambiguous
  {"breton", "br", BRETON + W10, 0},
  {"bulgarian", "bg", BULGARIAN + W10, 0},
  {"burmese", "my", BURMESE + W10, 0}, // Myanmar

  {"catalan", "ca", CATALAN + W10, 0},
  {"cherokee", "chr", CHEROKEE + W10, 0},
  {"chichewa", "ny", NYANJA + W10, 0},

  {"chinese", "zh", CHINESE + W10, 0},
  {"chinese-t", "zhT", CHINESE_T + W10, 0},
  {"chineset", "zhT", CHINESE_T + W10, 0},
  {"corsican", "co", CORSICAN + W10, 0},
  {"cpf-hat", "ht", HAITIAN_CREOLE + W10, 0}, // Creole, French-based
  {"croatian", "hr", CROATIAN + W10, 0},
  {"czech", "cs", CZECH + W10, SLOVAK - W4},

  {"danish", "da", DANISH + W10, NORWEGIAN - W4},
  {"deutsch", "de", GERMAN + W10, 0},
  {"dhivehi", "dv", DHIVEHI + W10, 0},
  {"dutch", "nl", DUTCH + W10, 0},
  {"dzongkha", "dz", DZONGKHA + W10, TIBETAN - W4},

  {"ell-gr", "el", GREEK + W10, 0},
  {"english", "en", ENGLISH + W4, 0},
  {"esperanto", "eo", ESPERANTO + W10, 0},
  {"estonian", "et", ESTONIAN + W10, 0},
  {"euc-jp", "ja", JAPANESE + W10, 0}, // Japanese encoding
  {"euc-kr", "ko", KOREAN + W10, 0}, // Korean encoding

  {"faroese", "fo", FAROESE + W10, ICELANDIC - W4},
  {"fijian", "fj", FIJIAN + W10, 0},
  {"finnish", "fi", FINNISH + W10, 0},
  {"fran", "fr", FRENCH + W10, 0}, // Truncated at non-ASCII
  {"francais", "fr", FRENCH + W10, 0},
  {"french", "fr", FRENCH + W10, 0},
  {"frisian", "fy", FRISIAN + W10, 0},

  {"ga-es", "gl", GALICIAN + W10, 0}, // 1:2 Galician, not ambiguous
  {"galician", "gl", GALICIAN + W10, 0},
  {"ganda", "lg", GANDA + W10, 0},
  {"georgian", "ka", GEORGIAN + W10, 0},
  {"german", "de", GERMAN + W10, 0},
  {"greek", "el", GREEK + W10, 0},
  {"greenlandic", "kl", GREENLANDIC + W10, 0},
  {"guarani", "gn", GUARANI + W10, 0},
  {"gujarati", "gu", GUJARATI + W10, 0},

  {"haitian_creole", "ht", HAITIAN_CREOLE + W10, 0},
  {"hausa", "ha", HAUSA + W10, 0},
  {"hawaiian", "haw", HAWAIIAN + W10, 0},
  {"hebrew", "he", HEBREW + W10, 0},
  {"hindi", "hi", HINDI + W10, MARATHI - W4},
  {"hn-in", "hi", HINDI + W10, MARATHI - W4},
  {"hungarian", "hu", HUNGARIAN + W10, 0},

  {"icelandic", "is", ICELANDIC + W10, FAROESE - W4},
  {"igbo", "ig", IGBO + W10, 0},
  {"indonesian", "id", INDONESIAN + W10, MALAY - W4},
  {"interlingua", "ia", INTERLINGUA + W10, 0},
  {"interlingue", "ie", INTERLINGUE + W10, 0},
  // 1:2 iu-Cans ik-Latn
  {"inuktitut", "iu,ik", INUKTITUT + W10, INUPIAK + W10}, // 1:2
  {"inupiak", "ik,iu", INUPIAK + W10, INUKTITUT + W10}, // 1:2
  {"ir-ie", "ga", IRISH + W10, 0}, // Irish
  {"irish", "ga", IRISH + W10, 0},
  {"italian", "it", ITALIAN + W10, 0},

  {"ja-euc", "ja", JAPANESE + W10, 0}, // Japanese encoding
  {"jan-jp", "ja", JAPANESE + W10, 0}, // Japanese encoding
  {"japanese", "ja", JAPANESE + W10, 0},
  {"javanese", "jw", JAVANESE + W10, 0},

  {"kannada", "kn", KANNADA + W10, 0},
  {"kashmiri", "ks", KASHMIRI + W10, 0},
  {"kazakh", "kk", KAZAKH + W10, 0},
  {"khasi", "kha", KHASI + W10, 0},
  {"khmer", "km", KHMER + W10, 0},
  {"kinyarwanda", "rw", KINYARWANDA + W10, 0},
  {"klingon", "tlh", X_KLINGON + W10, 0},
  {"korean", "ko", KOREAN + W10, 0},
  {"kurdish", "ku", KURDISH + W10, 0},
  {"kyrgyz", "ky", KYRGYZ + W10, 0},

  {"laothian", "lo", LAOTHIAN + W10, 0},
  {"latin", "la", LATIN + W10, 0},
  {"latvian", "lv", LATVIAN + W10, 0},
  {"limbu", "sit", LIMBU + W10, 0},
  {"lingala", "ln", LINGALA + W10, 0},
  {"lithuanian", "lt", LITHUANIAN + W10, 0},
  {"luxembourgish", "lb", LUXEMBOURGISH + W10, 0},

  {"macedonian", "mk", MACEDONIAN + W10, 0},
  {"malagasy", "mg", MALAGASY + W10, 0},
  {"malay", "ms", MALAY + W10, INDONESIAN - W4},
  {"malayalam", "ml", MALAYALAM + W10, 0},
  {"maltese", "mt", MALTESE + W10, 0},
  {"manx", "gv", MANX + W10, 0},
  {"maori", "mi", MAORI + W10, 0},
  {"marathi", "mr", MARATHI + W10, HINDI - W4},
  {"mauritian_creole", "mfe", MAURITIAN_CREOLE + W10, 0},
  {"moldavian", "mo", ROMANIAN + W10, 0},
  {"mongolian", "mn", MONGOLIAN + W10, 0},
  {"montenegrin", "sr-me", MONTENEGRIN + W10, 0},
  {"myanmar", "my", BURMESE + W10, 0}, // Myanmar
  {"nauru", "na", NAURU + W10, 0},
  {"ndebele", "nr", NDEBELE + W10, 0},
  {"nepali", "ne", NEPALI + W10, 0},
  {"no-bok", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, // Bokmaal
  {"no-bokmaal", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},
  {"no-nb", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, // Bokmaal
  {"no-no", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},
  {"no-nyn", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4}, // Nynorsk
  {"no-nynorsk", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4},
  {"norwegian", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},
  {"norwegian_n", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4},
  {"nyanja", "ny", NYANJA + W10, 0},

  {"occitan", "oc", OCCITAN + W10, 0},
  {"oriya", "or", ORIYA + W10, 0},
  {"oromo", "om", OROMO + W10, 0},
  {"parsi", "fa", PERSIAN + W10, 0},

  {"pashto", "ps", PASHTO + W10, 0},
  {"pedi", "nso", PEDI + W10, 0},
  {"persian", "fa", PERSIAN + W10, 0},
  {"polish", "pl", POLISH + W10, 0},
  {"polska", "pl", POLISH + W10, 0},
  {"polski", "pl", POLISH + W10, 0},
  {"portugu", "pt", PORTUGUESE + W10, 0}, // Truncated at non-ASCII
  {"portuguese", "pt", PORTUGUESE + W10, 0},
  {"punjabi", "pa", PUNJABI + W10, 0},

  {"quechua", "qu", QUECHUA + W10, 0},

  {"rhaeto_romance", "rm", RHAETO_ROMANCE + W10, 0},
  {"romanian", "ro", ROMANIAN + W10, 0},
  {"rundi", "rn", RUNDI + W10, 0},
  {"russian", "ru", RUSSIAN + W10, 0},

  {"samoan", "sm", SAMOAN + W10, 0},
  {"sango", "sg", SANGO + W10, 0},
  {"sanskrit", "sa", SANSKRIT + W10, 0},
  {"scots", "sco", SCOTS + W10, ENGLISH - W4},
  {"scots_gaelic", "gd", SCOTS_GAELIC + W10, 0},
  {"serbian", "sr", SERBIAN + W10, 0},
  {"seselwa", "crs", SESELWA + W10, 0},
  {"sesotho", "st", SESOTHO + W10, 0},
  {"shift-jis", "ja", JAPANESE + W10, 0}, // Japanese encoding
  {"shift-js", "ja", JAPANESE + W10, 0}, // Japanese encoding
  {"shona", "sn", SHONA + W10, 0},
  {"si-lk", "si", SINHALESE + W10, 0}, // 1:2 Sri Lanka, not ambiguous
  {"si-si", "sl", SLOVENIAN + W10, 0}, // 1:2 Slovenia, not ambiguous
  {"si-sl", "sl", SLOVENIAN + W10, 0}, // 1:2 Slovenia, not ambiguous
  {"sindhi", "sd", SINDHI + W10, 0},
  {"sinhalese", "si", SINHALESE + W10, 0},
  {"siswant", "ss", SISWANT + W10, 0},
  {"sit-np", "sit", LIMBU + W10, 0},
  {"slovak", "sk", SLOVAK + W10, CZECH - W4},
  {"slovenian", "sl", SLOVENIAN + W10, 0},
  {"somali", "so", SOMALI + W10, 0},
  {"spanish", "es", SPANISH + W10, 0},
  {"sr-me", "sr-me", MONTENEGRIN + W10, 0}, // Montenegrin => Montenegrin
  {"sundanese", "su", SUNDANESE + W10, 0},
  {"suomi", "fi", FINNISH + W10, 0}, // Finnish
  {"swahili", "sw", SWAHILI + W10, 0},
  {"swedish", "sv", SWEDISH + W10, 0},
  {"syriac", "syr", SYRIAC + W10, 0},

  {"tagalog", "tl", TAGALOG + W10, 0},
  {"tajik", "tg", TAJIK + W10, 0},
  {"tamil", "ta", TAMIL + W10, 0},
  {"tatar", "tt", TATAR + W10, 0},
  {"tb-tb", "bo", TIBETAN + W10, DZONGKHA - W4}, // Tibet
  {"tchinese", "zhT", CHINESE_T + W10, 0},
  {"telugu", "te", TELUGU + W10, 0},
  {"thai", "th", THAI + W10, 0},
  {"tibetan", "bo", TIBETAN + W10, DZONGKHA - W4},
  {"tigrinya", "ti", TIGRINYA + W10, 0},
  {"tonga", "to", TONGA + W10, 0},
  {"tsonga", "ts", TSONGA + W10, 0},
  {"tswana", "tn", TSWANA + W10, 0},
  {"tt-ru", "tt", TATAR + W10, 0},
  {"tur-tr", "tr", TURKISH + W10, 0},
  {"turkish", "tr", TURKISH + W10, 0},
  {"turkmen", "tk", TURKMEN + W10, 0},
  {"uighur", "ug", UIGHUR + W10, 0},
  {"ukrainian", "uk", UKRAINIAN + W10, 0},
  {"urdu", "ur", URDU + W10, 0},
  {"uzbek", "uz", UZBEK + W10, 0},

  {"venda", "ve", VENDA + W10, 0},
  {"vietnam", "vi", VIETNAMESE + W10, 0},
  {"vietnamese", "vi", VIETNAMESE + W10, 0},
  {"volapuk", "vo", VOLAPUK + W10, 0},

  {"welsh", "cy", WELSH + W10, 0},
  {"wolof", "wo", WOLOF + W10, 0},

  {"xhosa", "xh", XHOSA + W10, ZULU - W4},

  {"yiddish", "yi", YIDDISH + W10, 0},
  {"yoruba", "yo", YORUBA + W10, 0},

  {"zh-classical", "zhT", CHINESE_T + W10, 0},
  {"zh-cn", "zh", CHINESE + W10, 0},
  {"zh-hans", "zh", CHINESE + W10, 0},
  {"zh-hant", "zhT", CHINESE_T + W10, 0},
  {"zh-hk", "zhT", CHINESE_T + W10, 0},
  {"zh-min-nan", "zhT", CHINESE_T + W10, 0}, // Min Nan => ChineseT
  {"zh-sg", "zhT", CHINESE_T + W10, 0},
  {"zh-tw", "zhT", CHINESE_T + W10, 0},
  {"zh-yue", "zh", CHINESE + W10, 0}, // Yue (Cantonese) => Chinese
  {"zhuang", "za", ZHUANG + W10, 0},
  {"zulu", "zu", ZULU + W10, XHOSA - W4},
};
342
343
344
// Table to look up lang= tags of two/three characters after truncate at hyphen
// In alphabetical order for binary search
//
// Each row is {langtag, langcode, onelangprior1, onelangprior2}:
//   langtag        lookup key: the two- or three-letter prefix of the tag
//   langcode       canonical code(s); two, comma-separated, when the short
//                  tag could mean either a language or a country ("1:2" rows)
//   onelangprior1  Language + weight to boost
//   onelangprior2  0 if unused; otherwise a second boosted candidate (+W) or
//                  a demoted close language (-W)
//
// Rows must stay sorted by langtag. The array is declared with exactly
// kCLDTable2Size elements, so keep that count in sync with the row count.
static const int kCLDTable2Size = 257;
static const LangTagLookup kCLDLangTagsHintTable2[kCLDTable2Size] = {
  {"aa", "aa", AFAR + W10, 0},
  {"ab", "ab", ABKHAZIAN + W10, 0},
  {"af", "af", AFRIKAANS + W10, 0},
  {"ak", "ak", AKAN + W10, 0},
  {"al", "sq", ALBANIAN + W10, 0}, // Albania
  {"am", "am,hy", AMHARIC + W10, ARMENIAN + W10}, // 1:2 Amharic Armenian
  {"ar", "ar", ARABIC + W10, 0},
  {"ara", "ar", ARABIC + W10, 0},
  {"arm", "hy", ARMENIAN + W10, 0}, // Armenia
  {"arz", "ar", ARABIC + W10, 0}, // Egyptian Arabic
  {"as", "as", ASSAMESE + W10, 0},
  {"at", "de", GERMAN + W10, 0}, // Austria
  {"au", "de", GERMAN + W10, 0}, // Austria
  {"ay", "ay", AYMARA + W10, 0},
  {"az", "az", AZERBAIJANI + W10, 0},
  {"aze", "az", AZERBAIJANI + W10, 0},

  {"ba", "ba,bs", BASHKIR + W10, BOSNIAN + W10}, // 1:2 Bashkir Bosnia
  {"be", "be", BELARUSIAN + W10, 0},
  {"bel", "be", BELARUSIAN + W10, 0},
  {"bg", "bg", BULGARIAN + W10, 0},
  {"bh", "bh", BIHARI + W10, HINDI - W4},
  {"bi", "bi", BISLAMA + W10, 0},
  {"big", "zhT", CHINESE_T + W10, 0}, // Big5 encoding
  {"bm", "ms", MALAY + W10, INDONESIAN - W4}, // Bahasa Malaysia
  {"bn", "bn", BENGALI + W10, 0},
  {"bo", "bo", TIBETAN + W10, DZONGKHA - W4},
  // 1:2 Breton, Brazil country code, both Latn .br TLD enough for pt to win
  {"br", "br,pt", BRETON + W10, PORTUGUESE + W8}, // 1:2 Breton, Brazil
  {"bs", "bs", BOSNIAN + W10, 0}, // Bosnian => Bosnian

  {"ca", "ca", CATALAN + W10, 0},
  {"cat", "ca", CATALAN + W10, 0},
  {"ch", "de,fr", GERMAN + W10, FRENCH + W10}, // 1:2 Switzerland
  {"chn", "zh", CHINESE + W10, 0},
  {"chr", "chr", CHEROKEE + W10, 0},
  {"ckb", "ku", KURDISH + W10, 0}, // Central Kurdish
  {"cn", "zh,zhT", CHINESE + W6, CHINESE_T + W4}, // Ambiguous, so weaker.
  // Offset by 2 so that TLD=tw or
  // enc=big5 will put zhT ahead
  {"co", "co", CORSICAN + W10, 0},
  {"cro", "hr", CROATIAN + W10, 0}, // Croatia
  {"crs", "crs", SESELWA + W10, 0},
  {"cs", "cs", CZECH + W10, SLOVAK - W4},
  {"ct", "ca", CATALAN + W10, 0},
  {"cy", "cy", WELSH + W10, 0},
  {"cym", "cy", WELSH + W10, 0},
  {"cz", "cs", CZECH + W10, SLOVAK - W4},

  {"da", "da", DANISH + W10, NORWEGIAN - W4},
  {"dan", "da", DANISH + W10, NORWEGIAN - W4},
  {"de", "de", GERMAN + W10, 0},
  {"deu", "de", GERMAN + W10, 0},
  {"div", "dv", DHIVEHI + W10, 0},
  {"dk", "da", DANISH + W10, NORWEGIAN - W4}, // Denmark
  {"dut", "nl", DUTCH + W10, 0}, // Dutch
  {"dv", "dv", DHIVEHI + W10, 0},
  {"dz", "dz", DZONGKHA + W10, TIBETAN - W4},

  {"ee", "et", ESTONIAN + W10, 0}, // Estonia
  {"eg", "ar", ARABIC + W10, 0}, // Egypt
  {"el", "el", GREEK + W10, 0},
  {"en", "en", ENGLISH + W4, 0},
  {"eng", "en", ENGLISH + W4, 0},
  {"eo", "eo", ESPERANTO + W10, 0},
  {"er", "ur", URDU + W10, 0}, // "Erdu"
  {"es", "es", SPANISH + W10, 0},
  {"esp", "es", SPANISH + W10, 0},
  {"est", "et", ESTONIAN + W10, 0},
  {"et", "et", ESTONIAN + W10, 0},
  {"eu", "eu", BASQUE + W10, 0},

  {"fa", "fa", PERSIAN + W10, 0},
  {"far", "fa", PERSIAN + W10, 0},
  {"fi", "fi", FINNISH + W10, 0},
  {"fil", "tl", TAGALOG + W10, 0}, // Philippines
  {"fj", "fj", FIJIAN + W10, 0},
  {"fo", "fo", FAROESE + W10, ICELANDIC - W4},
  {"fr", "fr", FRENCH + W10, 0},
  {"fra", "fr", FRENCH + W10, 0},
  {"fre", "fr", FRENCH + W10, 0},
  {"fy", "fy", FRISIAN + W10, 0},

  {"ga", "ga,gl", IRISH + W10, GALICIAN + W10}, // 1:2 Irish, Galician
  {"gae", "gd,ga", SCOTS_GAELIC + W10, IRISH + W10}, // 1:2 Gaelic, either
  {"gal", "gl", GALICIAN + W10, 0},
  {"gb", "zh", CHINESE + W10, 0}, // GB2312 encoding
  {"gbk", "zh", CHINESE + W10, 0}, // GBK encoding
  {"gd", "gd", SCOTS_GAELIC + W10, 0},
  {"ge", "ka", GEORGIAN + W10, 0}, // Georgia
  {"geo", "ka", GEORGIAN + W10, 0},
  {"ger", "de", GERMAN + W10, 0},
  {"gl", "gl", GALICIAN + W10, 0}, // Also Greenland; hard to confuse
  {"gn", "gn", GUARANI + W10, 0},
  {"gr", "el", GREEK + W10, 0}, // Greece
  {"gu", "gu", GUJARATI + W10, 0},
  {"gv", "gv", MANX + W10, 0},

  {"ha", "ha", HAUSA + W10, 0},
  {"hat", "ht", HAITIAN_CREOLE + W10, 0}, // Haiti
  {"haw", "haw", HAWAIIAN + W10, 0},
  {"hb", "he", HEBREW + W10, 0},
  {"he", "he", HEBREW + W10, 0},
  {"heb", "he", HEBREW + W10, 0},
  {"hi", "hi", HINDI + W10, MARATHI - W4},
  {"hk", "zhT", CHINESE_T + W10, 0}, // Hong Kong
  {"hr", "hr", CROATIAN + W10, 0},
  {"ht", "ht", HAITIAN_CREOLE + W10, 0},
  {"hu", "hu", HUNGARIAN + W10, 0},
  {"hun", "hu", HUNGARIAN + W10, 0},
  {"hy", "hy", ARMENIAN + W10, 0},

  {"ia", "ia", INTERLINGUA + W10, 0},
  {"ice", "is", ICELANDIC + W10, FAROESE - W4}, // Iceland
  {"id", "id", INDONESIAN + W10, MALAY - W4},
  {"ids", "id", INDONESIAN + W10, MALAY - W4},
  {"ie", "ie", INTERLINGUE + W10, 0},
  {"ig", "ig", IGBO + W10, 0},
  // 1:2 iu-Cans ik-Latn
  {"ik", "ik,iu", INUPIAK + W10, INUKTITUT + W10}, // 1:2
  {"in", "id", INDONESIAN + W10, MALAY - W4},
  {"ind", "id", INDONESIAN + W10, MALAY - W4}, // Indonesia
  {"inu", "iu,ik", INUKTITUT + W10, INUPIAK + W10}, // 1:2
  {"is", "is", ICELANDIC + W10, FAROESE - W4},
  {"it", "it", ITALIAN + W10, 0},
  {"ita", "it", ITALIAN + W10, 0},
  {"iu", "iu,ik", INUKTITUT + W10, INUPIAK + W10}, // 1:2
  {"iw", "he", HEBREW + W10, 0},

  {"ja", "ja", JAPANESE + W10, 0},
  {"jp", "ja", JAPANESE + W10, 0}, // Japan
  {"jpn", "ja", JAPANESE + W10, 0},
  {"jv", "jw", JAVANESE + W10, 0},
  {"jw", "jw", JAVANESE + W10, 0},

  {"ka", "ka", GEORGIAN + W10, 0},
  {"kc", "qu", QUECHUA + W10, 0}, // (K)Quechua
  {"kg", "ky", KYRGYZ + W10, 0}, // Kyrgyzstan
  {"kh", "km", KHMER + W10, 0}, // Country code Khmer (Cambodia)
  {"kha", "kha", KHASI + W10, 0},
  {"kk", "kk", KAZAKH + W10, 0}, // Kazakh
  {"kl", "kl", GREENLANDIC + W10, 0},
  {"km", "km", KHMER + W10, 0},
  {"kn", "kn", KANNADA + W10, 0},
  {"ko", "ko", KOREAN + W10, 0},
  {"kor", "ko", KOREAN + W10, 0},
  {"kr", "ko", KOREAN + W10, 0}, // Country code Korea
  {"ks", "ks", KASHMIRI + W10, 0},
  {"ksc", "ko", KOREAN + W10, 0}, // KSC encoding
  {"ku", "ku", KURDISH + W10, 0},
  {"ky", "ky", KYRGYZ + W10, 0},
  {"kz", "kk", KAZAKH + W10, 0}, // Kazakhstan
  {"la", "la", LATIN + W10, 0},
  {"lao", "lo", LAOTHIAN + W10, 0}, // Laos

  {"lb", "lb", LUXEMBOURGISH + W10, 0},
  {"lg", "lg", GANDA + W10, 0},
  {"lit", "lt", LITHUANIAN + W10, 0},
  {"ln", "ln", LINGALA + W10, 0},
  {"lo", "lo", LAOTHIAN + W10, 0},
  {"lt", "lt", LITHUANIAN + W10, 0},
  {"ltu", "lt", LITHUANIAN + W10, 0},
  {"lv", "lv", LATVIAN + W10, 0},

  {"mfe", "mfe", MAURITIAN_CREOLE + W10, 0},
  {"mg", "mg", MALAGASY + W10, 0},
  {"mi", "mi", MAORI + W10, 0},
  {"mk", "mk", MACEDONIAN + W10, 0},
  {"ml", "ml", MALAYALAM + W10, 0},
  {"mn", "mn", MONGOLIAN + W10, 0},
  {"mo", "mo", ROMANIAN + W10, 0},
  {"mon", "mn", MONGOLIAN + W10, 0}, // Mongolian
  {"mr", "mr", MARATHI + W10, HINDI - W4},
  {"ms", "ms", MALAY + W10, INDONESIAN - W4},
  {"mt", "mt", MALTESE + W10, 0},
  {"mx", "es", SPANISH + W10, 0}, // Mexico
  {"my", "my,ms", BURMESE + W10, MALAY + W10}, // Myanmar, Malaysia

  {"na", "na", NAURU + W10, 0},
  {"nb", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},
  {"ne", "ne", NEPALI + W10, 0},
  {"nl", "nl", DUTCH + W10, 0},
  {"nn", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4},
  {"no", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},
  {"nr", "nr", NDEBELE + W10, 0},
  {"nso", "nso", PEDI + W10, 0},
  {"ny", "ny", NYANJA + W10, 0},

  {"oc", "oc", OCCITAN + W10, 0},
  {"om", "om", OROMO + W10, 0},
  {"or", "or", ORIYA + W10, 0},

  {"pa", "pa,ps", PUNJABI + W10, PASHTO + W10}, // 1:2 pa-Guru ps-Arab
  {"per", "fa", PERSIAN + W10, 0},
  {"ph", "tl", TAGALOG + W10, 0}, // Philippines
  {"pk", "ur", URDU + W10, 0}, // Pakistan
  {"pl", "pl", POLISH + W10, 0},
  {"pnb", "pa", PUNJABI + W10, 0}, // Western Punjabi
  {"pol", "pl", POLISH + W10, 0},
  {"por", "pt", PORTUGUESE + W10, 0},
  {"ps", "ps", PASHTO + W10, 0},
  {"pt", "pt", PORTUGUESE + W10, 0},
  {"ptg", "pt", PORTUGUESE + W10, 0},
  {"qc", "fr", FRENCH + W10, 0}, // Quebec "country" code
  {"qu", "qu", QUECHUA + W10, 0},

  {"rm", "rm", RHAETO_ROMANCE + W10, 0},
  {"rn", "rn", RUNDI + W10, 0},
  {"ro", "ro", ROMANIAN + W10, 0},
  {"rs", "sr", SERBIAN + W10, 0}, // Serbia country code
  {"ru", "ru", RUSSIAN + W10, 0},
  {"rus", "ru", RUSSIAN + W10, 0},
  {"rw", "rw", KINYARWANDA + W10, 0},

  {"sa", "sa", SANSKRIT + W10, 0},
  {"sco", "sco", SCOTS + W10, ENGLISH - W4},
  {"sd", "sd", SINDHI + W10, 0},
  {"se", "sv", SWEDISH + W10, 0},
  {"sg", "sg", SANGO + W10, 0},
  {"si", "si,sl", SINHALESE + W10, SLOVENIAN + W10}, // 1:2 Sinhalese, Slovenia
  {"sk", "sk", SLOVAK + W10, CZECH - W4},
  {"sl", "sl", SLOVENIAN + W10, 0},
  {"slo", "sl", SLOVENIAN + W10, 0},
  {"sm", "sm", SAMOAN + W10, 0},
  {"sn", "sn", SHONA + W10, 0},
  {"so", "so", SOMALI + W10, 0},
  {"sp", "es", SPANISH + W10, 0},
  {"sq", "sq", ALBANIAN + W10, 0},
  {"sr", "sr", SERBIAN + W10, 0},
  {"srb", "sr", SERBIAN + W10, 0},
  {"srl", "sr", SERBIAN + W10, 0}, // Serbian Latin
  {"srp", "sr", SERBIAN + W10, 0},
  {"ss", "ss", SISWANT + W10, 0},
  {"st", "st", SESOTHO + W10, 0},
  {"su", "su", SUNDANESE + W10, 0},
  {"sv", "sv", SWEDISH + W10, 0},
  {"sve", "sv", SWEDISH + W10, 0},
  {"sw", "sw", SWAHILI + W10, 0},
  {"swe", "sv", SWEDISH + W10, 0},
  {"sy", "syr", SYRIAC + W10, 0},
  {"syr", "syr", SYRIAC + W10, 0},

  {"ta", "ta", TAMIL + W10, 0},
  {"te", "te", TELUGU + W10, 0},
  {"tg", "tg", TAJIK + W10, 0},
  {"th", "th", THAI + W10, 0},
  {"ti", "ti,bo", TIGRINYA + W10, TIBETAN + W10}, // 1:2 Tigrinya, Tibet
  {"tj", "tg", TAJIK + W10, 0}, // Tajikistan
  {"tk", "tk", TURKMEN + W10, 0},
  {"tl", "tl", TAGALOG + W10, 0},
  {"tlh", "tlh", X_KLINGON + W10, 0},
  {"tn", "tn", TSWANA + W10, 0},
  {"to", "to", TONGA + W10, 0},
  {"tr", "tr", TURKISH + W10, 0},
  {"ts", "ts", TSONGA + W10, 0},
  {"tt", "tt", TATAR + W10, 0},
  {"tw", "ak,zhT", AKAN + W10, CHINESE_T + W10}, // 1:2 Twi => Akan, Taiwan
  {"twi", "ak", AKAN + W10, 0}, // Twi => Akan

  {"ua", "uk", UKRAINIAN + W10, 0}, // Ukraine
  {"ug", "ug", UIGHUR + W10, 0},
  {"uk", "uk", UKRAINIAN + W10, 0},
  {"ur", "ur", URDU + W10, 0},
  {"uz", "uz", UZBEK + W10, 0},

  {"va", "ca", CATALAN + W10, 0}, // Valencia => Catalan
  {"val", "ca", CATALAN + W10, 0}, // Valencia => Catalan
  {"ve", "ve", VENDA + W10, 0},
  {"vi", "vi", VIETNAMESE + W10, 0},
  {"vie", "vi", VIETNAMESE + W10, 0},
  {"vn", "vi", VIETNAMESE + W10, 0},
  {"vo", "vo", VOLAPUK + W10, 0},

  {"wo", "wo", WOLOF + W10, 0},

  {"xh", "xh", XHOSA + W10, ZULU - W4},
  {"xho", "xh", XHOSA + W10, ZULU - W4},

  {"yi", "yi", YIDDISH + W10, 0},
  {"yo", "yo", YORUBA + W10, 0},

  {"za", "za", ZHUANG + W10, 0},
  {"zh", "zh", CHINESE + W10, 0},
  {"zht", "zhT", CHINESE_T + W10, 0},
  {"zu", "zu", ZULU + W10, XHOSA - W4},
};
635
636
637 // Possibly map to tl:
638 // -LangTags tl-Latn /7val.com/ ,bcl 2 Central Bicolano
639 // -LangTags tl-Latn /7val.com/ ,ceb 6 Cebuano
640 // -LangTags tl-Latn /7val.com/ ,war 1 Waray
641
642
643
644 // Table to look up country TLD (no general TLD)
645 // In alphabetical order for binary search
646 static const int kCLDTable3Size = 181;
647 static const TLDLookup kCLDTLDHintTable[kCLDTable3Size] = {
648 {"ac", JAPANESE + W2, 0},
649 {"ad", CATALAN + W4, 0},
650 {"ae", ARABIC + W4, 0},
651 {"af", PASHTO + W4, PERSIAN + W4},
652 {"ag", GERMAN + W2, 0}, // meager
653 // {"ai", 0, 0}, // meager
654 {"al", ALBANIAN + W4, 0},
655 {"am", ARMENIAN + W4, 0},
656 {"an", DUTCH + W4, 0}, // meager
657 {"ao", PORTUGUESE + W4, 0},
658 // {"aq", 0, 0}, // meager
659 {"ar", SPANISH + W4, 0},
660 // {"as", 0, 0},
661 {"at", GERMAN + W4, 0},
662 {"au", ENGLISH + W2, 0},
663 {"aw", DUTCH + W4, 0},
664 {"ax", SWEDISH + W4, 0},
665 {"az", AZERBAIJANI + W4, 0},
666
667 {"ba", BOSNIAN + W8, CROATIAN - W4},
668 // {"bb", 0, 0},
669 {"bd", BENGALI + W4, 0},
670 {"be", DUTCH + W4, FRENCH + W4},
671 {"bf", FRENCH + W4, 0},
672 {"bg", BULGARIAN + W4, 0},
673 {"bh", ARABIC + W4, 0},
674 {"bi", RUNDI + W4, FRENCH + W4},
675 {"bj", FRENCH + W4, 0},
676 {"bm", ENGLISH + W2, 0},
677 {"bn", MALAY + W4, INDONESIAN - W4},
678 {"bo", SPANISH + W4, AYMARA + W2}, // and GUARANI QUECHUA
679 {"br", PORTUGUESE + W4, 0},
680 // {"bs", 0, 0},
681 {"bt", DZONGKHA + W10, TIBETAN - W10}, // Strong presumption of Dzongha
682 {"bw", TSWANA + W4, 0},
683 {"by", BELARUSIAN + W4, 0},
684 // {"bz", 0, 0},
685
686 {"ca", FRENCH + W4, ENGLISH + W2},
687 {"cat", CATALAN + W4, 0},
688 {"cc", 0, 0},
689 {"cd", FRENCH + W4, 0},
690 {"cf", FRENCH + W4, 0},
691 {"cg", FRENCH + W4, 0},
692 {"ch", GERMAN + W4, FRENCH + W4},
693 {"ci", FRENCH + W4, 0},
694 // {"ck", 0, 0},
695 {"cl", SPANISH + W4, 0},
696 {"cm", FRENCH + W4, 0},
697 {"cn", CHINESE + W4, 0},
698 {"co", SPANISH + W4, 0},
699 {"cr", SPANISH + W4, 0},
700 {"cu", SPANISH + W4, 0},
701 {"cv", PORTUGUESE + W4, 0},
702 // {"cx", 0, 0},
703 {"cy", GREEK + W4, TURKISH + W4},
704 {"cz", CZECH + W4, SLOVAK - W4},
705
706 {"de", GERMAN + W4, 0},
707 {"dj", 0, 0},
708 {"dk", DANISH + W4, NORWEGIAN - W4},
709 {"dm", 0, 0},
710 {"do", SPANISH + W4, 0},
711 {"dz", FRENCH + W4, ARABIC + W4},
712
713 {"ec", SPANISH + W4, 0},
714 {"ee", ESTONIAN + W4, 0},
715 {"eg", ARABIC + W4, 0},
716 {"er", AFAR + W4, 0},
717 {"es", SPANISH + W4, 0},
718 {"et", AMHARIC + W4, AFAR + W4},
719
720 {"fi", FINNISH + W4, 0},
721 {"fj", FIJIAN + W4, 0},
722 // {"fk", 0, 0},
723 // {"fm", 0, 0},
724 {"fo", FAROESE + W4, ICELANDIC - W4},
725 {"fr", FRENCH + W4, 0},
726
727 {"ga", FRENCH + W4, 0},
728 {"gd", 0, 0},
729 {"ge", GEORGIAN + W4, 0},
730 {"gf", FRENCH + W4, 0},
731 // {"gg", 0, 0},
732 // {"gh", 0, 0},
733 // {"gi", 0, 0},
734 {"gl", GREENLANDIC + W4, DANISH + W4},
735 // {"gm", 0, 0},
736 {"gn", FRENCH + W4, 0},
737 // {"gp", 0, 0},
738 // {"gq", 0, 0},
739 {"gr", GREEK + W4, 0},
740 // {"gs", 0, 0},
741 {"gt", SPANISH + W4, 0},
742 // {"gu", 0, 0},
743 // {"gy", 0, 0},
744
745 {"hk", CHINESE_T + W4, 0},
746 // {"hm", 0, 0},
747 {"hn", SPANISH + W4, 0},
748 {"hr", CROATIAN + W8, BOSNIAN - W4},
749 {"ht", HAITIAN_CREOLE + W4, FRENCH + W4},
750 {"hu", HUNGARIAN + W4, 0},
751
752 {"id", INDONESIAN + W4, MALAY - W4},
753 {"ie", IRISH + W4, 0},
754 {"il", HEBREW + W4, 0},
755 {"im", MANX + W4, 0},
756 // {"in", 0, 0},
757 // {"io", 0, 0},
758 {"iq", ARABIC + W4, 0},
759 {"ir", PERSIAN + W4, 0},
760 {"is", ICELANDIC + W4, FAROESE - W4},
761 {"it", ITALIAN + W4, 0},
762
763 // {"je", 0, 0},
764 // {"jm", 0, 0},
765 {"jo", ARABIC + W4, 0},
766 {"jp", JAPANESE + W4, 0},
767
768 // {"ke", 0, 0},
769 {"kg", KYRGYZ + W4, 0},
770 {"kh", KHMER + W4, 0},
771 // {"ki", 0, 0},
772 {"km", FRENCH + W4, 0},
773 // {"kn", 0, 0},
774 {"kp", KOREAN + W4, 0},
775 {"kr", KOREAN + W4, 0},
776 {"kw", ARABIC + W4, 0},
777 // {"ky", 0, 0},
778 {"kz", KAZAKH + W4, 0},
779
780 {"la", LAOTHIAN + W4, 0},
781 {"lb", ARABIC + W4, FRENCH + W4},
782 // {"lc", 0, 0},
783 {"li", GERMAN + W4, 0},
784 {"lk", SINHALESE + W4, 0},
785 // {"lr", 0, 0},
786 {"ls", SESOTHO + W4, 0},
787 {"lt", LITHUANIAN + W4, 0},
788 {"lu", LUXEMBOURGISH + W4},
789 {"lv", LATVIAN + W4, 0},
790 {"ly", ARABIC + W4, 0},
791
792 {"ma", FRENCH + W4, 0},
793 {"mc", FRENCH + W4, 0},
794 {"md", ROMANIAN + W4, 0},
795 {"me", MONTENEGRIN + W8, SERBIAN - W4},
796 {"mg", FRENCH + W4, 0},
797 {"mk", MACEDONIAN + W4, 0},
798 {"ml", FRENCH + W4, 0},
799 {"mm", BURMESE + W4, 0},
800 {"mn", MONGOLIAN + W4, 0},
801 {"mo", CHINESE_T + W4, PORTUGUESE + W4},
802 // {"mp", 0, 0},
803 {"mq", FRENCH + W4, 0},
804 {"mr", FRENCH + W4, ARABIC + W4},
805 // {"ms", 0, 0},
806 {"mt", MALTESE + W4, 0},
807 // {"mu", 0, 0},
808 {"mv", DHIVEHI + W4, 0},
809 // {"mw", 0, 0},
810 {"mx", SPANISH + W4, 0},
811 {"my", MALAY + W4, INDONESIAN - W4},
812 {"mz", PORTUGUESE + W4, 0},
813
814 {"na", 0, 0}, // Namibia
815 {"nc", FRENCH + W4, 0},
816 {"ne", FRENCH + W4, 0},
817 {"nf", FRENCH + W4, 0},
818 // {"ng", 0, 0},
819 {"ni", SPANISH + W4, 0},
820 {"nl", DUTCH + W4, 0},
821 {"no", NORWEGIAN + W4, NORWEGIAN_N + W2},
822 {"np", NEPALI + W4, 0},
823 {"nr", NAURU + W4, 0},
824 {"nu", SWEDISH + W4, 0},
825 {"nz", MAORI + W4, ENGLISH + W2},
826
827 {"om", ARABIC + W4, 0},
828
829 {"pa", SPANISH + W4, 0},
830 {"pe", SPANISH + W4, QUECHUA + W2}, // also AYMARA
831 {"pf", FRENCH + W4, 0},
832 // {"pg", 0, 0},
833 {"ph", TAGALOG + W4, 0},
834 {"pk", URDU + W4, 0},
835 {"pl", POLISH + W4, 0},
836 // {"pn", 0, 0},
837 {"pr", SPANISH + W4, 0},
838 {"ps", ARABIC + W4, 0},
839 {"pt", PORTUGUESE + W4, 0},
840 {"py", SPANISH + W4, GUARANI + W2},
841
842 {"qa", ARABIC + W4, 0},
843
844 {"re", FRENCH + W4, 0},
845 {"ro", ROMANIAN + W4, 0},
846 {"rs", SERBIAN + W8, MONTENEGRIN - W4},
847 {"ru", RUSSIAN + W4, 0},
848 {"rw", KINYARWANDA + W4, FRENCH + W2},
849
850 {"sa", ARABIC + W4, 0},
851 // {"sb", 0, 0},
852 {"sc", SESELWA + W4, 0},
853 {"sd", ARABIC + W4, 0},
854 {"se", SWEDISH + W4, 0},
855 // {"sg", 0, 0},
856 // {"sh", 0, 0},
857 {"si", SLOVENIAN + W4, 0},
858 {"sk", SLOVAK + W4, CZECH - W4},
859 // {"sl", 0, 0},
860 {"sm", ITALIAN + W4, 0},
861 {"sn", FRENCH + W4, 0},
862 // {"sr", 0, 0},
863 {"ss", ARABIC + W4, 0}, // Presumed South Sudan TLD. dsites 2011.07.07
864 // {"st", 0, 0},
865 {"su", RUSSIAN + W4, 0},
866 {"sv", SPANISH + W4, 0},
867 {"sy", ARABIC + W4, 0},
868 // {"sz", 0, 0},
869
870 // {"tc", 0, 0},
871 {"td", FRENCH + W4, 0},
872 // {"tf", 0, 0},
873 {"tg", FRENCH + W4, 0},
874 {"th", THAI + W4, 0},
875 // Tibet has no country code (see .cn)
876 {"tj", TAJIK + W4, 0},
877 // {"tk", 0, 0},
878 // {"tl", 0, 0},
879 {"tm", TURKISH + W4, 0},
880 {"tn", FRENCH + W4, ARABIC + W4},
881 // {"to", 0, 0},
882 {"tp", JAPANESE + W4, 0},
883 {"tr", TURKISH + W4, 0},
884 // {"tt", 0, 0},
885 // {"tv", 0, 0},
886 {"tw", CHINESE_T + W4, 0},
887 {"tz", SWAHILI + W4, AKAN + W4},
888
889 {"ua", UKRAINIAN + W4, 0},
890 {"ug", GANDA + W4, 0},
891 {"uk", ENGLISH + W2, 0},
892 {"us", ENGLISH + W2, 0},
893 {"uy", SPANISH + W4, 0},
894 {"uz", UZBEK + W4, 0},
895
896 {"va", ITALIAN + W4, LATIN + W2},
897 // {"vc", 0, 0},
898 {"ve", SPANISH + W4, 0},
899 // {"vg", 0, 0},
900 // {"vi", 0, 0},
901 {"vn", VIETNAMESE + W4, 0},
902 // {"vu", 0, 0},
903
904 {"wf", FRENCH + W4, 0},
905 // {"ws", 0, 0},
906
907 {"ye", ARABIC + W4, 0},
908
909 {"za", AFRIKAANS + W4, 0},
910 // {"zm", 0, 0},
911 // {"zw", 0, 0},
912 };
913
// Retire the W* weight shorthand macros now that the hint table above is
// complete, so the names do not leak into the code that follows.
#undef W2
#undef W4
#undef W6
#undef W8
#undef W10
#undef W12
920
921
922
923
924
// Overwrite the weight field of *olp with w.
// The language field (low 10 bits) is carried over untouched; the weight is
// stored above it, starting at bit 10.
inline void SetCLDPriorWeight(int w, OneCLDLangPrior* olp) {
  const int lang_bits = *olp & 0x3ff;
  const int weight_bits = w << 10;
  *olp = lang_bits + weight_bits;
}
SetCLDPriorLang(Language lang,OneCLDLangPrior * olp)928 inline void SetCLDPriorLang(Language lang, OneCLDLangPrior* olp) {
929 *olp = (*olp & ~0x3ff) + lang;
930 }
931
PackCLDPriorLangWeight(Language lang,int w)932 OneCLDLangPrior PackCLDPriorLangWeight(Language lang, int w) {
933 return (w << 10) + lang;
934 }
935
// Return the larger of two ints (a when they are equal).
inline int MaxInt(int a, int b) {
  if (a < b) {return b;}
  return a;
}
939
940 // Merge in another language prior, taking max if already there
MergeCLDLangPriorsMax(OneCLDLangPrior olp,CLDLangPriors * lps)941 void MergeCLDLangPriorsMax(OneCLDLangPrior olp, CLDLangPriors* lps) {
942 if (olp == 0) {return;}
943 Language target_lang = GetCLDPriorLang(olp);
944 for (int i = 0; i < lps->n; ++i) {
945 if (GetCLDPriorLang(lps->prior[i]) == target_lang) {
946 int new_weight = MaxInt(GetCLDPriorWeight(lps->prior[i]),
947 GetCLDPriorWeight(olp));
948 SetCLDPriorWeight(new_weight, &lps->prior[i]);
949 return;
950 }
951 }
952 // Not found; add it if room
953 if (lps->n >= kMaxOneCLDLangPrior) {return;}
954 lps->prior[lps->n++] = olp;
955 }
956
957 // Merge in another language prior, boosting 10x if already there
MergeCLDLangPriorsBoost(OneCLDLangPrior olp,CLDLangPriors * lps)958 void MergeCLDLangPriorsBoost(OneCLDLangPrior olp, CLDLangPriors* lps) {
959 if (olp == 0) {return;}
960 Language target_lang = GetCLDPriorLang(olp);
961 for (int i = 0; i < lps->n; ++i) {
962 if (GetCLDPriorLang(lps->prior[i]) == target_lang) {
963 int new_weight = GetCLDPriorWeight(lps->prior[i]) + 2;
964 SetCLDPriorWeight(new_weight, &lps->prior[i]);
965 return;
966 }
967 }
968 // Not found; add it if room
969 if (lps->n >= kMaxOneCLDLangPrior) {return;}
970 lps->prior[lps->n++] = olp;
971 }
972
973
974 // Trim language priors to no more than max_entries, keeping largest abs weights
TrimCLDLangPriors(int max_entries,CLDLangPriors * lps)975 void TrimCLDLangPriors(int max_entries, CLDLangPriors* lps) {
976 if (lps->n <= max_entries) {return;}
977
978 // Insertion sort in-place by abs(weight)
979 for (int i = 0; i < lps->n; ++i) {
980 OneCLDLangPrior temp_olp = lps->prior[i];
981 int w = abs(GetCLDPriorWeight(temp_olp));
982 int kk = i;
983 for (; kk > 0; --kk) {
984 if (abs(GetCLDPriorWeight(lps->prior[kk - 1])) < w) {
985 // Move down and continue
986 lps->prior[kk] = lps->prior[kk - 1];
987 } else {
988 // abs(weight[kk - 1]) >= w, time to stop
989 break;
990 }
991 }
992 lps->prior[kk] = temp_olp;
993 }
994
995 lps->n = max_entries;
996 }
997
// Return how many ',' characters occur in langtags.
int CountCommas(const string& langtags) {
  int total = 0;
  // Hop from comma to comma instead of visiting every character
  string::size_type at = langtags.find(',');
  while (at != string::npos) {
    ++total;
    at = langtags.find(',', at + 1);
  }
  return total;
}
1005
1006 // Binary lookup on language tag
DoLangTagLookup(const char * key,const LangTagLookup * tbl,int tbl_size)1007 const LangTagLookup* DoLangTagLookup(const char* key,
1008 const LangTagLookup* tbl, int tbl_size) {
1009 // Key is always in range [lo..hi)
1010 int lo = 0;
1011 int hi = tbl_size;
1012 while (lo < hi) {
1013 int mid = (lo + hi) >> 1;
1014 int comp = strcmp(tbl[mid].langtag, key);
1015 if (comp < 0) {
1016 lo = mid + 1;
1017 } else if (comp > 0) {
1018 hi = mid;
1019 } else {
1020 return &tbl[mid];
1021 }
1022 }
1023 return NULL;
1024 }
1025
1026 // Binary lookup on tld
DoTLDLookup(const char * key,const TLDLookup * tbl,int tbl_size)1027 const TLDLookup* DoTLDLookup(const char* key,
1028 const TLDLookup* tbl, int tbl_size) {
1029 // Key is always in range [lo..hi)
1030 int lo = 0;
1031 int hi = tbl_size;
1032 while (lo < hi) {
1033 int mid = (lo + hi) >> 1;
1034 int comp = strcmp(tbl[mid].tld, key);
1035 if (comp < 0) {
1036 lo = mid + 1;
1037 } else if (comp > 0) {
1038 hi = mid;
1039 } else {
1040 return &tbl[mid];
1041 }
1042 }
1043 return NULL;
1044 }
1045
1046
1047
1048 // Trim language tag string to canonical form for each language
1049 // Input is from GetLangTagsFromHtml(), already lowercased
TrimCLDLangTagsHint(const string & langtags)1050 string TrimCLDLangTagsHint(const string& langtags) {
1051 string retval;
1052 if (langtags.empty()) {return retval;}
1053 int commas = CountCommas(langtags);
1054 if (commas > 4) {return retval;} // Ignore if too many language tags
1055
1056 char temp[20];
1057 int pos = 0;
1058 while (pos < static_cast<int>(langtags.size())) {
1059 int comma = langtags.find(',', pos);
1060 if (comma == string::npos) {comma = langtags.size();} // fake trailing comma
1061 int len = comma - pos;
1062 if (len <= 16) {
1063 // Short enough to use
1064 memcpy(temp, &langtags[pos], len);
1065 temp[len] = '\0';
1066 const LangTagLookup* entry = DoLangTagLookup(temp,
1067 kCLDLangTagsHintTable1,
1068 kCLDTable1Size);
1069 if (entry != NULL) {
1070 // First table hit
1071 retval.append(entry->langcode); // may be "code1,code2"
1072 retval.append(1, ',');
1073 } else {
1074 // Try second table with language code truncated at first hyphen
1075 char* hyphen = strchr(temp, '-');
1076 if (hyphen != NULL) {*hyphen = '\0';}
1077 len = strlen(temp);
1078 if (len <= 3) { // Short enough to use
1079 entry = DoLangTagLookup(temp,
1080 kCLDLangTagsHintTable2,
1081 kCLDTable2Size);
1082 if (entry != NULL) {
1083 // Second table hit
1084 retval.append(entry->langcode); // may be "code1,code2"
1085 retval.append(1, ',');
1086 }
1087 }
1088 }
1089 }
1090 pos = comma + 1;
1091 }
1092
1093 // Remove trainling comma, if any
1094 if (!retval.empty()) {retval.resize(retval.size() - 1);}
1095 return retval;
1096 }
1097
1098
1099
1100 //==============================================================================
1101
1102 // Little state machine to scan insides of language attribute quoted-string.
1103 // Each language code is lowercased and copied to the output string. Underscore
1104 // is mapped to minus. Space, tab, and comma are all mapped to comma, and
1105 // multiple consecutive commas are removed.
1106 // Each language code in the output list will be followed by a single comma.
1107
1108 // There are three states, and we start in state 1:
1109 // State 0: After a letter.
1110 // Copy all letters/minus[0], copy comma[1]; all others copy comma and skip [2]
1111 // State 1: Just after a comma.
//   Copy letter [0], ignore subsequent commas [1]; minus and all others copy comma and skip [2]
1113 // State 2: Skipping.
1114 // All characters except comma skip and stay in [2]. comma goes to [1]
1115
1116 // The thing that is copied is kLangCodeRemap[c] when going to state 0,
1117 // and always comma when going to state 1 or 2. The design depends on copying
1118 // a comma at the *beginning* of skipping, and in state 2 never doing a copy.
1119
1120 // We pack all this into 8 bits:
1121 // +--+---+---+
1122 // |78|654|321|
1123 // +--+---+---+
1124 //
1125 // Shift byte right by 3*state, giving [0] 321, [1] 654, [2] .78
1126 // where . is always zero
1127 // Of these 3 bits, low two are next state ss, high bit is copy bit C.
1128 // If C=1 and ss == 0, copy kLangCodeRemap[c], else copy a comma
1129
// Single-state action codes. The low two bits give the next state (0, 1 or 2)
// and the value-4 bit is the copy flag: SKIPn moves to state n without
// emitting anything, COPYn emits one character and moves to state n.
#define SKIP0 0
#define SKIP1 1
#define SKIP2 2
#define COPY0 4 // copy kLangCodeRemap[c]
#define COPY1 5 // copy ','
#define COPY2 6 // copy ','

// These combined actions pack three states into one byte.
// Ninth bit must be zero, so all state 2 values must be skips.
// Each macro is the action for one character class, with the state-0 action
// in the low three bits, the state-1 action shifted left by 3, and the
// state-2 action shifted left by 6.
//             state[2]       state[1]       state[0]
#define LTR ((SKIP2 << 6) + (COPY0 << 3) + COPY0)
#define MINUS ((SKIP2 << 6) + (COPY2 << 3) + COPY0)
#define COMMA ((SKIP1 << 6) + (SKIP1 << 3) + COPY1)
#define Bad ((SKIP2 << 6) + (COPY2 << 3) + COPY2)
1144
// Packed state-machine action for every possible byte value.
// Treat as letter: a-z, A-Z
// Treat as minus: 2D minus, 5F underscore
// Treat as comma: 09 tab, 20 space, 2C comma
// Every other byte, including digits and all bytes 80..FF, is Bad.
// Indexed by byte value; there are sixteen entries per row, so row r
// (counting from zero) covers bytes r*16 .. r*16+15.

static const unsigned char kLangCodeAction[256] = {
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,COMMA,Bad,Bad,Bad,Bad,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  COMMA,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,COMMA,MINUS,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,

  Bad,LTR,LTR,LTR,LTR,LTR,LTR,LTR, LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR,
  LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR, LTR,LTR,LTR,Bad,Bad,Bad,Bad,MINUS,
  Bad,LTR,LTR,LTR,LTR,LTR,LTR,LTR, LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR,
  LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR, LTR,LTR,LTR,Bad,Bad,Bad,Bad,Bad,

  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,

  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
};
1170
// Output character for every possible byte value.
// This does lowercasing, maps underscore to minus, and maps tab/space to comma
// Bytes that have no output mapping hold 0.
// Indexed by byte value; sixteen entries per row, laid out the same way as
// kLangCodeAction.
static const unsigned char kLangCodeRemap[256] = {
  0,0,0,0,0,0,0,0, 0,',',0,0,0,0,0,0, // 09 tab
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  ',',0,0,0,0,0,0,0, 0,0,0,0,',','-',0,0, // 20 space 2C comma 2D minus
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

  0,'a','b','c','d','e','f','g', 'h','i','j','k','l','m','n','o',
  'p','q','r','s','t','u','v','w', 'x','y','z',0,0,0,0,'-', // 5F underscore
  0,'a','b','c','d','e','f','g', 'h','i','j','k','l','m','n','o',
  'p','q','r','s','t','u','v','w', 'x','y','z',0,0,0,0,0,

  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
};
1193
// The character-class and per-state action macros were only needed to spell
// out the tables; remove them so the short names do not leak further.
#undef LTR
#undef MINUS
#undef COMMA
#undef Bad

#undef SKIP0
#undef SKIP1
#undef SKIP2
#undef COPY0
#undef COPY1
#undef COPY2
1205
1206
1207 // Find opening '<' for HTML tag
1208 // Note: this is all somewhat insensitive to mismatched quotes
FindTagStart(const char * utf8_body,int32 pos,int32 max_pos)1209 int32 FindTagStart(const char* utf8_body, int32 pos, int32 max_pos) {
1210 int i = pos;
1211 // Advance i by 4 if none of the next 4 bytes are '<'
1212 for (i = pos; i < (max_pos - 3); i += 4) {
1213 // Fast check for any <
1214 const char* p = &utf8_body[i];
1215 uint32 s0123 = UNALIGNED_LOAD32(p);
1216 uint32 temp = s0123 ^ 0x3c3c3c3c; // <<<<
1217 if (((temp - 0x01010101) & (~temp & 0x80808080)) != 0) {
1218 // At least one byte is '<'
1219 break;
1220 }
1221 }
1222 // Continue, advancing i by 1
1223 for (; i < max_pos; ++i) {
1224 if (utf8_body[i] == '<') {return i;}
1225 }
1226 return -1;
1227 }
1228
1229
// Find closing '>' for HTML tag, scanning utf8_body[pos..max_pos).
// Returns the index of the '>' if that comes first. If a '<' or '&' comes
// first (simplistic parsing), returns the index just before it. Returns -1
// if none of the three characters is present.
int32 FindTagEnd(const char* utf8_body, int32 pos, int32 max_pos) {
  // Always outside quotes
  for (int at = pos; at < max_pos; ++at) {
    switch (utf8_body[at]) {
      case '>':
        return at;
      case '<':
      case '&':
        return at - 1;
      default:
        break;
    }
  }
  return -1;      // nothing found
}
1241
// Find opening quote or apostrophe in utf8_body[pos..max_pos), skipping
// leading spaces. Returns its index, or -1 if any other character is met
// first or the range runs out.
// Note: this is all somewhat insensitive to mismatched quotes
int32 FindQuoteStart(const char* utf8_body, int32 pos, int32 max_pos) {
  int at = pos;
  while ((at < max_pos) && (utf8_body[at] == ' ')) {++at;}
  if (at >= max_pos) {return -1;}

  const char first = utf8_body[at];
  if ((first == '"') || (first == '\'')) {return at;}
  return -1;
}
1253
// Find closing quot/apos in utf8_body[pos..max_pos).
// Returns the index of the first quote or apostrophe. If one of = > < &
// comes first (simplistic parsing), returns the index just before it.
// Returns -1 if none of these characters is present.
int32 FindQuoteEnd(const char* utf8_body, int32 pos, int32 max_pos) {
  // Always outside quotes
  for (int at = pos; at < max_pos; ++at) {
    switch (utf8_body[at]) {
      case '"':
      case '\'':
        return at;
      case '>':
      case '=':
      case '<':
      case '&':
        return at - 1;
      default:
        break;
    }
  }
  return -1;      // nothing found
}
1268
// Find the first '=' in utf8_body[pos..max_pos) that is not inside a
// quoted ("...") or apostrophe ('...') string. Inside such a string a
// backslash causes the following byte to be skipped. Returns the index of
// the '=', or -1 if there is none.
int32 FindEqualSign(const char* utf8_body, int32 pos, int32 max_pos) {
  for (int at = pos; at < max_pos; ++at) {
    const char c = utf8_body[at];
    if (c == '=') {return at;}        // Found bare equal sign inside tag
    if ((c != '"') && (c != '\'')) {continue;}

    // Opening delimiter: run forward to the matching closer (same character)
    const char closer = c;
    int inner = at + 1;
    while (inner < max_pos) {
      const char q = utf8_body[inner];
      if (q == closer) {break;}
      if (q == '\\') {++inner;}       // Step over the escaped byte as well
      ++inner;
    }
    // Resume after the closer, or fall out of the loop if unterminated
    at = inner;
  }
  return -1;      // nothing found
}
1302
// Test whether utf8_body[min_pos..pos) ends with string s, ignoring case and
// any run of spaces immediately before pos.
// Bytes of s must already be lowercase, i.e. in [20..3f] or [60..7f]
// Cheap lowercase (OR with 0x20). Control codes will masquerade as 20..3f
bool FindBefore(const char* utf8_body,
                int32 min_pos, int32 pos, const char* s) {
  const int len = strlen(s);
  if ((pos - min_pos) < len) {return false;}    // Too small to fit s

  // Back up over trailing spaces, but never so far that s could not fit
  const int lowest_end = min_pos + len;
  int end = pos;
  while ((end > lowest_end) && (utf8_body[end - 1] == ' ')) {--end;}

  const int begin = end - len;
  if (begin < min_pos) {return false;}          // s can't be found

  // Byte-by-byte compare of the candidate against s
  const char* candidate = utf8_body + begin;
  int matched = 0;
  while ((matched < len) && ((candidate[matched] | 0x20) == s[matched])) {
    ++matched;
  }
  return matched == len;
}
1323
// Test whether case-insensitive string s starts the text in [pos..max_pos),
// after skipping any leading spaces, quotes and apostrophes.
// Bytes of s must already be lowercase, i.e. in [20..3f] or [60..7f]
// Cheap lowercase (OR with 0x20). Control codes will masquerade as 20..3f
// Allows but does not require quoted/apostrophe string: when s ends in a
// space (used by callers as an end-of-word marker), a closing quote or
// apostrophe in the text is accepted in that final position too, so that
//   http-equiv="content-language"
// matches "content-language " just as the unquoted
//   http-equiv=content-language content=...
// does. Returns true on a match, false otherwise.
bool FindAfter(const char* utf8_body,
               int32 pos, int32 max_pos, const char* s) {
  int len = strlen(s);
  if ((max_pos - pos) < len) {return false;}    // Too small to fit s

  // Skip leading spaces, quote, apostrophe, while s still fits afterwards
  int i = pos;
  while (i < (max_pos - len)) {
    unsigned char c = utf8_body[i];
    if ((c == ' ') || (c == '"') || (c == '\'')) {++i;}
    else {break;}
  }

  const char* p = &utf8_body[i];
  for (int j = 0; j < len; ++j) {
    if ((p[j] | 0x20) == s[j]) {continue;}      // Equal byte

    // A quote ORed with 0x20 is still a quote, never a space, so the
    // terminating space of s needs an explicit allowance for a closing
    // quote/apostrophe.
    const bool at_terminator = (j == (len - 1)) && (s[j] == ' ');
    if (at_terminator && ((p[j] == '"') || (p[j] == '\''))) {continue;}

    return false;                               // Unequal byte
  }
  return true;                                  // All bytes equal
}
1347
1348
1349
1350 // Copy attribute value in [pos..max_pos)
1351 // pos is just after an opening quote/apostrophe and max_pos is the ending one
1352 // String must all be on a single line.
1353 // Return slightly-normalized language list, empty or ending in comma
1354 // Does lowercasing and removes excess punctuation/space
CopyOneQuotedString(const char * utf8_body,int32 pos,int32 max_pos)1355 string CopyOneQuotedString(const char* utf8_body,
1356 int32 pos, int32 max_pos) {
1357 string s;
1358 int state = 1; // Front is logically just after a comma
1359 for (int i = pos; i < max_pos; ++i) {
1360 unsigned char c = utf8_body[i];
1361 int e = kLangCodeAction[c] >> (3 * state);
1362 state = e & 3; // Update to next state
1363 if ((e & 4) != 0) {
1364 // Copy a remapped byte if going to state 0, else copy a comma
1365 if (state == 0) {
1366 s.append(1, kLangCodeRemap[c]);
1367 } else {
1368 s.append(1, ',');
1369 }
1370 }
1371 }
1372
1373 // Add final comma if needed
1374 if (state == 0) {
1375 s.append(1, ',');
1376 }
1377 return s;
1378 }
1379
1380 // Find and copy attribute value: quoted string in [pos..max_pos)
1381 // Return slightly-normalized language list, empty or ending in comma
CopyQuotedString(const char * utf8_body,int32 pos,int32 max_pos)1382 string CopyQuotedString(const char* utf8_body,
1383 int32 pos, int32 max_pos) {
1384 int32 start_quote = FindQuoteStart(utf8_body, pos, max_pos);
1385 if (start_quote < 0) {return string("");}
1386 int32 end_quote = FindQuoteEnd(utf8_body, start_quote + 1, max_pos);
1387 if (end_quote < 0) {return string("");}
1388
1389 return CopyOneQuotedString(utf8_body, start_quote + 1, end_quote);
1390 }
1391
1392 // Add hints to vector of langpriors
1393 // Input is from GetLangTagsFromHtml(), already lowercased
SetCLDLangTagsHint(const string & langtags,CLDLangPriors * langpriors)1394 void SetCLDLangTagsHint(const string& langtags, CLDLangPriors* langpriors) {
1395 if (langtags.empty()) {return;}
1396 int commas = CountCommas(langtags);
1397 if (commas > 4) {return;} // Ignore if too many language tags
1398
1399 char temp[20];
1400 int pos = 0;
1401 while (pos < static_cast<int>(langtags.size())) {
1402 int comma = langtags.find(',', pos);
1403 if (comma == string::npos) {comma = langtags.size();} // fake trailing comma
1404 int len = comma - pos;
1405 if (len <= 16) {
1406 // Short enough to use
1407 memcpy(temp, &langtags[pos], len);
1408 temp[len] = '\0';
1409 const LangTagLookup* entry = DoLangTagLookup(temp,
1410 kCLDLangTagsHintTable1,
1411 kCLDTable1Size);
1412 if (entry != NULL) {
1413 // First table hit
1414 MergeCLDLangPriorsMax(entry->onelangprior1, langpriors);
1415 MergeCLDLangPriorsMax(entry->onelangprior2, langpriors);
1416 } else {
1417 // Try second table with language code truncated at first hyphen
1418 char* hyphen = strchr(temp, '-');
1419 if (hyphen != NULL) {*hyphen = '\0';}
1420 len = strlen(temp);
1421 if (len <= 3) { // Short enough to use
1422 entry = DoLangTagLookup(temp,
1423 kCLDLangTagsHintTable2,
1424 kCLDTable2Size);
1425 if (entry != NULL) {
1426 // Second table hit
1427 MergeCLDLangPriorsMax(entry->onelangprior1, langpriors);
1428 MergeCLDLangPriorsMax(entry->onelangprior2, langpriors);
1429 }
1430 }
1431 }
1432 }
1433 pos = comma + 1;
1434 }
1435 }
1436
1437 // Add hints to vector of langpriors
1438 // Input is string after HTTP header Content-Language:
SetCLDContentLangHint(const char * contentlang,CLDLangPriors * langpriors)1439 void SetCLDContentLangHint(const char* contentlang, CLDLangPriors* langpriors) {
1440 string langtags = CopyOneQuotedString(contentlang, 0, strlen(contentlang));
1441 SetCLDLangTagsHint(langtags, langpriors);
1442 }
1443
1444 // Add hints to vector of langpriors
1445 // Input is last element of hostname (no dot), e.g. from GetTLD()
SetCLDTLDHint(const char * tld,CLDLangPriors * langpriors)1446 void SetCLDTLDHint(const char* tld, CLDLangPriors* langpriors) {
1447 int len = strlen(tld);
1448 if (len > 3) {return;} // Ignore if more than three letters
1449 char local_tld[4];
1450 strncpy(local_tld, tld, 4);
1451 local_tld[3] = '\0'; // Safety move
1452 // Lowercase
1453 for (int i = 0; i < len; ++i) {local_tld[i] |= 0x20;}
1454 const TLDLookup* entry = DoTLDLookup(local_tld,
1455 kCLDTLDHintTable,
1456 kCLDTable3Size);
1457 if (entry != NULL) {
1458 // Table hit
1459 MergeCLDLangPriorsBoost(entry->onelangprior1, langpriors);
1460 MergeCLDLangPriorsBoost(entry->onelangprior2, langpriors);
1461 }
1462 }
1463
1464 // Add hints to vector of langpriors
1465 // Input is from DetectEncoding()
SetCLDEncodingHint(Encoding enc,CLDLangPriors * langpriors)1466 void SetCLDEncodingHint(Encoding enc, CLDLangPriors* langpriors) {
1467 OneCLDLangPrior olp;
1468 switch (enc) {
1469 case CHINESE_GB:
1470 case GBK:
1471 case GB18030:
1472 case ISO_2022_CN:
1473 case HZ_GB_2312:
1474 olp = PackCLDPriorLangWeight(CHINESE, kCLDPriorEncodingWeight);
1475 MergeCLDLangPriorsBoost(olp, langpriors);
1476 break;
1477 case CHINESE_BIG5:
1478 case CHINESE_BIG5_CP950:
1479 case BIG5_HKSCS:
1480 olp = PackCLDPriorLangWeight(CHINESE_T, kCLDPriorEncodingWeight);
1481 MergeCLDLangPriorsBoost(olp, langpriors);
1482 break;
1483 case JAPANESE_EUC_JP:
1484 case JAPANESE_SHIFT_JIS:
1485 case JAPANESE_CP932:
1486 case JAPANESE_JIS: // ISO-2022-JP
1487 olp = PackCLDPriorLangWeight(JAPANESE, kCLDPriorEncodingWeight);
1488 MergeCLDLangPriorsBoost(olp, langpriors);
1489 break;
1490 case KOREAN_EUC_KR:
1491 case ISO_2022_KR:
1492 olp = PackCLDPriorLangWeight(KOREAN, kCLDPriorEncodingWeight);
1493 MergeCLDLangPriorsBoost(olp, langpriors);
1494 break;
1495
1496 default:
1497 break;
1498 }
1499 }
1500
1501 // Add hints to vector of langpriors
1502 // Input is from random source
SetCLDLanguageHint(Language lang,CLDLangPriors * langpriors)1503 void SetCLDLanguageHint(Language lang, CLDLangPriors* langpriors) {
1504 OneCLDLangPrior olp = PackCLDPriorLangWeight(lang, kCLDPriorLanguageWeight);
1505 MergeCLDLangPriorsBoost(olp, langpriors);
1506 }
1507
1508
1509 // Make printable string of priors
DumpCLDLangPriors(const CLDLangPriors * langpriors)1510 string DumpCLDLangPriors(const CLDLangPriors* langpriors) {
1511 string retval;
1512 for (int i = 0; i < langpriors->n; ++i) {
1513 char temp[64];
1514 sprintf(temp, "%s.%d ",
1515 LanguageCode(GetCLDPriorLang(langpriors->prior[i])),
1516 GetCLDPriorWeight(langpriors->prior[i]));
1517 retval.append(temp);
1518 }
1519 return retval;
1520 }
1521
1522
1523
1524
1525 // Look for
1526 // <html lang="en">
1527 // <doc xml:lang="en">
1528 // <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en-US">
1529 // <meta http-equiv="content-language" content="en-GB" />
1530 // <meta name="language" content="Srpski">
//   <meta name="DC.language" scheme="RFC1766" content="en">
1532 // <SPAN id="msg1" class="info" lang='en'>
1533 //
1534 // Do not trigger on
1535 // <!-- lang=french ...-->
1536 // <font lang=postscript ...>
1537 // <link href="index.fr.html" hreflang="fr-FR" xml:lang="fr-FR" />
1538 // <META name="Author" lang="fr" content="Arnaud Le Hors">
1539 //
1540 // Stop fairly quickly on mismatched quotes
1541 //
1542 // Allowed language characters
1543 // a-z A-Z -_ , space\t
1544 // Think about: GB2312, big5, shift-jis, euc-jp, ksc euc-kr
1545 // zh-hans zh-TW cmn-Hani zh_cn.gb18030_CN zh-min-nan zh-yue
1546 // de-x-mtfrom-en zh-tw-x-mtfrom-en (machine translation)
1547 // GB2312 => gb
1548 // Big5 => big
1549 // zh_CN.gb18030_C => zh-cn
1550 //
1551 // Remove duplicates and extra spaces as we go
1552 // Lowercase as we go.
1553
1554 // Get language tag hints from HTML body
1555 // Normalize: remove spaces and make lowercase comma list
1556
GetLangTagsFromHtml(const char * utf8_body,int32 utf8_body_len,int32 max_scan_bytes)1557 string GetLangTagsFromHtml(const char* utf8_body, int32 utf8_body_len,
1558 int32 max_scan_bytes) {
1559 string retval;
1560 if (max_scan_bytes > utf8_body_len) {
1561 max_scan_bytes = utf8_body_len;
1562 }
1563
1564 int32 k = 0;
1565 while (k < max_scan_bytes) {
1566 int32 start_tag = FindTagStart(utf8_body, k, max_scan_bytes);
1567 if (start_tag < 0) {break;}
1568 int32 end_tag = FindTagEnd(utf8_body, start_tag + 1, max_scan_bytes);
1569 // FindTagEnd exits on < > &
1570 if (end_tag < 0) {break;}
1571
1572 // Skip <!--...>
1573 // Skip <font ...>
1574 // Skip <script ...>
1575 // Skip <link ...>
1576 // Skip <img ...>
1577 // Skip <a ...>
1578 if (FindAfter(utf8_body, start_tag + 1, end_tag, "!--") ||
1579 FindAfter(utf8_body, start_tag + 1, end_tag, "font ") ||
1580 FindAfter(utf8_body, start_tag + 1, end_tag, "script ") ||
1581 FindAfter(utf8_body, start_tag + 1, end_tag, "link ") ||
1582 FindAfter(utf8_body, start_tag + 1, end_tag, "img ") ||
1583 FindAfter(utf8_body, start_tag + 1, end_tag, "a ")) {
1584 k = end_tag + 1;
1585 continue;
1586 }
1587
1588 // Remember <meta ...>
1589 bool in_meta = false;
1590 if (FindAfter(utf8_body, start_tag + 1, end_tag, "meta ")) {
1591 in_meta = true;
1592 }
1593
1594 // Scan for each equal sign inside tag
1595 bool content_is_lang = false;
1596 int32 kk = start_tag + 1;
1597 int32 equal_sign;
1598 while ((equal_sign = FindEqualSign(utf8_body, kk, end_tag)) >= 0) {
1599 // eq exits on < > &
1600
1601 // Look inside a meta tag
1602 // <meta ... http-equiv="content-language" ...>
1603 // <meta ... name="language" ...>
1604 // <meta ... name="dc.language" ...>
1605 if (in_meta) {
1606 if (FindBefore(utf8_body, kk, equal_sign, " http-equiv") &&
1607 FindAfter(utf8_body, equal_sign + 1, end_tag,
1608 "content-language ")) {
1609 content_is_lang = true;
1610 } else if (FindBefore(utf8_body, kk, equal_sign, " name") &&
1611 (FindAfter(utf8_body, equal_sign + 1, end_tag,
1612 "dc.language ") ||
1613 FindAfter(utf8_body, equal_sign + 1, end_tag,
1614 "language "))) {
1615 content_is_lang = true;
1616 }
1617 }
1618
1619 // Look inside any tag
1620 // <meta ... content="lang-list" ...>
1621 // <... lang="lang-list" ...>
1622 // <... xml:lang="lang-list" ...>
1623 if ((content_is_lang && FindBefore(utf8_body, kk, equal_sign,
1624 " content")) ||
1625 FindBefore(utf8_body, kk, equal_sign, " lang") ||
1626 FindBefore(utf8_body, kk, equal_sign, ":lang")) {
1627 string temp = CopyQuotedString(utf8_body, equal_sign + 1, end_tag);
1628
1629 // Append new lang tag(s) if not a duplicate
1630 if (!temp.empty() && (retval.find(temp) == string::npos)) {
1631 retval.append(temp);
1632 }
1633 }
1634
1635 kk = equal_sign + 1;
1636 }
1637 k = end_tag + 1;
1638 }
1639
1640 // Strip last comma
1641 if (retval.size() > 1) {
1642 retval.erase(retval.size() - 1);
1643 }
1644 return retval;
1645 }
1646
1647 } // End namespace CLD2
1648
1649 //==============================================================================
1650