1 // Copyright 2013 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 //
16 // Author: dsites@google.com (Dick Sites)
17 //
18 
19 #include "compact_lang_det_hint_code.h"
20 
21 #include <stdlib.h>     // for abs()
22 #include <stdio.h>      // for sprintf()
23 #include <string.h>     //
24 #include "lang_script.h"
25 #include "port.h"
26 
27 using namespace std;
28 
29 namespace CLD2 {
30 
// Prior weights for hints. Per the W2..W12 notes further down, each weight
// step is roughly a factor of 3, so the "x more likely" figures are
// approximate.
// NOTE(review): names suggest these are the default weights for encoding- and
// language-derived hints -- confirm against the code that uses them.

// Weight 4: about 100x more likely.
static const int kCLDPriorEncodingWeight = 4;

// Weight 8: about 10000x more likely.
static const int kCLDPriorLanguageWeight = 8;
33 
34 
35 // Tables to map lang="..." language code lists to actual languages.
36 // based on scraping and hand-edits, dsites June 2011
37 
38 // n = f(string, &a) gives list of n<=4 language pairs: primary, secondary
39 
40 // For close pairs like ms/id, more weight on TLD and lang=
41 // Alternately, weaker boost but mark others of set as negative;
42 // makes "neither" an easier result.
43 // lang=en low weight 4
// tld=lu boost lu maybe 4, but lang= always overcomes tld and encoding
45 // (except maybe en)
46 
47 // TLD to separate, e.g., burundi from rwanda
48 
49 // Encoding lookup: OneLangProb array
50 // TLD lookup:   tld OneLangProb pairs
51 
52 
53 typedef struct {
54   const char* const langtag;    // Lowercased, hyphen only lookup key
55   const char* const langcode;   // Canonical language codes; two if ambiguous
56   OneCLDLangPrior onelangprior1;
57   OneCLDLangPrior onelangprior2;
58 } LangTagLookup;
59 
60 typedef struct {
61   const char* const tld;        // Lowercased, hyphen only lookup key
62   OneCLDLangPrior onelangprior1;
63   OneCLDLangPrior onelangprior2;
64 } TLDLookup;
65 
66 
// Prior weights, pre-shifted left by 10 bits so that the hint tables can write
// "LANGUAGE + Wn" to boost a language or "LANGUAGE - Wn" to demote it.
// Each weight step is about a factor of 3, so the multipliers below are
// rounded approximations (3**2 = 9, 3**4 = 81, 3**6 = 729, ...).
#define W2  (2 << 10)    // 3**2  ~ 10x more likely
#define W4  (4 << 10)    // 3**4  ~ 100x more likely
#define W6  (6 << 10)    // 3**6  ~ 1000x more likely
#define W8  (8 << 10)    // 3**8  ~ 10K x more likely
#define W10 (10 << 10)   // 3**10 ~ 100K x more likely
#define W12 (12 << 10)   // 3**12 ~ 1M x more likely
73 
74 // TODO: more about ba hr sr sr-ME and sl
75 // Temporary state of affairs:
76 //   BOSNIAN CROATIAN MONTENEGRIN SERBIAN detecting just CROATIAN SERBIAN
77 // Eventually, we want to do all four, but it requires a CLD change to handle
78 // up to six languages per quadgram.
79 
80 
81 // Close pairs boost one of pair, demote other.
82 //   Statistically close pairs:
83 //   INDONESIAN/MALAY difficult to distinguish -- extra word-based lookups used
84 //
85 //   INDONESIAN MALAY coef=0.4698        Problematic w/o extra words
86 //   TIBETAN DZONGKHA coef=0.4571
87 //   CZECH SLOVAK coef=0.4273
88 //   NORWEGIAN NORWEGIAN_N coef=0.4182
89 //
90 //   HINDI MARATHI coef=0.3795
91 //   ZULU XHOSA coef=0.3716
92 //
93 //   DANISH NORWEGIAN coef=0.3672        Usually OK
94 //   BIHARI HINDI coef=0.3668            Usually OK
95 //   ICELANDIC FAROESE coef=0.3519       Usually OK
96 
97 //
98 // Table to look up lang= tags longer than three characters
99 // Overrides table below, which is truncated at first hyphen
100 // In alphabetical order for binary search
101 static const int kCLDTable1Size = 213;
102 static const LangTagLookup kCLDLangTagsHintTable1[kCLDTable1Size] = {
103   {"abkhazian", "ab", ABKHAZIAN + W10, 0},
104   {"afar", "aa", AFAR + W10, 0},
105   {"afrikaans", "af", AFRIKAANS + W10, 0},
106   {"akan", "ak", AKAN + W10, 0},
107   {"albanian", "sq", ALBANIAN + W10, 0},
108   {"am-am", "hy", ARMENIAN + W10, 0},        // 1:2 Armenian, not ambiguous
109   {"amharic", "am", AMHARIC + W10, 0},
110   {"arabic", "ar", ARABIC + W10, 0},
111   {"argentina", "es", SPANISH + W10, 0},
112   {"armenian", "hy", ARMENIAN + W10, 0},
113   {"assamese", "as", ASSAMESE + W10, 0},
114   {"aymara", "ay", AYMARA + W10, 0},
115   {"azerbaijani", "az", AZERBAIJANI + W10, 0},
116 
117   {"bangla", "bn", BENGALI + W10, 0},
118   {"bashkir", "ba", BASHKIR + W10, 0},
119   {"basque", "eu", BASQUE + W10, 0},
120   {"belarusian", "be", BELARUSIAN + W10, 0},
121   {"bengali", "bn", BENGALI + W10, 0},
122   {"bihari", "bh", BIHARI + W10, HINDI - W4},
123   {"bislama", "bi", BISLAMA + W10, 0},
124   {"bosnian", "bs", BOSNIAN + W10, 0},      // Bosnian => Bosnian
125   {"br-br", "pt", PORTUGUESE + W10, 0},     // 1:2 Portuguese, not ambiguous
126   {"br-fr", "br", BRETON + W10, 0},         // 1:2 Breton, not ambiguous
127   {"breton", "br", BRETON + W10, 0},
128   {"bulgarian", "bg", BULGARIAN + W10, 0},
129   {"burmese", "my", BURMESE + W10, 0},      // Myanmar
130 
131   {"catalan", "ca", CATALAN + W10, 0},
132   {"cherokee", "chr", CHEROKEE + W10, 0},
133   {"chichewa", "ny", NYANJA + W10, 0},
134 
135   {"chinese", "zh", CHINESE + W10, 0},
136   {"chinese-t", "zhT", CHINESE_T + W10, 0},
137   {"chineset", "zhT", CHINESE_T + W10, 0},
138   {"corsican", "co", CORSICAN + W10, 0},
139   {"cpf-hat", "ht", HAITIAN_CREOLE + W10, 0}, // Creole, French-based
140   {"croatian", "hr", CROATIAN + W10, 0},
141   {"czech", "cs", CZECH + W10, SLOVAK - W4},
142 
143   {"danish", "da", DANISH + W10, NORWEGIAN - W4},
144   {"deutsch", "de", GERMAN + W10, 0},
145   {"dhivehi", "dv", DHIVEHI + W10, 0},
146   {"dutch", "nl", DUTCH + W10, 0},
147   {"dzongkha", "dz", DZONGKHA + W10,  TIBETAN - W4},
148 
149   {"ell-gr", "el", GREEK + W10, 0},
150   {"english", "en", ENGLISH + W4, 0},
151   {"esperanto", "eo", ESPERANTO + W10, 0},
152   {"estonian", "et", ESTONIAN + W10, 0},
153   {"euc-jp", "ja", JAPANESE + W10, 0},       // Japanese encoding
154   {"euc-kr", "ko", KOREAN + W10, 0},         // Korean encoding
155 
156   {"faroese", "fo", FAROESE + W10, ICELANDIC - W4},
157   {"fijian", "fj", FIJIAN + W10, 0},
158   {"finnish", "fi", FINNISH + W10, 0},
159   {"fran", "fr", FRENCH + W10, 0},            // Truncated at non-ASCII
160   {"francais", "fr", FRENCH + W10, 0},
161   {"french", "fr", FRENCH + W10, 0},
162   {"frisian", "fy", FRISIAN + W10, 0},
163 
164   {"ga-es", "gl", GALICIAN + W10, 0},         // 1:2 Galician, not ambiguous
165   {"galician", "gl", GALICIAN + W10, 0},
166   {"ganda", "lg", GANDA + W10, 0},
167   {"georgian", "ka", GEORGIAN + W10, 0},
168   {"german", "de", GERMAN + W10, 0},
169   {"greek", "el", GREEK + W10, 0},
170   {"greenlandic", "kl", GREENLANDIC + W10, 0},
171   {"guarani", "gn", GUARANI + W10, 0},
172   {"gujarati", "gu", GUJARATI + W10, 0},
173 
174   {"haitian_creole", "ht", HAITIAN_CREOLE + W10, 0},
175   {"hausa", "ha", HAUSA + W10, 0},
176   {"hawaiian", "haw", HAWAIIAN + W10, 0},
177   {"hebrew", "he", HEBREW + W10, 0},
178   {"hindi", "hi", HINDI + W10, MARATHI - W4},
179   {"hn-in", "hi", HINDI + W10, MARATHI - W4},
180   {"hungarian", "hu", HUNGARIAN + W10, 0},
181 
182   {"icelandic", "is", ICELANDIC + W10, FAROESE - W4},
183   {"igbo", "ig", IGBO + W10, 0},
184   {"indonesian", "id", INDONESIAN + W10, MALAY - W4},
185   {"interlingua", "ia", INTERLINGUA + W10, 0},
186   {"interlingue", "ie", INTERLINGUE + W10, 0},
187   // 1:2 iu-Cans ik-Latn
188   {"inuktitut", "iu,ik", INUKTITUT + W10, INUPIAK + W10}, // 1:2
189   {"inupiak", "ik,iu", INUPIAK + W10, INUKTITUT + W10},   // 1:2
190   {"ir-ie", "ga", IRISH + W10, 0},          // Irish
191   {"irish", "ga", IRISH + W10, 0},
192   {"italian", "it", ITALIAN + W10, 0},
193 
194   {"ja-euc", "ja", JAPANESE + W10, 0},      // Japanese encoding
195   {"jan-jp", "ja", JAPANESE + W10, 0},      // Japanese encoding
196   {"japanese", "ja", JAPANESE + W10, 0},
197   {"javanese", "jw", JAVANESE + W10, 0},
198 
199   {"kannada", "kn", KANNADA + W10, 0},
200   {"kashmiri", "ks", KASHMIRI + W10, 0},
201   {"kazakh", "kk", KAZAKH + W10, 0},
202   {"khasi", "kha", KHASI + W10, 0},
203   {"khmer", "km", KHMER + W10, 0},
204   {"kinyarwanda", "rw", KINYARWANDA + W10, 0},
205   {"klingon", "tlh", X_KLINGON + W10, 0},
206   {"korean", "ko", KOREAN + W10, 0},
207   {"kurdish", "ku", KURDISH + W10, 0},
208   {"kyrgyz", "ky", KYRGYZ + W10, 0},
209 
210   {"laothian", "lo", LAOTHIAN + W10, 0},
211   {"latin", "la", LATIN + W10, 0},
212   {"latvian", "lv", LATVIAN + W10, 0},
213   {"limbu", "sit", LIMBU + W10, 0},
214   {"lingala", "ln", LINGALA + W10, 0},
215   {"lithuanian", "lt", LITHUANIAN + W10, 0},
216   {"luxembourgish", "lb", LUXEMBOURGISH + W10, 0},
217 
218   {"macedonian", "mk", MACEDONIAN + W10, 0},
219   {"malagasy", "mg", MALAGASY + W10, 0},
220   {"malay", "ms", MALAY + W10, INDONESIAN - W4},
221   {"malayalam", "ml", MALAYALAM + W10, 0},
222   {"maltese", "mt", MALTESE + W10, 0},
223   {"manx", "gv", MANX + W10, 0},
224   {"maori", "mi", MAORI + W10, 0},
225   {"marathi", "mr", MARATHI + W10, HINDI - W4},
226   {"mauritian_creole", "mfe", MAURITIAN_CREOLE + W10, 0},
227   {"moldavian", "mo", ROMANIAN + W10, 0},
228   {"mongolian", "mn", MONGOLIAN + W10, 0},
229   {"montenegrin", "sr-me", MONTENEGRIN + W10, 0},
230   {"myanmar", "my", BURMESE + W10, 0},      // Myanmar
231   {"nauru", "na", NAURU + W10, 0},
232   {"ndebele", "nr", NDEBELE + W10, 0},
233   {"nepali", "ne", NEPALI + W10, 0},
234   {"no-bok", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},       // Bokmaal
235   {"no-bokmaal", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},
236   {"no-nb", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},        // Bokmaal
237   {"no-no", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},
238   {"no-nyn", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4},       // Nynorsk
239   {"no-nynorsk", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4},
240   {"norwegian", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},
241   {"norwegian_n", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4},
242   {"nyanja", "ny", NYANJA + W10, 0},
243 
244   {"occitan", "oc", OCCITAN + W10, 0},
245   {"oriya", "or", ORIYA + W10, 0},
246   {"oromo", "om", OROMO + W10, 0},
247   {"parsi", "fa", PERSIAN + W10, 0},
248 
249   {"pashto", "ps", PASHTO + W10, 0},
250   {"pedi", "nso", PEDI + W10, 0},
251   {"persian", "fa", PERSIAN + W10, 0},
252   {"polish", "pl", POLISH + W10, 0},
253   {"polska", "pl", POLISH + W10, 0},
254   {"polski", "pl", POLISH + W10, 0},
255   {"portugu", "pt", PORTUGUESE + W10, 0},     // Truncated at non-ASCII
256   {"portuguese", "pt", PORTUGUESE + W10, 0},
257   {"punjabi", "pa", PUNJABI + W10, 0},
258 
259   {"quechua", "qu", QUECHUA + W10, 0},
260 
261   {"rhaeto_romance", "rm", RHAETO_ROMANCE + W10, 0},
262   {"romanian", "ro", ROMANIAN + W10, 0},
263   {"rundi", "rn", RUNDI + W10, 0},
264   {"russian", "ru", RUSSIAN + W10, 0},
265 
266   {"samoan", "sm", SAMOAN + W10, 0},
267   {"sango", "sg", SANGO + W10, 0},
268   {"sanskrit", "sa", SANSKRIT + W10, 0},
269   {"scots", "sco", SCOTS + W10, ENGLISH - W4},
270   {"scots_gaelic", "gd", SCOTS_GAELIC + W10, 0},
271   {"serbian", "sr", SERBIAN + W10, 0},
272   {"seselwa", "crs", SESELWA + W10, 0},
273   {"sesotho", "st", SESOTHO + W10, 0},
274   {"shift-jis", "ja", JAPANESE + W10, 0},   // Japanese encoding
275   {"shift-js", "ja", JAPANESE + W10, 0},    // Japanese encoding
276   {"shona", "sn", SHONA + W10, 0},
277   {"si-lk", "si", SINHALESE + W10, 0},      // 1:2 Sri Lanka, not ambiguous
278   {"si-si", "sl", SLOVENIAN + W10, 0},      // 1:2 Slovenia, not ambiguous
279   {"si-sl", "sl", SLOVENIAN + W10, 0},      // 1:2 Slovenia, not ambiguous
280   {"sindhi", "sd", SINDHI + W10, 0},
281   {"sinhalese", "si", SINHALESE + W10, 0},
282   {"siswant", "ss", SISWANT + W10, 0},
283   {"sit-np", "sit", LIMBU + W10, 0},
284   {"slovak", "sk", SLOVAK + W10, CZECH - W4},
285   {"slovenian", "sl", SLOVENIAN + W10, 0},
286   {"somali", "so", SOMALI + W10, 0},
287   {"spanish", "es", SPANISH + W10, 0},
288   {"sr-me", "sr-me", MONTENEGRIN + W10, 0}, // Montenegrin => Montenegrin
289   {"sundanese", "su", SUNDANESE + W10, 0},
290   {"suomi", "fi", FINNISH + W10, 0},        // Finnish
291   {"swahili", "sw", SWAHILI + W10, 0},
292   {"swedish", "sv", SWEDISH + W10, 0},
293   {"syriac", "syr", SYRIAC + W10, 0},
294 
295   {"tagalog", "tl", TAGALOG + W10, 0},
296   {"tajik", "tg", TAJIK + W10, 0},
297   {"tamil", "ta", TAMIL + W10, 0},
298   {"tatar", "tt", TATAR + W10, 0},
299   {"tb-tb", "bo", TIBETAN + W10, DZONGKHA - W4},        // Tibet
300   {"tchinese", "zhT", CHINESE_T + W10, 0},
301   {"telugu", "te", TELUGU + W10, 0},
302   {"thai", "th", THAI + W10, 0},
303   {"tibetan", "bo", TIBETAN + W10, DZONGKHA - W4},
304   {"tigrinya", "ti", TIGRINYA + W10, 0},
305   {"tonga", "to", TONGA + W10, 0},
306   {"tsonga", "ts", TSONGA + W10, 0},
307   {"tswana", "tn", TSWANA + W10, 0},
308   {"tt-ru", "tt", TATAR + W10, 0},
309   {"tur-tr", "tr", TURKISH + W10, 0},
310   {"turkish", "tr", TURKISH + W10, 0},
311   {"turkmen", "tk", TURKMEN + W10, 0},
312   {"uighur", "ug", UIGHUR + W10, 0},
313   {"ukrainian", "uk", UKRAINIAN + W10, 0},
314   {"urdu", "ur", URDU + W10, 0},
315   {"uzbek", "uz", UZBEK + W10, 0},
316 
317   {"venda", "ve", VENDA + W10, 0},
318   {"vietnam", "vi", VIETNAMESE + W10, 0},
319   {"vietnamese", "vi", VIETNAMESE + W10, 0},
320   {"volapuk", "vo", VOLAPUK + W10, 0},
321 
322   {"welsh", "cy", WELSH + W10, 0},
323   {"wolof", "wo", WOLOF + W10, 0},
324 
325   {"xhosa", "xh", XHOSA + W10, ZULU - W4},
326 
327   {"yiddish", "yi", YIDDISH + W10, 0},
328   {"yoruba", "yo", YORUBA + W10, 0},
329 
330   {"zh-classical", "zhT", CHINESE_T + W10, 0},
331   {"zh-cn", "zh", CHINESE + W10, 0},
332   {"zh-hans", "zh", CHINESE + W10, 0},
333   {"zh-hant", "zhT", CHINESE_T + W10, 0},
334   {"zh-hk", "zhT", CHINESE_T + W10, 0},
335   {"zh-min-nan", "zhT", CHINESE_T + W10, 0}, // Min Nan => ChineseT
336   {"zh-sg", "zhT", CHINESE_T + W10, 0},
337   {"zh-tw", "zhT", CHINESE_T + W10, 0},
338   {"zh-yue", "zh", CHINESE + W10, 0},       // Yue (Cantonese) => Chinese
339   {"zhuang", "za", ZHUANG + W10, 0},
340   {"zulu", "zu", ZULU + W10, XHOSA - W4},
341 };
342 
343 
344 
345 // Table to look up lang= tags of two/three characters after truncate at hyphen
346 // In alphabetical order for binary search
347 static const int kCLDTable2Size = 257;
348 static const LangTagLookup kCLDLangTagsHintTable2[kCLDTable2Size] = {
349   {"aa", "aa", AFAR + W10, 0},
350   {"ab", "ab", ABKHAZIAN + W10, 0},
351   {"af", "af", AFRIKAANS + W10, 0},
352   {"ak", "ak", AKAN + W10, 0},
353   {"al", "sq", ALBANIAN + W10, 0},          // Albania
354   {"am", "am,hy", AMHARIC + W10, ARMENIAN + W10},  // 1:2 Amharic Armenian
355   {"ar", "ar", ARABIC + W10, 0},
356   {"ara", "ar", ARABIC + W10, 0},
357   {"arm", "hy", ARMENIAN + W10, 0},         // Armenia
358   {"arz", "ar", ARABIC + W10, 0},           // Egyptian Arabic
359   {"as", "as", ASSAMESE + W10, 0},
360   {"at", "de", GERMAN + W10, 0},            // Austria
361   {"au", "de", GERMAN + W10, 0},            // Austria
362   {"ay", "ay", AYMARA + W10, 0},
363   {"az", "az", AZERBAIJANI + W10, 0},
364   {"aze", "az", AZERBAIJANI + W10, 0},
365 
366   {"ba", "ba,bs", BASHKIR + W10, BOSNIAN + W10},  // 1:2  Bashkir Bosnia
367   {"be", "be", BELARUSIAN + W10, 0},
368   {"bel", "be", BELARUSIAN + W10, 0},
369   {"bg", "bg", BULGARIAN + W10, 0},
370   {"bh", "bh", BIHARI + W10, HINDI - W4},
371   {"bi", "bi", BISLAMA + W10, 0},
372   {"big", "zhT", CHINESE_T + W10, 0},        // Big5 encoding
373   {"bm", "ms", MALAY + W10, INDONESIAN - W4},             // Bahasa Malaysia
374   {"bn", "bn", BENGALI + W10, 0},
375   {"bo", "bo", TIBETAN + W10, DZONGKHA - W4},
376   // 1:2 Breton, Brazil country code, both Latn .br TLD enough for pt to win
377   {"br", "br,pt", BRETON + W10, PORTUGUESE + W8}, // 1:2 Breton, Brazil
378   {"bs", "bs", BOSNIAN + W10, 0},           // Bosnian => Bosnian
379 
380   {"ca", "ca", CATALAN + W10, 0},
381   {"cat", "ca", CATALAN + W10, 0},
382   {"ch", "de,fr", GERMAN + W10, FRENCH + W10},    // 1:2 Switzerland
383   {"chn", "zh", CHINESE + W10, 0},
384   {"chr", "chr", CHEROKEE + W10, 0},
385   {"ckb", "ku", KURDISH + W10, 0},          // Central Kurdish
386   {"cn", "zh,zhT", CHINESE + W6, CHINESE_T + W4},   // Ambiguous, so weaker.
387                                                 // Offset by 2 so that TLD=tw or
388                                                 // enc=big5 will put zhT ahead
389   {"co", "co", CORSICAN + W10, 0},
390   {"cro", "hr", CROATIAN + W10, 0},          // Croatia
391   {"crs", "crs", SESELWA + W10, 0},
392   {"cs", "cs", CZECH + W10, SLOVAK - W4},
393   {"ct", "ca", CATALAN + W10, 0},
394   {"cy", "cy", WELSH + W10, 0},
395   {"cym", "cy", WELSH + W10, 0},
396   {"cz", "cs", CZECH + W10, SLOVAK - W4},
397 
398   {"da", "da", DANISH + W10, NORWEGIAN - W4},
399   {"dan", "da", DANISH + W10, NORWEGIAN - W4},
400   {"de", "de", GERMAN + W10, 0},
401   {"deu", "de", GERMAN + W10, 0},
402   {"div", "dv", DHIVEHI + W10, 0},
403   {"dk", "da", DANISH + W10, NORWEGIAN - W4},            // Denmark
404   {"dut", "nl", DUTCH + W10, 0},            // Dutch
405   {"dv", "dv", DHIVEHI + W10, 0},
406   {"dz", "dz", DZONGKHA + W10, TIBETAN - W4},
407 
408   {"ee", "et", ESTONIAN + W10, 0},          // Estonia
409   {"eg", "ar", ARABIC + W10, 0},            // Egypt
410   {"el", "el", GREEK + W10, 0},
411   {"en", "en", ENGLISH + W4, 0},
412   {"eng", "en", ENGLISH + W4, 0},
413   {"eo", "eo", ESPERANTO + W10, 0},
414   {"er", "ur", URDU + W10, 0},              // "Erdu"
415   {"es", "es", SPANISH + W10, 0},
416   {"esp", "es", SPANISH + W10, 0},
417   {"est", "et", ESTONIAN + W10, 0},
418   {"et", "et", ESTONIAN + W10, 0},
419   {"eu", "eu", BASQUE + W10, 0},
420 
421   {"fa", "fa", PERSIAN + W10, 0},
422   {"far", "fa", PERSIAN + W10, 0},
423   {"fi", "fi", FINNISH + W10, 0},
424   {"fil", "tl", TAGALOG + W10, 0},          // Philippines
425   {"fj", "fj", FIJIAN + W10, 0},
426   {"fo", "fo", FAROESE + W10, ICELANDIC - W4},
427   {"fr", "fr", FRENCH + W10, 0},
428   {"fra", "fr", FRENCH + W10, 0},
429   {"fre", "fr", FRENCH + W10, 0},
430   {"fy", "fy", FRISIAN + W10, 0},
431 
432   {"ga", "ga,gl", IRISH + W10, GALICIAN + W10},       // 1:2 Irish, Galician
433   {"gae", "gd,ga", SCOTS_GAELIC + W10, IRISH + W10},  // 1:2 Gaelic, either
434   {"gal", "gl", GALICIAN + W10, 0},
435   {"gb", "zh", CHINESE + W10, 0},           // GB2312 encoding
436   {"gbk", "zh", CHINESE + W10, 0},          // GBK encoding
437   {"gd", "gd", SCOTS_GAELIC + W10, 0},
438   {"ge", "ka", GEORGIAN + W10, 0},          // Georgia
439   {"geo", "ka", GEORGIAN + W10, 0},
440   {"ger", "de", GERMAN + W10, 0},
441   {"gl", "gl", GALICIAN + W10, 0},          // Also Greenland; hard to confuse
442   {"gn", "gn", GUARANI + W10, 0},
443   {"gr", "el", GREEK + W10, 0},             // Greece
444   {"gu", "gu", GUJARATI + W10, 0},
445   {"gv", "gv", MANX + W10, 0},
446 
447   {"ha", "ha", HAUSA + W10, 0},
448   {"hat", "ht", HAITIAN_CREOLE + W10, 0},   // Haiti
449   {"haw", "haw", HAWAIIAN + W10, 0},
450   {"hb", "he", HEBREW + W10, 0},
451   {"he", "he", HEBREW + W10, 0},
452   {"heb", "he", HEBREW + W10, 0},
453   {"hi", "hi", HINDI + W10, MARATHI - W4},
454   {"hk", "zhT", CHINESE_T + W10, 0},          // Hong Kong
455   {"hr", "hr", CROATIAN + W10, 0},
456   {"ht", "ht", HAITIAN_CREOLE + W10, 0},
457   {"hu", "hu", HUNGARIAN + W10, 0},
458   {"hun", "hu", HUNGARIAN + W10, 0},
459   {"hy", "hy", ARMENIAN + W10, 0},
460 
461   {"ia", "ia", INTERLINGUA + W10, 0},
462   {"ice", "is", ICELANDIC + W10, FAROESE - W4},        // Iceland
463   {"id", "id", INDONESIAN + W10, MALAY - W4},
464   {"ids", "id", INDONESIAN + W10, MALAY - W4},
465   {"ie", "ie", INTERLINGUE + W10, 0},
466   {"ig", "ig", IGBO + W10, 0},
467   // 1:2 iu-Cans ik-Latn
468   {"ik", "ik,iu", INUPIAK + W10, INUKTITUT + W10},        // 1:2
469   {"in", "id", INDONESIAN + W10, MALAY - W4},
470   {"ind", "id", INDONESIAN + W10, MALAY - W4},       // Indonesia
471   {"inu", "iu,ik", INUKTITUT + W10, INUPIAK + W10},       // 1:2
472   {"is", "is", ICELANDIC + W10, FAROESE - W4},
473   {"it", "it", ITALIAN + W10, 0},
474   {"ita", "it", ITALIAN + W10, 0},
475   {"iu", "iu,ik", INUKTITUT + W10, INUPIAK + W10},        // 1:2
476   {"iw", "he", HEBREW + W10, 0},
477 
478   {"ja", "ja", JAPANESE + W10, 0},
479   {"jp", "ja", JAPANESE + W10, 0},          // Japan
480   {"jpn", "ja", JAPANESE + W10, 0},
481   {"jv", "jw", JAVANESE + W10, 0},
482   {"jw", "jw", JAVANESE + W10, 0},
483 
484   {"ka", "ka", GEORGIAN + W10, 0},
485   {"kc", "qu", QUECHUA + W10, 0},           // (K)Quechua
486   {"kg", "ky", KYRGYZ + W10, 0},            // Kyrgyzstan
487   {"kh", "km", KHMER + W10, 0},             // Country code Khmer (Cambodia)
488   {"kha", "kha", KHASI + W10, 0},
489   {"kk", "kk", KAZAKH + W10, 0},            // Kazakh
490   {"kl", "kl", GREENLANDIC + W10, 0},
491   {"km", "km", KHMER + W10, 0},
492   {"kn", "kn", KANNADA + W10, 0},
493   {"ko", "ko", KOREAN + W10, 0},
494   {"kor", "ko", KOREAN + W10, 0},
495   {"kr", "ko", KOREAN + W10, 0},            // Country code Korea
496   {"ks", "ks", KASHMIRI + W10, 0},
497   {"ksc", "ko", KOREAN + W10, 0},           // KSC encoding
498   {"ku", "ku", KURDISH + W10, 0},
499   {"ky", "ky", KYRGYZ + W10, 0},
500   {"kz", "kk", KAZAKH + W10, 0},            // Kazakhstan
501   {"la", "la", LATIN + W10, 0},
502   {"lao", "lo", LAOTHIAN + W10, 0},         // Laos
503 
504   {"lb", "lb", LUXEMBOURGISH + W10, 0},
505   {"lg", "lg", GANDA + W10, 0},
506   {"lit", "lt", LITHUANIAN + W10, 0},
507   {"ln", "ln", LINGALA + W10, 0},
508   {"lo", "lo", LAOTHIAN + W10, 0},
509   {"lt", "lt", LITHUANIAN + W10, 0},
510   {"ltu", "lt", LITHUANIAN + W10, 0},
511   {"lv", "lv", LATVIAN + W10, 0},
512 
513   {"mfe", "mfe", MAURITIAN_CREOLE + W10, 0},
514   {"mg", "mg", MALAGASY + W10, 0},
515   {"mi", "mi", MAORI + W10, 0},
516   {"mk", "mk", MACEDONIAN + W10, 0},
517   {"ml", "ml", MALAYALAM + W10, 0},
518   {"mn", "mn", MONGOLIAN + W10, 0},
519   {"mo", "mo", ROMANIAN + W10, 0},
520   {"mon", "mn", MONGOLIAN + W10, 0},        // Mongolian
521   {"mr", "mr", MARATHI + W10, HINDI - W4},
522   {"ms", "ms", MALAY + W10, INDONESIAN - W4},
523   {"mt", "mt", MALTESE + W10, 0},
524   {"mx", "es", SPANISH + W10, 0},           // Mexico
525   {"my", "my,ms", BURMESE + W10, MALAY + W10}, // Myanmar, Malaysia
526 
527   {"na", "na", NAURU + W10, 0},
528   {"nb", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},
529   {"ne", "ne", NEPALI + W10, 0},
530   {"nl", "nl", DUTCH + W10, 0},
531   {"nn", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4},
532   {"no", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},
533   {"nr", "nr", NDEBELE + W10, 0},
534   {"nso", "nso", PEDI + W10, 0},
535   {"ny", "ny", NYANJA + W10, 0},
536 
537   {"oc", "oc", OCCITAN + W10, 0},
538   {"om", "om", OROMO + W10, 0},
539   {"or", "or", ORIYA + W10, 0},
540 
541   {"pa", "pa,ps", PUNJABI + W10, PASHTO + W10},   // 1:2 pa-Guru ps-Arab
542   {"per", "fa", PERSIAN + W10, 0},
543   {"ph", "tl", TAGALOG + W10, 0},           // Philippines
544   {"pk", "ur", URDU + W10, 0},              // Pakistan
545   {"pl", "pl", POLISH + W10, 0},
546   {"pnb", "pa", PUNJABI + W10, 0},          // Western Punjabi
547   {"pol", "pl", POLISH + W10, 0},
548   {"por", "pt", PORTUGUESE + W10, 0},
549   {"ps", "ps", PASHTO + W10, 0},
550   {"pt", "pt", PORTUGUESE + W10, 0},
551   {"ptg", "pt", PORTUGUESE + W10, 0},
552   {"qc", "fr", FRENCH + W10, 0},            // Quebec "country" code
553   {"qu", "qu", QUECHUA + W10, 0},
554 
555   {"rm", "rm", RHAETO_ROMANCE + W10, 0},
556   {"rn", "rn", RUNDI + W10, 0},
557   {"ro", "ro", ROMANIAN + W10, 0},
558   {"rs", "sr", SERBIAN + W10, 0},           // Serbia country code
559   {"ru", "ru", RUSSIAN + W10, 0},
560   {"rus", "ru", RUSSIAN + W10, 0},
561   {"rw", "rw", KINYARWANDA + W10, 0},
562 
563   {"sa", "sa", SANSKRIT + W10, 0},
564   {"sco", "sco", SCOTS + W10, ENGLISH - W4},
565   {"sd", "sd", SINDHI + W10, 0},
566   {"se", "sv", SWEDISH + W10, 0},
567   {"sg", "sg", SANGO + W10, 0},
568   {"si", "si,sl", SINHALESE + W10, SLOVENIAN + W10},  // 1:2 Sinhalese, Slovinia
569   {"sk", "sk", SLOVAK + W10, CZECH - W4},
570   {"sl", "sl", SLOVENIAN + W10, 0},
571   {"slo", "sl", SLOVENIAN + W10, 0},
572   {"sm", "sm", SAMOAN + W10, 0},
573   {"sn", "sn", SHONA + W10, 0},
574   {"so", "so", SOMALI + W10, 0},
575   {"sp", "es", SPANISH + W10, 0},
576   {"sq", "sq", ALBANIAN + W10, 0},
577   {"sr", "sr", SERBIAN + W10, 0},
578   {"srb", "sr", SERBIAN + W10, 0},
579   {"srl", "sr", SERBIAN + W10, 0},          // Serbian Latin
580   {"srp", "sr", SERBIAN + W10, 0},
581   {"ss", "ss", SISWANT + W10, 0},
582   {"st", "st", SESOTHO + W10, 0},
583   {"su", "su", SUNDANESE + W10, 0},
584   {"sv", "sv", SWEDISH + W10, 0},
585   {"sve", "sv", SWEDISH + W10, 0},
586   {"sw", "sw", SWAHILI + W10, 0},
587   {"swe", "sv", SWEDISH + W10, 0},
588   {"sy", "syr", SYRIAC + W10, 0},
589   {"syr", "syr", SYRIAC + W10, 0},
590 
591   {"ta", "ta", TAMIL + W10, 0},
592   {"te", "te", TELUGU + W10, 0},
593   {"tg", "tg", TAJIK + W10, 0},
594   {"th", "th", THAI + W10, 0},
595   {"ti", "ti,bo", TIGRINYA + W10, TIBETAN + W10},    // 1:2 Tigrinya, Tibet
596   {"tj", "tg", TAJIK + W10, 0},             // Tajikistan
597   {"tk", "tk", TURKMEN + W10, 0},
598   {"tl", "tl", TAGALOG + W10, 0},
599   {"tlh", "tlh", X_KLINGON + W10, 0},
600   {"tn", "tn", TSWANA + W10, 0},
601   {"to", "to", TONGA + W10, 0},
602   {"tr", "tr", TURKISH + W10, 0},
603   {"ts", "ts", TSONGA + W10, 0},
604   {"tt", "tt", TATAR + W10, 0},
605   {"tw", "ak,zhT", AKAN + W10, CHINESE_T + W10},   // 1:2 Twi => Akan, Taiwan
606   {"twi", "ak", AKAN + W10, 0},             // Twi => Akan
607 
608   {"ua", "uk", UKRAINIAN + W10, 0},         // Ukraine
609   {"ug", "ug", UIGHUR + W10, 0},
610   {"uk", "uk", UKRAINIAN + W10, 0},
611   {"ur", "ur", URDU + W10, 0},
612   {"uz", "uz", UZBEK + W10, 0},
613 
614   {"va", "ca", CATALAN + W10, 0},           // Valencia => Catalan
615   {"val", "ca", CATALAN + W10, 0},          // Valencia => Catalan
616   {"ve", "ve", VENDA + W10, 0},
617   {"vi", "vi", VIETNAMESE + W10, 0},
618   {"vie", "vi", VIETNAMESE + W10, 0},
619   {"vn", "vi", VIETNAMESE + W10, 0},
620   {"vo", "vo", VOLAPUK + W10, 0},
621 
622   {"wo", "wo", WOLOF + W10, 0},
623 
624   {"xh", "xh", XHOSA + W10, ZULU - W4},
625   {"xho", "xh", XHOSA + W10, ZULU - W4},
626 
627   {"yi", "yi", YIDDISH + W10, 0},
628   {"yo", "yo", YORUBA + W10, 0},
629 
630   {"za", "za", ZHUANG + W10, 0},
631   {"zh", "zh", CHINESE + W10, 0},
632   {"zht", "zhT", CHINESE_T + W10, 0},
633   {"zu", "zu", ZULU + W10, XHOSA - W4},
634 };
635 
636 
637 // Possibly map to tl:
638 // -LangTags tl-Latn /7val.com/ ,bcl 2 Central Bicolano
639 // -LangTags tl-Latn /7val.com/ ,ceb 6 Cebuano
640 // -LangTags tl-Latn /7val.com/ ,war 1 Waray
641 
642 
643 
644 // Table to look up country TLD (no general TLD)
645 // In alphabetical order for binary search
646 static const int kCLDTable3Size = 181;
647 static const TLDLookup kCLDTLDHintTable[kCLDTable3Size] = {
648   {"ac", JAPANESE + W2, 0},
649   {"ad", CATALAN + W4, 0},
650   {"ae", ARABIC + W4, 0},
651   {"af", PASHTO + W4, PERSIAN + W4},
652   {"ag", GERMAN + W2, 0},                // meager
653   // {"ai", 0, 0},                          // meager
654   {"al", ALBANIAN + W4, 0},
655   {"am", ARMENIAN + W4, 0},
656   {"an", DUTCH + W4, 0},                 // meager
657   {"ao", PORTUGUESE + W4, 0},
658   // {"aq", 0, 0},                          // meager
659   {"ar", SPANISH + W4, 0},
660   // {"as", 0, 0},
661   {"at", GERMAN + W4, 0},
662   {"au", ENGLISH + W2, 0},
663   {"aw", DUTCH + W4, 0},
664   {"ax", SWEDISH + W4, 0},
665   {"az", AZERBAIJANI + W4, 0},
666 
667   {"ba", BOSNIAN + W8, CROATIAN - W4},
668   // {"bb", 0, 0},
669   {"bd", BENGALI + W4, 0},
670   {"be", DUTCH + W4, FRENCH + W4},
671   {"bf", FRENCH + W4, 0},
672   {"bg", BULGARIAN + W4, 0},
673   {"bh", ARABIC + W4, 0},
674   {"bi", RUNDI + W4, FRENCH + W4},
675   {"bj", FRENCH + W4, 0},
676   {"bm", ENGLISH + W2, 0},
677   {"bn", MALAY + W4, INDONESIAN - W4},
678   {"bo", SPANISH + W4, AYMARA + W2},   // and GUARANI QUECHUA
679   {"br", PORTUGUESE + W4, 0},
680   // {"bs", 0, 0},
681   {"bt", DZONGKHA + W10, TIBETAN - W10},      // Strong presumption of Dzongha
682   {"bw", TSWANA + W4, 0},
683   {"by", BELARUSIAN + W4, 0},
684   // {"bz", 0, 0},
685 
686   {"ca", FRENCH + W4, ENGLISH + W2},
687   {"cat", CATALAN + W4, 0},
688   {"cc", 0, 0},
689   {"cd", FRENCH + W4, 0},
690   {"cf", FRENCH + W4, 0},
691   {"cg", FRENCH + W4, 0},
692   {"ch", GERMAN + W4, FRENCH + W4},
693   {"ci", FRENCH + W4, 0},
694   // {"ck", 0, 0},
695   {"cl", SPANISH + W4, 0},
696   {"cm", FRENCH + W4, 0},
697   {"cn", CHINESE + W4, 0},
698   {"co", SPANISH + W4, 0},
699   {"cr", SPANISH + W4, 0},
700   {"cu", SPANISH + W4, 0},
701   {"cv", PORTUGUESE + W4, 0},
702   // {"cx", 0, 0},
703   {"cy", GREEK + W4, TURKISH + W4},
704   {"cz", CZECH + W4, SLOVAK - W4},
705 
706   {"de", GERMAN + W4, 0},
707   {"dj", 0, 0},
708   {"dk", DANISH + W4, NORWEGIAN - W4},
709   {"dm", 0, 0},
710   {"do", SPANISH + W4, 0},
711   {"dz", FRENCH + W4, ARABIC + W4},
712 
713   {"ec", SPANISH + W4, 0},
714   {"ee", ESTONIAN + W4, 0},
715   {"eg", ARABIC + W4, 0},
716   {"er", AFAR + W4, 0},
717   {"es", SPANISH + W4, 0},
718   {"et", AMHARIC + W4, AFAR + W4},
719 
720   {"fi", FINNISH + W4, 0},
721   {"fj", FIJIAN + W4, 0},
722   // {"fk", 0, 0},
723   // {"fm", 0, 0},
724   {"fo", FAROESE + W4, ICELANDIC - W4},
725   {"fr", FRENCH + W4, 0},
726 
727   {"ga", FRENCH + W4, 0},
728   {"gd", 0, 0},
729   {"ge", GEORGIAN + W4, 0},
730   {"gf", FRENCH + W4, 0},
731   // {"gg", 0, 0},
732   // {"gh", 0, 0},
733   // {"gi", 0, 0},
734   {"gl", GREENLANDIC + W4, DANISH + W4},
735   // {"gm", 0, 0},
736   {"gn", FRENCH + W4, 0},
737   // {"gp", 0, 0},
738   // {"gq", 0, 0},
739   {"gr", GREEK + W4, 0},
740   // {"gs", 0, 0},
741   {"gt", SPANISH + W4, 0},
742   // {"gu", 0, 0},
743   // {"gy", 0, 0},
744 
745   {"hk", CHINESE_T + W4, 0},
746   // {"hm", 0, 0},
747   {"hn", SPANISH + W4, 0},
748   {"hr", CROATIAN + W8, BOSNIAN - W4},
749   {"ht", HAITIAN_CREOLE + W4, FRENCH + W4},
750   {"hu", HUNGARIAN + W4, 0},
751 
752   {"id", INDONESIAN + W4, MALAY - W4},
753   {"ie", IRISH + W4, 0},
754   {"il", HEBREW + W4, 0},
755   {"im", MANX + W4, 0},
756   // {"in", 0, 0},
757   // {"io", 0, 0},
758   {"iq", ARABIC + W4, 0},
759   {"ir", PERSIAN + W4, 0},
760   {"is", ICELANDIC + W4, FAROESE - W4},
761   {"it", ITALIAN + W4, 0},
762 
763   // {"je", 0, 0},
764   // {"jm", 0, 0},
765   {"jo", ARABIC + W4, 0},
766   {"jp", JAPANESE + W4, 0},
767 
768   // {"ke", 0, 0},
769   {"kg", KYRGYZ + W4, 0},
770   {"kh", KHMER + W4, 0},
771   // {"ki", 0, 0},
772   {"km", FRENCH + W4, 0},
773   // {"kn", 0, 0},
774   {"kp", KOREAN + W4, 0},
775   {"kr", KOREAN + W4, 0},
776   {"kw", ARABIC + W4, 0},
777   // {"ky", 0, 0},
778   {"kz", KAZAKH + W4, 0},
779 
780   {"la", LAOTHIAN + W4, 0},
781   {"lb", ARABIC + W4, FRENCH + W4},
782   // {"lc", 0, 0},
783   {"li", GERMAN + W4, 0},
784   {"lk", SINHALESE + W4, 0},
785   // {"lr", 0, 0},
786   {"ls", SESOTHO + W4, 0},
787   {"lt", LITHUANIAN + W4, 0},
788   {"lu", LUXEMBOURGISH + W4},
789   {"lv", LATVIAN + W4, 0},
790   {"ly", ARABIC + W4, 0},
791 
792   {"ma", FRENCH + W4, 0},
793   {"mc", FRENCH + W4, 0},
794   {"md", ROMANIAN + W4, 0},
795   {"me", MONTENEGRIN + W8, SERBIAN - W4},
796   {"mg", FRENCH + W4, 0},
797   {"mk", MACEDONIAN + W4, 0},
798   {"ml", FRENCH + W4, 0},
799   {"mm", BURMESE + W4, 0},
800   {"mn", MONGOLIAN + W4, 0},
801   {"mo", CHINESE_T + W4, PORTUGUESE + W4},
802   // {"mp", 0, 0},
803   {"mq", FRENCH + W4, 0},
804   {"mr", FRENCH + W4, ARABIC + W4},
805   // {"ms", 0, 0},
806   {"mt", MALTESE + W4, 0},
807   // {"mu", 0, 0},
808   {"mv", DHIVEHI + W4, 0},
809   // {"mw", 0, 0},
810   {"mx", SPANISH + W4, 0},
811   {"my", MALAY + W4, INDONESIAN - W4},
812   {"mz", PORTUGUESE + W4, 0},
813 
814   {"na", 0, 0},            // Namibia
815   {"nc", FRENCH + W4, 0},
816   {"ne", FRENCH + W4, 0},
817   {"nf", FRENCH + W4, 0},
818   // {"ng", 0, 0},
819   {"ni", SPANISH + W4, 0},
820   {"nl", DUTCH + W4, 0},
821   {"no", NORWEGIAN + W4, NORWEGIAN_N + W2},
822   {"np", NEPALI + W4, 0},
823   {"nr", NAURU + W4, 0},
824   {"nu", SWEDISH + W4, 0},
825   {"nz", MAORI + W4, ENGLISH + W2},
826 
827   {"om", ARABIC + W4, 0},
828 
829   {"pa", SPANISH + W4, 0},
830   {"pe", SPANISH + W4, QUECHUA + W2},   // also AYMARA
831   {"pf", FRENCH + W4, 0},
832   // {"pg", 0, 0},
833   {"ph", TAGALOG + W4, 0},
834   {"pk", URDU + W4, 0},
835   {"pl", POLISH + W4, 0},
836   // {"pn", 0, 0},
837   {"pr", SPANISH + W4, 0},
838   {"ps", ARABIC + W4, 0},
839   {"pt", PORTUGUESE + W4, 0},
840   {"py", SPANISH + W4, GUARANI + W2},
841 
842   {"qa", ARABIC + W4, 0},
843 
844   {"re", FRENCH + W4, 0},
845   {"ro", ROMANIAN + W4, 0},
846   {"rs", SERBIAN + W8, MONTENEGRIN - W4},
847   {"ru", RUSSIAN + W4, 0},
848   {"rw", KINYARWANDA + W4, FRENCH + W2},
849 
850   {"sa", ARABIC + W4, 0},
851   // {"sb", 0, 0},
852   {"sc", SESELWA + W4, 0},
853   {"sd", ARABIC + W4, 0},
854   {"se", SWEDISH + W4, 0},
855   // {"sg", 0, 0},
856   // {"sh", 0, 0},
857   {"si", SLOVENIAN + W4, 0},
858   {"sk", SLOVAK + W4, CZECH - W4},
859   // {"sl", 0, 0},
860   {"sm", ITALIAN + W4, 0},
861   {"sn", FRENCH + W4, 0},
862   // {"sr", 0, 0},
863   {"ss", ARABIC + W4, 0},     // Presumed South Sudan TLD. dsites 2011.07.07
864   // {"st", 0, 0},
865   {"su", RUSSIAN + W4, 0},
866   {"sv", SPANISH + W4, 0},
867   {"sy", ARABIC + W4, 0},
868   // {"sz", 0, 0},
869 
870   // {"tc", 0, 0},
871   {"td", FRENCH + W4, 0},
872   // {"tf", 0, 0},
873   {"tg", FRENCH + W4, 0},
874   {"th", THAI + W4, 0},
875                               // Tibet has no country code (see .cn)
876   {"tj", TAJIK + W4, 0},
877   // {"tk", 0, 0},
878   // {"tl", 0, 0},
879   {"tm", TURKISH + W4, 0},
880   {"tn", FRENCH + W4, ARABIC + W4},
881   // {"to", 0, 0},
882   {"tp", JAPANESE + W4, 0},
883   {"tr", TURKISH + W4, 0},
884   // {"tt", 0, 0},
885   // {"tv", 0, 0},
886   {"tw", CHINESE_T + W4, 0},
887   {"tz", SWAHILI + W4, AKAN + W4},
888 
889   {"ua", UKRAINIAN + W4, 0},
890   {"ug", GANDA + W4, 0},
891   {"uk", ENGLISH + W2, 0},
892   {"us", ENGLISH + W2, 0},
893   {"uy", SPANISH + W4, 0},
894   {"uz", UZBEK + W4, 0},
895 
896   {"va", ITALIAN + W4, LATIN + W2},
897   // {"vc", 0, 0},
898   {"ve", SPANISH + W4, 0},
899   // {"vg", 0, 0},
900   // {"vi", 0, 0},
901   {"vn", VIETNAMESE + W4, 0},
902   // {"vu", 0, 0},
903 
904   {"wf", FRENCH + W4, 0},
905   // {"ws", 0, 0},
906 
907   {"ye", ARABIC + W4, 0},
908 
909   {"za", AFRIKAANS + W4, 0},
910   // {"zm", 0, 0},
911   // {"zw", 0, 0},
912 };
913 
914 #undef W2
915 #undef W4
916 #undef W6
917 #undef W8
918 #undef W10
919 #undef W12
920 
921 
922 
923 
924 
// Overwrite the weight field of *olp with w.
// The low 10 bits (mask 0x3ff, the language field) are carried over unchanged;
// w is stored in the bits above them.
inline void SetCLDPriorWeight(int w, OneCLDLangPrior* olp) {
  const int lang_bits = *olp & 0x3ff;
  const int weight_bits = w << 10;
  *olp = lang_bits + weight_bits;
}
SetCLDPriorLang(Language lang,OneCLDLangPrior * olp)928 inline void SetCLDPriorLang(Language lang, OneCLDLangPrior* olp) {
929   *olp = (*olp & ~0x3ff) + lang;
930 }
931 
PackCLDPriorLangWeight(Language lang,int w)932 OneCLDLangPrior PackCLDPriorLangWeight(Language lang, int w) {
933   return (w << 10) + lang;
934 }
935 
// Return the larger of the two arguments (a when they are equal).
inline int MaxInt(int a, int b) {
  if (a >= b) {return a;}
  return b;
}
939 
940 // Merge in another language prior, taking max if already there
MergeCLDLangPriorsMax(OneCLDLangPrior olp,CLDLangPriors * lps)941 void MergeCLDLangPriorsMax(OneCLDLangPrior olp, CLDLangPriors* lps) {
942   if (olp == 0) {return;}
943   Language target_lang = GetCLDPriorLang(olp);
944   for (int i = 0; i < lps->n; ++i) {
945     if (GetCLDPriorLang(lps->prior[i]) == target_lang) {
946       int new_weight = MaxInt(GetCLDPriorWeight(lps->prior[i]),
947                               GetCLDPriorWeight(olp));
948       SetCLDPriorWeight(new_weight, &lps->prior[i]);
949       return;
950     }
951   }
952   // Not found; add it if room
953   if (lps->n >= kMaxOneCLDLangPrior) {return;}
954   lps->prior[lps->n++] = olp;
955 }
956 
957 // Merge in another language prior, boosting 10x if already there
MergeCLDLangPriorsBoost(OneCLDLangPrior olp,CLDLangPriors * lps)958 void MergeCLDLangPriorsBoost(OneCLDLangPrior olp, CLDLangPriors* lps) {
959   if (olp == 0) {return;}
960   Language target_lang = GetCLDPriorLang(olp);
961   for (int i = 0; i < lps->n; ++i) {
962     if (GetCLDPriorLang(lps->prior[i]) == target_lang) {
963       int new_weight = GetCLDPriorWeight(lps->prior[i]) + 2;
964       SetCLDPriorWeight(new_weight, &lps->prior[i]);
965       return;
966     }
967   }
968   // Not found; add it if room
969   if (lps->n >= kMaxOneCLDLangPrior) {return;}
970   lps->prior[lps->n++] = olp;
971 }
972 
973 
974 // Trim language priors to no more than max_entries, keeping largest abs weights
TrimCLDLangPriors(int max_entries,CLDLangPriors * lps)975 void TrimCLDLangPriors(int max_entries, CLDLangPriors* lps) {
976   if (lps->n <= max_entries) {return;}
977 
978   // Insertion sort in-place by abs(weight)
979   for (int i = 0; i < lps->n; ++i) {
980     OneCLDLangPrior temp_olp = lps->prior[i];
981     int w = abs(GetCLDPriorWeight(temp_olp));
982     int kk = i;
983     for (; kk > 0; --kk) {
984       if (abs(GetCLDPriorWeight(lps->prior[kk - 1])) < w) {
985         // Move down and continue
986         lps->prior[kk] = lps->prior[kk - 1];
987       } else {
988         // abs(weight[kk - 1]) >= w, time to stop
989         break;
990       }
991     }
992     lps->prior[kk] = temp_olp;
993   }
994 
995   lps->n = max_entries;
996 }
997 
// Return how many ',' bytes occur in langtags.
int CountCommas(const string& langtags) {
  int total = 0;
  for (string::const_iterator it = langtags.begin();
       it != langtags.end(); ++it) {
    total += (*it == ',') ? 1 : 0;
  }
  return total;
}
1005 
1006 // Binary lookup on language tag
DoLangTagLookup(const char * key,const LangTagLookup * tbl,int tbl_size)1007 const LangTagLookup* DoLangTagLookup(const char* key,
1008                                      const LangTagLookup* tbl, int tbl_size) {
1009   // Key is always in range [lo..hi)
1010   int lo = 0;
1011   int hi = tbl_size;
1012   while (lo < hi) {
1013     int mid = (lo + hi) >> 1;
1014     int comp = strcmp(tbl[mid].langtag, key);
1015     if (comp < 0) {
1016       lo = mid + 1;
1017     } else if (comp > 0) {
1018       hi = mid;
1019     } else {
1020       return &tbl[mid];
1021     }
1022   }
1023   return NULL;
1024 }
1025 
1026 // Binary lookup on tld
DoTLDLookup(const char * key,const TLDLookup * tbl,int tbl_size)1027 const TLDLookup* DoTLDLookup(const char* key,
1028                              const TLDLookup* tbl, int tbl_size) {
1029   // Key is always in range [lo..hi)
1030   int lo = 0;
1031   int hi = tbl_size;
1032   while (lo < hi) {
1033     int mid = (lo + hi) >> 1;
1034     int comp = strcmp(tbl[mid].tld, key);
1035     if (comp < 0) {
1036       lo = mid + 1;
1037     } else if (comp > 0) {
1038       hi = mid;
1039     } else {
1040       return &tbl[mid];
1041     }
1042   }
1043   return NULL;
1044 }
1045 
1046 
1047 
1048 // Trim language tag string to canonical form for each language
1049 // Input is from GetLangTagsFromHtml(), already lowercased
TrimCLDLangTagsHint(const string & langtags)1050 string TrimCLDLangTagsHint(const string& langtags) {
1051   string retval;
1052   if (langtags.empty()) {return retval;}
1053   int commas = CountCommas(langtags);
1054   if (commas > 4) {return retval;}       // Ignore if too many language tags
1055 
1056   char temp[20];
1057   int pos = 0;
1058   while (pos < static_cast<int>(langtags.size())) {
1059     int comma = langtags.find(',', pos);
1060     if (comma == string::npos) {comma = langtags.size();} // fake trailing comma
1061     int len = comma - pos;
1062     if (len <= 16) {
1063       // Short enough to use
1064       memcpy(temp, &langtags[pos], len);
1065       temp[len] = '\0';
1066       const LangTagLookup* entry = DoLangTagLookup(temp,
1067                                                    kCLDLangTagsHintTable1,
1068                                                    kCLDTable1Size);
1069       if (entry != NULL) {
1070         // First table hit
1071         retval.append(entry->langcode);     // may be "code1,code2"
1072         retval.append(1, ',');
1073       } else {
1074         // Try second table with language code truncated at first hyphen
1075         char* hyphen = strchr(temp, '-');
1076         if (hyphen != NULL) {*hyphen = '\0';}
1077         len = strlen(temp);
1078         if (len <= 3) {                 // Short enough to use
1079           entry = DoLangTagLookup(temp,
1080                                   kCLDLangTagsHintTable2,
1081                                   kCLDTable2Size);
1082           if (entry != NULL) {
1083             // Second table hit
1084             retval.append(entry->langcode);     // may be "code1,code2"
1085             retval.append(1, ',');
1086           }
1087         }
1088       }
1089     }
1090     pos = comma + 1;
1091   }
1092 
1093   // Remove trainling comma, if any
1094   if (!retval.empty()) {retval.resize(retval.size() - 1);}
1095   return retval;
1096 }
1097 
1098 
1099 
1100 //==============================================================================
1101 
1102 // Little state machine to scan insides of language attribute quoted-string.
1103 // Each language code is lowercased and copied to the output string. Underscore
1104 // is mapped to minus. Space, tab, and comma are all mapped to comma, and
1105 // multiple consecutive commas are removed.
1106 // Each language code in the output list will be followed by a single comma.
1107 
1108 // There are three states, and we start in state 1:
1109 // State 0: After a letter.
1110 //  Copy all letters/minus[0], copy comma[1]; all others copy comma and skip [2]
1111 // State 1: Just after a comma.
1112 //  Copy letter [0], Ignore subsequent commas[1]. minus and all others skip [2]
1113 // State 2: Skipping.
1114 //  All characters except comma skip and stay in [2]. comma goes to [1]
1115 
1116 // The thing that is copied is kLangCodeRemap[c] when going to state 0,
1117 // and always comma when going to state 1 or 2. The design depends on copying
1118 // a comma at the *beginning* of skipping, and in state 2 never doing a copy.
1119 
1120 // We pack all this into 8 bits:
1121 //    +--+---+---+
1122 //    |78|654|321|
1123 //    +--+---+---+
1124 //
1125 // Shift byte right by 3*state, giving [0] 321, [1] 654, [2] .78
1126 // where . is always zero
1127 // Of these 3 bits, low two are next state ss, high bit is copy bit C.
1128 // If C=1 and ss == 0, copy kLangCodeRemap[c], else copy a comma
1129 
// Three-bit action codes: the low two bits are the next state and the 4-bit
// is the copy flag.
#define SKIP0 0   // no copy, next state 0
#define SKIP1 1   // no copy, next state 1
#define SKIP2 2   // no copy, next state 2
#define COPY0 4   // copy kLangCodeRemap[c], next state 0
#define COPY1 5   // copy ',', next state 1
#define COPY2 6   // copy ',', next state 2

// One byte per character class, holding the action for each of the 3 states.
// Ninth bit must be zero, so all state 2 values must be skips.
//              state[2]       state[1]      state[0]
#define LTR   ((SKIP2 << 6) + (COPY0 << 3) + COPY0)
#define MINUS ((SKIP2 << 6) + (COPY2 << 3) + COPY0)
#define COMMA ((SKIP1 << 6) + (SKIP1 << 3) + COPY1)
#define Bad   ((SKIP2 << 6) + (COPY2 << 3) + COPY2)

// Run-length helpers, used only to lay out the two tables below
#define BAD4  Bad, Bad, Bad, Bad
#define BAD8  BAD4, BAD4
#define BAD16 BAD8, BAD8
#define LTR4  LTR, LTR, LTR, LTR
#define LTR8  LTR4, LTR4
#define NUL4  0, 0, 0, 0
#define NUL8  NUL4, NUL4
#define NUL16 NUL8, NUL8

// Treat as letter: a-z,  A-Z
// Treat as minus:  2D minus,  5F underscore
// Treat as comma:  09 tab,  20 space,  2C comma
// Every other byte value is Bad.

static const unsigned char kLangCodeAction[256] = {
  // 00..0F: 09 tab
  BAD8,  Bad, COMMA, BAD4, Bad, Bad,
  // 10..1F
  BAD16,
  // 20..2F: 20 space, 2C comma, 2D minus
  COMMA, BAD4, Bad, Bad, Bad,  BAD4, COMMA, MINUS, Bad, Bad,
  // 30..3F
  BAD16,

  // 40..4F: 40 is not a letter, then A..O
  Bad, LTR8, LTR4, LTR, LTR, LTR,
  // 50..5F: P..Z, four non-letters, 5F underscore
  LTR8, LTR, LTR, LTR,  BAD4, MINUS,
  // 60..6F: 60 is not a letter, then a..o
  Bad, LTR8, LTR4, LTR, LTR, LTR,
  // 70..7F: p..z, five non-letters
  LTR8, LTR, LTR, LTR,  BAD4, Bad,

  // 80..BF
  BAD16, BAD16, BAD16, BAD16,
  // C0..FF
  BAD16, BAD16, BAD16, BAD16,
};

// This does lowercasing, maps underscore to minus, and maps tab/space to comma
static const unsigned char kLangCodeRemap[256] = {
  // 00..0F: 09 tab
  NUL8,  0, ',', NUL4, 0, 0,
  // 10..1F
  NUL16,
  // 20..2F: 20 space, 2C comma, 2D minus
  ',', NUL4, 0, 0, 0,  NUL4, ',', '-', 0, 0,
  // 30..3F
  NUL16,

  // 40..4F: A..O map to a..o
  0, 'a', 'b', 'c', 'd', 'e', 'f', 'g',  'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
  // 50..5F: P..Z map to p..z, 5F underscore maps to minus
  'p', 'q', 'r', 's', 't', 'u', 'v', 'w',  'x', 'y', 'z', NUL4, '-',
  // 60..6F: a..o
  0, 'a', 'b', 'c', 'd', 'e', 'f', 'g',  'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
  // 70..7F: p..z
  'p', 'q', 'r', 's', 't', 'u', 'v', 'w',  'x', 'y', 'z', NUL4, 0,

  // 80..BF
  NUL16, NUL16, NUL16, NUL16,
  // C0..FF
  NUL16, NUL16, NUL16, NUL16,
};

#undef BAD4
#undef BAD8
#undef BAD16
#undef LTR4
#undef LTR8
#undef NUL4
#undef NUL8
#undef NUL16

#undef LTR
#undef MINUS
#undef COMMA
#undef Bad

#undef SKIP0
#undef SKIP1
#undef SKIP2
#undef COPY0
#undef COPY1
#undef COPY2
1205 
1206 
// Find opening '<' for HTML tag
// Searches utf8_body[pos..max_pos) and returns the index of the first '<',
// or -1 if there is none (including when the range is empty).
// Note: this is all somewhat insensitive to mismatched quotes
int32 FindTagStart(const char* utf8_body, int32 pos, int32 max_pos) {
  // Empty range: nothing to find, and keeps the length below non-negative
  if (pos >= max_pos) {return -1;}

  // memchr does the byte scan; no word-at-a-time unaligned loads needed here
  const void* hit = memchr(utf8_body + pos, '<', max_pos - pos);
  if (hit == NULL) {return -1;}
  return static_cast<int32>(static_cast<const char*>(hit) - utf8_body);
}
1228 
1229 
// Find closing '>' for HTML tag. Also stop on < and & (simplistic parsing)
// Scans utf8_body[pos..max_pos), always treating the text as outside quotes.
// Returns the index of '>' itself, or the index just before a '<' or '&',
// whichever byte comes first; -1 if none of the three is found.
int32 FindTagEnd(const char* utf8_body, int32 pos, int32 max_pos) {
  int scan = pos;
  while (scan < max_pos) {
    switch (utf8_body[scan]) {
      case '>':
        return scan;
      case '<':
      case '&':
        return scan - 1;
      default:
        break;
    }
    ++scan;
  }
  return -1;              // nothing found
}
1241 
// Find opening quote or apostrophe, skipping spaces
// Scans utf8_body[pos..max_pos). Returns the index of the first byte that is
// not a space if that byte is a quote or apostrophe; otherwise returns -1
// (also when the range holds only spaces or is empty).
// Note: this is all somewhat insensitive to mismatched quotes
int32 FindQuoteStart(const char* utf8_body, int32 pos, int32 max_pos) {
  int scan = pos;
  while ((scan < max_pos) && (utf8_body[scan] == ' ')) {++scan;}
  if (scan >= max_pos) {return -1;}

  const char first = utf8_body[scan];
  if ((first == '"') || (first == '\'')) {return scan;}
  return -1;
}
1253 
// Find closing quot/apos. Also stop on = > < and & (simplistic parsing)
// Scans utf8_body[pos..max_pos). Returns the index of the first quote or
// apostrophe (either one ends the value, whichever opened it), or the index
// just before the first = > < or &, whichever byte comes first; -1 if none
// of these is found.
int32 FindQuoteEnd(const char* utf8_body, int32 pos, int32 max_pos) {
  int scan = pos;
  while (scan < max_pos) {
    switch (utf8_body[scan]) {
      case '"':
      case '\'':
        return scan;
      case '>':
      case '=':
      case '<':
      case '&':
        return scan - 1;
      default:
        break;
    }
    ++scan;
  }
  return -1;              // nothing found
}
1268 
// Find the first '=' in utf8_body[pos..max_pos) that is not inside a quoted
// or apostrophe-delimited run. A run starts at a quote or apostrophe and ends
// at the next matching delimiter; inside a run, the byte after a backslash is
// skipped without being examined. An unterminated run swallows the rest of
// the range.
// Returns the index of the '=', or -1 if none is found.
int32 FindEqualSign(const char* utf8_body, int32 pos, int32 max_pos) {
  int i = pos;
  while (i < max_pos) {
    const char c = utf8_body[i];
    if (c == '=') {return i;}       // Found bare equal sign inside tag

    if ((c == '"') || (c == '\'')) {
      // Step over the delimited run; it must close with the same character
      const char closer = c;
      int j = i + 1;
      while ((j < max_pos) && (utf8_body[j] != closer)) {
        if (utf8_body[j] == '\\') {++j;}    // Also skip the escaped byte
        ++j;
      }
      i = j;                        // On the closer, or at/after max_pos
    }
    ++i;
  }
  return -1;              // nothing found
}
1302 
// Test whether utf8_body[min_pos..pos) ends with string s, case-insensitively,
// allowing spaces between the end of s and pos.
// Bytes of s must already be lowercase, i.e. in [20..3f] or [60..7f]
// Cheap lowercase (OR with 0x20). Control codes will masquerade as 20..3f
// Trailing spaces are only skipped while enough room for s remains in front
// of them. Returns true when every byte of s matches.
bool FindBefore(const char* utf8_body,
                 int32 min_pos, int32 pos, const char* s) {
  const int len = strlen(s);
  if ((pos - min_pos) < len) {return false;}     // Too small to fit s

  // Back up over trailing spaces, never past the point where s would not fit
  const int lowest_end = min_pos + len;
  int end = pos;
  while ((end > lowest_end) && (utf8_body[end - 1] == ' ')) {--end;}

  const int start = end - len;
  if (start < min_pos) {return false;}           // Cannot fit s after all

  // Count leading bytes that agree after the cheap lowercase
  const char* candidate = utf8_body + start;
  int matched = 0;
  while ((matched < len) && ((candidate[matched] | 0x20) == s[matched])) {
    ++matched;
  }
  return matched == len;
}
1323 
// Test whether utf8_body[pos..max_pos) starts with string s, case-insensitively,
// after skipping leading spaces, quotes and apostrophes.
// Bytes of s must already be lowercase, i.e. in [20..3f] or [60..7f]
// Cheap lowercase (OR with 0x20). Control codes will masquerade as 20..3f
// Allows but does not require quoted/apostrophe string: leading delimiters are
// skipped, and a space in s (callers use one as a trailing word delimiter,
// e.g. "content-language ") also matches a closing quote or apostrophe, so
// that ="content-language" matches just like =content-language followed by a
// space.
// Leading bytes are only skipped while enough room for s remains after them.
// Returns true when every byte of s matches.
bool FindAfter(const char* utf8_body,
                 int32 pos, int32 max_pos, const char* s) {
  const int len = strlen(s);
  if ((max_pos - pos) < len) {return false;}     // Too small to fit s

  // Skip leading spaces, quote, apostrophe
  int i = pos;
  while (i < (max_pos - len)) {
    const unsigned char c = utf8_body[i];
    if ((c == ' ') || (c == '"') || (c == '\'')) {++i;}
    else {break;}
  }

  const char* p = &utf8_body[i];
  for (int j = 0; j < len; ++j) {
    const char got = p[j];
    const char want = s[j];
    if ((got | 0x20) == want) {continue;}        // Equal after cheap lowercase
    // A delimiter space in s is also satisfied by the closing quote/apostrophe
    if ((want == ' ') && ((got == '"') || (got == '\''))) {continue;}
    return false;                                // Unequal byte
  }
  return true;                                   // All bytes equal
}
1347 
1348 
1349 
1350 // Copy attribute value in [pos..max_pos)
1351 // pos is just after an opening quote/apostrophe and max_pos is the ending one
1352 // String must all be on a single line.
1353 // Return slightly-normalized language list, empty or ending in comma
1354 // Does lowercasing and removes excess punctuation/space
CopyOneQuotedString(const char * utf8_body,int32 pos,int32 max_pos)1355 string CopyOneQuotedString(const char* utf8_body,
1356                          int32 pos, int32 max_pos) {
1357   string s;
1358   int state = 1;        // Front is logically just after a comma
1359   for (int i = pos; i < max_pos; ++i) {
1360     unsigned char c = utf8_body[i];
1361     int e = kLangCodeAction[c] >> (3 * state);
1362     state = e & 3;      // Update to next state
1363     if ((e & 4) != 0) {
1364       // Copy a remapped byte if going to state 0, else copy a comma
1365       if (state == 0) {
1366         s.append(1, kLangCodeRemap[c]);
1367       } else {
1368         s.append(1, ',');
1369       }
1370     }
1371   }
1372 
1373   // Add final comma if needed
1374   if (state == 0) {
1375     s.append(1, ',');
1376   }
1377   return s;
1378 }
1379 
1380 // Find and copy attribute value: quoted string in [pos..max_pos)
1381 // Return slightly-normalized language list, empty or ending in comma
CopyQuotedString(const char * utf8_body,int32 pos,int32 max_pos)1382 string CopyQuotedString(const char* utf8_body,
1383                          int32 pos, int32 max_pos) {
1384   int32 start_quote = FindQuoteStart(utf8_body, pos, max_pos);
1385   if (start_quote < 0) {return string("");}
1386   int32 end_quote = FindQuoteEnd(utf8_body, start_quote + 1, max_pos);
1387   if (end_quote < 0) {return string("");}
1388 
1389   return CopyOneQuotedString(utf8_body, start_quote + 1, end_quote);
1390 }
1391 
1392 // Add hints to vector of langpriors
1393 // Input is from GetLangTagsFromHtml(), already lowercased
SetCLDLangTagsHint(const string & langtags,CLDLangPriors * langpriors)1394 void SetCLDLangTagsHint(const string& langtags, CLDLangPriors* langpriors) {
1395   if (langtags.empty()) {return;}
1396   int commas = CountCommas(langtags);
1397   if (commas > 4) {return;}       // Ignore if too many language tags
1398 
1399   char temp[20];
1400   int pos = 0;
1401   while (pos < static_cast<int>(langtags.size())) {
1402     int comma = langtags.find(',', pos);
1403     if (comma == string::npos) {comma = langtags.size();} // fake trailing comma
1404     int len = comma - pos;
1405     if (len <= 16) {
1406       // Short enough to use
1407       memcpy(temp, &langtags[pos], len);
1408       temp[len] = '\0';
1409       const LangTagLookup* entry = DoLangTagLookup(temp,
1410                                                    kCLDLangTagsHintTable1,
1411                                                    kCLDTable1Size);
1412       if (entry != NULL) {
1413         // First table hit
1414         MergeCLDLangPriorsMax(entry->onelangprior1, langpriors);
1415         MergeCLDLangPriorsMax(entry->onelangprior2, langpriors);
1416       } else {
1417         // Try second table with language code truncated at first hyphen
1418         char* hyphen = strchr(temp, '-');
1419         if (hyphen != NULL) {*hyphen = '\0';}
1420         len = strlen(temp);
1421         if (len <= 3) {                 // Short enough to use
1422           entry = DoLangTagLookup(temp,
1423                                   kCLDLangTagsHintTable2,
1424                                   kCLDTable2Size);
1425           if (entry != NULL) {
1426             // Second table hit
1427             MergeCLDLangPriorsMax(entry->onelangprior1, langpriors);
1428             MergeCLDLangPriorsMax(entry->onelangprior2, langpriors);
1429           }
1430         }
1431       }
1432     }
1433     pos = comma + 1;
1434   }
1435 }
1436 
1437 // Add hints to vector of langpriors
1438 // Input is string after HTTP header Content-Language:
SetCLDContentLangHint(const char * contentlang,CLDLangPriors * langpriors)1439 void SetCLDContentLangHint(const char* contentlang, CLDLangPriors* langpriors) {
1440   string langtags = CopyOneQuotedString(contentlang, 0, strlen(contentlang));
1441   SetCLDLangTagsHint(langtags, langpriors);
1442 }
1443 
1444 // Add hints to vector of langpriors
1445 // Input is last element of hostname (no dot), e.g. from GetTLD()
SetCLDTLDHint(const char * tld,CLDLangPriors * langpriors)1446 void SetCLDTLDHint(const char* tld, CLDLangPriors* langpriors) {
1447   int len = strlen(tld);
1448   if (len > 3) {return;}        // Ignore if more than three letters
1449   char local_tld[4];
1450   strncpy(local_tld, tld, 4);
1451   local_tld[3] = '\0';          // Safety move
1452   // Lowercase
1453   for (int i = 0; i < len; ++i) {local_tld[i] |= 0x20;}
1454   const TLDLookup* entry = DoTLDLookup(local_tld,
1455                                        kCLDTLDHintTable,
1456                                        kCLDTable3Size);
1457   if (entry != NULL) {
1458     // Table hit
1459     MergeCLDLangPriorsBoost(entry->onelangprior1, langpriors);
1460     MergeCLDLangPriorsBoost(entry->onelangprior2, langpriors);
1461   }
1462 }
1463 
1464 // Add hints to vector of langpriors
1465 // Input is from DetectEncoding()
SetCLDEncodingHint(Encoding enc,CLDLangPriors * langpriors)1466 void SetCLDEncodingHint(Encoding enc, CLDLangPriors* langpriors) {
1467   OneCLDLangPrior olp;
1468   switch (enc) {
1469   case CHINESE_GB:
1470   case GBK:
1471   case GB18030:
1472   case ISO_2022_CN:
1473   case HZ_GB_2312:
1474     olp = PackCLDPriorLangWeight(CHINESE, kCLDPriorEncodingWeight);
1475     MergeCLDLangPriorsBoost(olp, langpriors);
1476     break;
1477   case CHINESE_BIG5:
1478   case CHINESE_BIG5_CP950:
1479   case BIG5_HKSCS:
1480     olp = PackCLDPriorLangWeight(CHINESE_T, kCLDPriorEncodingWeight);
1481     MergeCLDLangPriorsBoost(olp, langpriors);
1482     break;
1483   case JAPANESE_EUC_JP:
1484   case JAPANESE_SHIFT_JIS:
1485   case JAPANESE_CP932:
1486   case JAPANESE_JIS:          // ISO-2022-JP
1487     olp = PackCLDPriorLangWeight(JAPANESE, kCLDPriorEncodingWeight);
1488     MergeCLDLangPriorsBoost(olp, langpriors);
1489     break;
1490   case KOREAN_EUC_KR:
1491   case ISO_2022_KR:
1492     olp = PackCLDPriorLangWeight(KOREAN, kCLDPriorEncodingWeight);
1493     MergeCLDLangPriorsBoost(olp, langpriors);
1494     break;
1495 
1496   default:
1497     break;
1498   }
1499 }
1500 
1501 // Add hints to vector of langpriors
1502 // Input is from random source
SetCLDLanguageHint(Language lang,CLDLangPriors * langpriors)1503 void SetCLDLanguageHint(Language lang, CLDLangPriors* langpriors) {
1504   OneCLDLangPrior olp = PackCLDPriorLangWeight(lang, kCLDPriorLanguageWeight);
1505   MergeCLDLangPriorsBoost(olp, langpriors);
1506 }
1507 
1508 
1509 // Make printable string of priors
DumpCLDLangPriors(const CLDLangPriors * langpriors)1510 string DumpCLDLangPriors(const CLDLangPriors* langpriors) {
1511   string retval;
1512   for (int i = 0; i < langpriors->n; ++i) {
1513     char temp[64];
1514     sprintf(temp, "%s.%d ",
1515              LanguageCode(GetCLDPriorLang(langpriors->prior[i])),
1516              GetCLDPriorWeight(langpriors->prior[i]));
1517     retval.append(temp);
1518   }
1519   return retval;
1520 }
1521 
1522 
1523 
1524 
1525 // Look for
1526 //  <html lang="en">
1527 //  <doc xml:lang="en">
1528 //  <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en-US">
1529 //  <meta http-equiv="content-language" content="en-GB" />
1530 //  <meta name="language" content="Srpski">
//  <meta name="DC.language" scheme="RFC1766" content="en">
1532 //  <SPAN id="msg1" class="info" lang='en'>
1533 //
1534 // Do not trigger on
1535 //  <!-- lang=french ...-->
1536 //  <font lang=postscript ...>
1537 //  <link href="index.fr.html" hreflang="fr-FR" xml:lang="fr-FR" />
1538 //  <META name="Author" lang="fr" content="Arnaud Le Hors">
1539 //
1540 // Stop fairly quickly on mismatched quotes
1541 //
1542 // Allowed language characters
1543 //  a-z A-Z -_ , space\t
1544 // Think about: GB2312, big5, shift-jis, euc-jp, ksc euc-kr
1545 //  zh-hans zh-TW cmn-Hani zh_cn.gb18030_CN zh-min-nan zh-yue
1546 //  de-x-mtfrom-en  zh-tw-x-mtfrom-en  (machine translation)
1547 // GB2312 => gb
1548 // Big5 => big
1549 // zh_CN.gb18030_C => zh-cn
1550 //
1551 // Remove duplicates and extra spaces as we go
1552 // Lowercase as we go.
1553 
1554 // Get language tag hints from HTML body
1555 // Normalize: remove spaces and make lowercase comma list
1556 
GetLangTagsFromHtml(const char * utf8_body,int32 utf8_body_len,int32 max_scan_bytes)1557 string GetLangTagsFromHtml(const char* utf8_body, int32 utf8_body_len,
1558                            int32 max_scan_bytes) {
1559   string retval;
1560   if (max_scan_bytes > utf8_body_len) {
1561     max_scan_bytes = utf8_body_len;
1562   }
1563 
1564   int32 k = 0;
1565   while (k < max_scan_bytes) {
1566     int32 start_tag = FindTagStart(utf8_body, k, max_scan_bytes);
1567     if (start_tag < 0) {break;}
1568     int32 end_tag = FindTagEnd(utf8_body, start_tag + 1, max_scan_bytes);
1569     // FindTagEnd exits on < > &
1570     if (end_tag < 0) {break;}
1571 
1572     // Skip <!--...>
1573     // Skip <font ...>
1574     // Skip <script ...>
1575     // Skip <link ...>
1576     // Skip <img ...>
1577     // Skip <a ...>
1578     if (FindAfter(utf8_body, start_tag + 1, end_tag, "!--") ||
1579         FindAfter(utf8_body, start_tag + 1, end_tag, "font ") ||
1580         FindAfter(utf8_body, start_tag + 1, end_tag, "script ") ||
1581         FindAfter(utf8_body, start_tag + 1, end_tag, "link ") ||
1582         FindAfter(utf8_body, start_tag + 1, end_tag, "img ") ||
1583         FindAfter(utf8_body, start_tag + 1, end_tag, "a ")) {
1584       k = end_tag + 1;
1585       continue;
1586     }
1587 
1588     // Remember <meta ...>
1589     bool in_meta = false;
1590     if (FindAfter(utf8_body, start_tag + 1, end_tag, "meta ")) {
1591       in_meta = true;
1592     }
1593 
1594     // Scan for each equal sign inside tag
1595     bool content_is_lang = false;
1596     int32 kk = start_tag + 1;
1597     int32 equal_sign;
1598     while ((equal_sign = FindEqualSign(utf8_body, kk, end_tag)) >= 0) {
1599       // eq exits on < > &
1600 
1601       // Look inside a meta tag
1602       // <meta ... http-equiv="content-language" ...>
1603       // <meta ... name="language" ...>
1604       // <meta ... name="dc.language" ...>
1605       if (in_meta) {
1606         if (FindBefore(utf8_body, kk, equal_sign, " http-equiv") &&
1607             FindAfter(utf8_body, equal_sign + 1, end_tag,
1608                       "content-language ")) {
1609           content_is_lang = true;
1610         } else if (FindBefore(utf8_body, kk, equal_sign, " name") &&
1611                    (FindAfter(utf8_body, equal_sign + 1, end_tag,
1612                               "dc.language ") ||
1613                     FindAfter(utf8_body, equal_sign + 1, end_tag,
1614                               "language "))) {
1615           content_is_lang = true;
1616         }
1617       }
1618 
1619       // Look inside any tag
1620       // <meta ... content="lang-list" ...>
1621       // <... lang="lang-list" ...>
1622       // <... xml:lang="lang-list" ...>
1623       if ((content_is_lang && FindBefore(utf8_body, kk, equal_sign,
1624                                          " content")) ||
1625           FindBefore(utf8_body, kk, equal_sign, " lang") ||
1626           FindBefore(utf8_body, kk, equal_sign, ":lang")) {
1627         string temp = CopyQuotedString(utf8_body, equal_sign + 1, end_tag);
1628 
1629         // Append new lang tag(s) if not a duplicate
1630         if (!temp.empty() && (retval.find(temp) == string::npos)) {
1631           retval.append(temp);
1632         }
1633       }
1634 
1635       kk = equal_sign + 1;
1636     }
1637     k = end_tag + 1;
1638   }
1639 
1640   // Strip last comma
1641   if (retval.size() > 1) {
1642     retval.erase(retval.size() - 1);
1643   }
1644   return retval;
1645 }
1646 
1647 }       // End namespace CLD2
1648 
1649 //==============================================================================
1650