1 /*  GRAPHITE2 LICENSING
2 
3     Copyright 2010, SIL International
4     All rights reserved.
5 
6     This library is free software; you can redistribute it and/or modify
7     it under the terms of the GNU Lesser General Public License as published
8     by the Free Software Foundation; either version 2.1 of License, or
9     (at your option) any later version.
10 
11     This program is distributed in the hope that it will be useful,
12     but WITHOUT ANY WARRANTY; without even the implied warranty of
13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14     Lesser General Public License for more details.
15 
16     You should also have received a copy of the GNU Lesser General Public
17     License along with this library in the file named "LICENSE".
18     If not, write to the Free Software Foundation, 51 Franklin Street,
19     Suite 500, Boston, MA 02110-1335, USA or visit their web page on the
20     internet at http://www.fsf.org/licenses/lgpl.html.
21 
22 Alternatively, the contents of this file may be used under the terms of the
23 Mozilla Public License (http://mozilla.org/MPL) or the GNU General Public
24 License, as published by the Free Software Foundation, either version 2
25 of the License or (at your option) any later version.
26 */
27 #pragma once
28 #include <cstring>
29 #include <cassert>
30 
31 #include "inc/Main.h"
32 
33 
34 namespace graphite2 {
35 
// One row of the Windows-language-ID <-> locale mapping table.
// The field order matters: the table below uses positional aggregate
// initialisation ({ id, "lang", "country" }).
struct IsoLangEntry
{
    unsigned short mnLang;  // Windows language ID (e.g. 0x0409)
    char maLangStr[4];      // NUL-terminated language code, at most 3 letters
    char maCountry[3];      // NUL-terminated country code, at most 2 letters
};
42 
43 // Windows Language ID, Locale ISO-639 language, country code as used in
44 // naming table of OpenType fonts
45 const IsoLangEntry LANG_ENTRIES[] = {
46     { 0x0401, "ar","SA" }, // Arabic Saudi Arabia
47     { 0x0402, "bg","BG" }, // Bulgarian Bulgaria
48     { 0x0403, "ca","ES" }, // Catalan Catalan
49     { 0x0404, "zh","TW" }, // Chinese Taiwan
50     { 0x0405, "cs","CZ" }, // Czech Czech Republic
51     { 0x0406, "da","DK" }, // Danish Denmark
52     { 0x0407, "de","DE" }, // German Germany
53     { 0x0408, "el","GR" }, // Greek Greece
54     { 0x0409, "en","US" }, // English United States
55     { 0x040A, "es","ES" }, // Spanish (Traditional Sort) Spain
56     { 0x040B, "fi","FI" }, // Finnish Finland
57     { 0x040C, "fr","FR" }, // French France
58     { 0x040D, "he","IL" }, // Hebrew Israel
59     { 0x040E, "hu","HU" }, // Hungarian Hungary
60     { 0x040F, "is","IS" }, // Icelandic Iceland
61     { 0x0410, "it","IT" }, // Italian Italy
62     { 0x0411, "jp","JP" }, // Japanese Japan
63     { 0x0412, "ko","KR" }, // Korean Korea
64     { 0x0413, "nl","NL" }, // Dutch Netherlands
65     { 0x0414, "no","NO" }, // Norwegian (Bokmal) Norway
66     { 0x0415, "pl","PL" }, // Polish Poland
67     { 0x0416, "pt","BR" }, // Portuguese Brazil
68     { 0x0417, "rm","CH" }, // Romansh Switzerland
69     { 0x0418, "ro","RO" }, // Romanian Romania
70     { 0x0419, "ru","RU" }, // Russian Russia
71     { 0x041A, "hr","HR" }, // Croatian Croatia
72     { 0x041B, "sk","SK" }, // Slovak Slovakia
73     { 0x041C, "sq","AL" }, // Albanian Albania
74     { 0x041D, "sv","SE" }, // Swedish Sweden
75     { 0x041E, "th","TH" }, // Thai Thailand
76     { 0x041F, "tr","TR" }, // Turkish Turkey
77     { 0x0420, "ur","PK" }, // Urdu Islamic Republic of Pakistan
78     { 0x0421, "id","ID" }, // Indonesian Indonesia
79     { 0x0422, "uk","UA" }, // Ukrainian Ukraine
80     { 0x0423, "be","BY" }, // Belarusian Belarus
81     { 0x0424, "sl","SI" }, // Slovenian Slovenia
82     { 0x0425, "et","EE" }, // Estonian Estonia
83     { 0x0426, "lv","LV" }, // Latvian Latvia
84     { 0x0427, "lt","LT" }, // Lithuanian Lithuania
85     { 0x0428, "tg","TJ" }, // Tajik (Cyrillic) Tajikistan
86     { 0x042A, "vi","VN" }, // Vietnamese Vietnam
87     { 0x042B, "hy","AM" }, // Armenian Armenia
88     { 0x042C, "az","AZ" }, // Azeri (Latin) Azerbaijan
89     { 0x042D, "eu","" }, // Basque Basque
90     { 0x042E, "hsb","DE" }, // Upper Sorbian Germany
91     { 0x042F, "mk","MK" }, // Macedonian (FYROM) Former Yugoslav Republic of Macedonia
92     { 0x0432, "tn","ZA" }, // Setswana South Africa
93     { 0x0434, "xh","ZA" }, // isiXhosa South Africa
94     { 0x0435, "zu","ZA" }, // isiZulu South Africa
95     { 0x0436, "af","ZA" }, // Afrikaans South Africa
96     { 0x0437, "ka","GE" }, // Georgian Georgia
97     { 0x0438, "fo","FO" }, // Faroese Faroe Islands
98     { 0x0439, "hi","IN" }, // Hindi India
99     { 0x043A, "mt","MT" }, // Maltese Malta
100     { 0x043B, "se","NO" }, // Sami (Northern) Norway
101     { 0x043E, "ms","MY" }, // Malay Malaysia
102     { 0x043F, "kk","KZ" }, // Kazakh Kazakhstan
103     { 0x0440, "ky","KG" }, // Kyrgyz Kyrgyzstan
104     { 0x0441, "sw","KE" }, // Kiswahili Kenya
105     { 0x0442, "tk","TM" }, // Turkmen Turkmenistan
106     { 0x0443, "uz","UZ" }, // Uzbek (Latin) Uzbekistan
107     { 0x0444, "tt","RU" }, // Tatar Russia
108     { 0x0445, "bn","IN" }, // Bengali India
109     { 0x0446, "pa","IN" }, // Punjabi India
110     { 0x0447, "gu","IN" }, // Gujarati India
111     { 0x0448, "or","IN" }, // Oriya India
112     { 0x0448, "wo","SN" }, // Wolof Senegal
113     { 0x0449, "ta","IN" }, // Tamil India
114     { 0x044A, "te","IN" }, // Telugu India
115     { 0x044B, "kn","IN" }, // Kannada India
116     { 0x044C, "ml","IN" }, // Malayalam India
117     { 0x044D, "as","IN" }, // Assamese India
118     { 0x044E, "mr","IN" }, // Marathi India
119     { 0x044F, "sa","IN" }, // Sanskrit India
120     { 0x0450, "mn","MN" }, // Mongolian (Cyrillic) Mongolia
121     { 0x0451, "bo","CN" }, // Tibetan PRC
122     { 0x0452, "cy","GB" }, // Welsh United Kingdom
123     { 0x0453, "km","KH" }, // Khmer Cambodia
124     { 0x0454, "lo","LA" }, // Lao Lao P.D.R.
125     { 0x0455, "my","MM" }, // Burmese Myanmar - not listed in Microsoft docs anymore
126     { 0x0456, "gl","ES" }, // Galician Galician
127     { 0x0457, "kok","IN" }, // Konkani India
128     { 0x045A, "syr","TR" }, // Syriac Syria
129     { 0x045B, "si","LK" }, // Sinhala Sri Lanka
130     { 0x045D, "iu","CA" }, // Inuktitut Canada
131     { 0x045E, "am","ET" }, // Amharic Ethiopia
132     { 0x0461, "ne","NP" }, // Nepali Nepal
133     { 0x0462, "fy","NL" }, // Frisian Netherlands
134     { 0x0463, "ps","AF" }, // Pashto Afghanistan
135     { 0x0464, "fil","PH" }, // Filipino Philippines
136     { 0x0465, "dv","MV" }, // Divehi Maldives
137     { 0x0468, "ha","NG" }, // Hausa (Latin) Nigeria
138     { 0x046A, "yo","NG" }, // Yoruba Nigeria
139     { 0x046B, "qu","BO" }, // Quechua Bolivia
140     { 0x046C, "st","ZA" }, // Sesotho sa Leboa South Africa
141     { 0x046D, "ba","RU" }, // Bashkir Russia
142     { 0x046E, "lb","LU" }, // Luxembourgish Luxembourg
143     { 0x046F, "kl","GL" }, // Greenlandic Greenland
144     { 0x0470, "ig","NG" }, // Igbo Nigeria
145     { 0x0478, "ii","CN" }, // Yi PRC
146     { 0x047A, "arn","CL" }, // Mapudungun Chile
147     { 0x047C, "moh","CA" }, // Mohawk Mohawk
148     { 0x047E, "br","FR" }, // Breton France
149     { 0x0480, "ug","CN" }, // Uighur PRC
150     { 0x0481, "mi","NZ" }, // Maori New Zealand
151     { 0x0482, "oc","FR" }, // Occitan France
152     { 0x0483, "co","FR" }, // Corsican France
153     { 0x0484, "gsw","FR" }, // Alsatian France
154     { 0x0485, "sah","RU" }, // Yakut Russia
155     { 0x0486, "qut","GT" }, // K'iche Guatemala
156     { 0x0487, "rw","RW" }, // Kinyarwanda Rwanda
157     { 0x048C, "gbz","AF" }, // Dari Afghanistan
158     { 0x0801, "ar","IQ" }, // Arabic Iraq
159     { 0x0804, "zn","CH" }, // Chinese People's Republic of China
160     { 0x0807, "de","CH" }, // German Switzerland
161     { 0x0809, "en","GB" }, // English United Kingdom
162     { 0x080A, "es","MX" }, // Spanish Mexico
163     { 0x080C, "fr","BE" }, // French Belgium
164     { 0x0810, "it","CH" }, // Italian Switzerland
165     { 0x0813, "nl","BE" }, // Dutch Belgium
166     { 0x0814, "nn","NO" }, // Norwegian (Nynorsk) Norway
167     { 0x0816, "pt","PT" }, // Portuguese Portugal
168     { 0x081A, "sh","RS" }, // Serbian (Latin) Serbia
169     { 0x081D, "sv","FI" }, // Sweden Finland
170     { 0x082C, "az","AZ" }, // Azeri (Cyrillic) Azerbaijan
171     { 0x082E, "dsb","DE" }, // Lower Sorbian Germany
172     { 0x083B, "se","SE" }, // Sami (Northern) Sweden
173     { 0x083C, "ga","IE" }, // Irish Ireland
174     { 0x083E, "ms","BN" }, // Malay Brunei Darussalam
175     { 0x0843, "uz","UZ" }, // Uzbek (Cyrillic) Uzbekistan
176     { 0x0845, "bn","BD" }, // Bengali Bangladesh
177     { 0x0850, "mn","MN" }, // Mongolian (Traditional) People's Republic of China
178     { 0x085D, "iu","CA" }, // Inuktitut (Latin) Canada
179     { 0x085F, "ber","DZ" }, // Tamazight (Latin) Algeria
180     { 0x086B, "es","EC" }, // Quechua Ecuador
181     { 0x0C01, "ar","EG" }, // Arabic Egypt
182     { 0x0C04, "zh","HK" }, // Chinese Hong Kong S.A.R.
183     { 0x0C07, "de","AT" }, // German Austria
184     { 0x0C09, "en","AU" }, // English Australia
185     { 0x0C0A, "es","ES" }, // Spanish (Modern Sort) Spain
186     { 0x0C0C, "fr","CA" }, // French Canada
187     { 0x0C1A, "sr","CS" }, // Serbian (Cyrillic) Serbia
188     { 0x0C3B, "se","FI" }, // Sami (Northern) Finland
189     { 0x0C6B, "qu","PE" }, // Quechua Peru
190     { 0x1001, "ar","LY" }, // Arabic Libya
191     { 0x1004, "zh","SG" }, // Chinese Singapore
192     { 0x1007, "de","LU" }, // German Luxembourg
193     { 0x1009, "en","CA" }, // English Canada
194     { 0x100A, "es","GT" }, // Spanish Guatemala
195     { 0x100C, "fr","CH" }, // French Switzerland
196     { 0x101A, "hr","BA" }, // Croatian (Latin) Bosnia and Herzegovina
197     { 0x103B, "smj","NO" }, // Sami (Lule) Norway
198     { 0x1401, "ar","DZ" }, // Arabic Algeria
199     { 0x1404, "zh","MO" }, // Chinese Macao S.A.R.
200     { 0x1407, "de","LI" }, // German Liechtenstein
201     { 0x1409, "en","NZ" }, // English New Zealand
202     { 0x140A, "es","CR" }, // Spanish Costa Rica
203     { 0x140C, "fr","LU" }, // French Luxembourg
204     { 0x141A, "bs","BA" }, // Bosnian (Latin) Bosnia and Herzegovina
205     { 0x143B, "smj","SE" }, // Sami (Lule) Sweden
206     { 0x1801, "ar","MA" }, // Arabic Morocco
207     { 0x1809, "en","IE" }, // English Ireland
208     { 0x180A, "es","PA" }, // Spanish Panama
209     { 0x180C, "fr","MC" }, // French Principality of Monoco
210     { 0x181A, "sh","BA" }, // Serbian (Latin) Bosnia and Herzegovina
211     { 0x183B, "sma","NO" }, // Sami (Southern) Norway
212     { 0x1C01, "ar","TN" }, // Arabic Tunisia
213     { 0x1C09, "en","ZA" }, // English South Africa
214     { 0x1C0A, "es","DO" }, // Spanish Dominican Republic
215     { 0x1C1A, "sr","BA" }, // Serbian (Cyrillic) Bosnia and Herzegovina
216     { 0x1C3B, "sma","SE" }, // Sami (Southern) Sweden
217     { 0x2001, "ar","OM" }, // Arabic Oman
218     { 0x2009, "en","JM" }, // English Jamaica
219     { 0x200A, "es","VE" }, // Spanish Venezuela
220     { 0x201A, "bs","BA" }, // Bosnian (Cyrillic) Bosnia and Herzegovina
221     { 0x203B, "sms","FI" }, // Sami (Skolt) Finland
222     { 0x2401, "ar","YE" }, // Arabic Yemen
223     { 0x2409, "en","BS" }, // English Caribbean
224     { 0x240A, "es","CO" }, // Spanish Colombia
225     { 0x243B, "smn","FI" }, // Sami (Inari) Finland
226     { 0x2801, "ar","SY" }, // Arabic Syria
227     { 0x2809, "en","BZ" }, // English Belize
228     { 0x280A, "es","PE" }, // Spanish Peru
229     { 0x2C01, "ar","JO" }, // Arabic Jordan
230     { 0x2C09, "en","TT" }, // English Trinidad and Tobago
231     { 0x2C0A, "es","AR" }, // Spanish Argentina
232     { 0x3001, "ar","LB" }, // Arabic Lebanon
233     { 0x3009, "en","ZW" }, // English Zimbabwe
234     { 0x300A, "es","EC" }, // Spanish Ecuador
235     { 0x3401, "ar","KW" }, // Arabic Kuwait
236     { 0x3409, "en","PH" }, // English Republic of the Philippines
237     { 0x340A, "es","CL" }, // Spanish Chile
238     { 0x3801, "ar","AE" }, // Arabic U.A.E.
239     { 0x380A, "es","UY" }, // Spanish Uruguay
240     { 0x3C01, "ar","BH" }, // Arabic Bahrain
241     { 0x3C0A, "es","PY" }, // Spanish Paraguay
242     { 0x4001, "ar","QA" }, // Arabic Qatar
243     { 0x4009, "en","IN" }, // English India
244     { 0x400A, "es","BO" }, // Spanish Bolivia
245     { 0x4409, "en","MY" }, // English Malaysia
246     { 0x440A, "es","SV" }, // Spanish El Salvador
247     { 0x4809, "en","SG" }, // English Singapore
248     { 0x480A, "es","HN" }, // Spanish Honduras
249     { 0x4C0A, "es","NI" }, // Spanish Nicaragua
250     { 0x500A, "es","PR" }, // Spanish Puerto Rico
251     { 0x540A, "es","US" } // Spanish United States
252 };
253 
254 class Locale2Lang
255 {
256     Locale2Lang(const Locale2Lang &);
257     Locale2Lang & operator = (const Locale2Lang &);
258 
259 public:
Locale2Lang()260     Locale2Lang() : mSeedPosition(128)
261     {
262         memset((void*)mLangLookup, 0, sizeof(mLangLookup));
263         // create a tri lookup on first 2 letters of language code
264         static const int maxIndex = sizeof(LANG_ENTRIES)/sizeof(IsoLangEntry);
265         for (int i = 0; i < maxIndex; i++)
266         {
267             size_t a = LANG_ENTRIES[i].maLangStr[0] - 'a';
268             size_t b = LANG_ENTRIES[i].maLangStr[1] - 'a';
269             if (mLangLookup[a][b])
270             {
271                 const IsoLangEntry ** old = mLangLookup[a][b];
272                 int len = 1;
273                 while (old[len]) len++;
274                 len += 2;
275                 mLangLookup[a][b] = gralloc<const IsoLangEntry *>(len);
276                 if (!mLangLookup[a][b])
277                 {
278                     mLangLookup[a][b] = old;
279                     continue;
280                 }
281                 mLangLookup[a][b][--len] = NULL;
282                 mLangLookup[a][b][--len] = &LANG_ENTRIES[i];
283                 while (--len >= 0)
284                 {
285                     assert(len >= 0);
286                     mLangLookup[a][b][len] = old[len];
287                 }
288                 free(old);
289             }
290             else
291             {
292                 mLangLookup[a][b] = gralloc<const IsoLangEntry *>(2);
293                 if (!mLangLookup[a][b]) continue;
294                 mLangLookup[a][b][1] = NULL;
295                 mLangLookup[a][b][0] = &LANG_ENTRIES[i];
296             }
297         }
298         while (2 * mSeedPosition < maxIndex)
299             mSeedPosition *= 2;
300     };
~Locale2Lang()301     ~Locale2Lang()
302     {
303         for (int i = 0; i != 26; ++i)
304             for (int j = 0; j != 26; ++j)
305                 free(mLangLookup[i][j]);
306     }
getMsId(const char * locale)307     unsigned short getMsId(const char * locale) const
308     {
309         size_t length = strlen(locale);
310         size_t langLength = length;
311         const char * language = locale;
312         const char * script = NULL;
313         const char * region = NULL;
314         size_t regionLength = 0;
315         const char * dash = strchr(locale, '-');
316         if (dash && (dash != locale))
317         {
318             langLength = (dash - locale);
319             size_t nextPartLength = length - langLength - 1;
320             if (nextPartLength >= 2)
321             {
322                 script = ++dash;
323                 dash = strchr(dash, '-');
324                 if (dash)
325                 {
326                     nextPartLength = (dash - script);
327                     region = ++dash;
328                 }
329                 if (nextPartLength == 2 &&
330                     (locale[langLength+1] > 0x40) && (locale[langLength+1] < 0x5B) &&
331                     (locale[langLength+2] > 0x40) && (locale[langLength+2] < 0x5B))
332                 {
333                     region = script;
334                     regionLength = nextPartLength;
335                     script = NULL;
336                 }
337                 else if (nextPartLength == 4)
338                 {
339                     if (dash)
340                     {
341                         dash = strchr(dash, '-');
342                         if (dash)
343                         {
344                             nextPartLength = (dash - region);
345                         }
346                         else
347                         {
348                             nextPartLength = langLength - (region - locale);
349                         }
350                         regionLength = nextPartLength;
351                     }
352                 }
353             }
354         }
355         size_t a = 'e' - 'a';
356         size_t b = 'n' - 'a';
357         unsigned short langId = 0;
358         int i = 0;
359         switch (langLength)
360         {
361             case 2:
362             {
363                 a = language[0] - 'a';
364                 b = language[1] - 'a';
365                 if ((a < 26) && (b < 26) && mLangLookup[a][b])
366                 {
367                     while (mLangLookup[a][b][i])
368                     {
369                         if (mLangLookup[a][b][i]->maLangStr[2] != '\0')
370                         {
371                             ++i;
372                             continue;
373                         }
374                         if (region && (strncmp(mLangLookup[a][b][i]->maCountry, region, regionLength) == 0))
375                         {
376                             langId = mLangLookup[a][b][i]->mnLang;
377                             break;
378                         }
379                         else if (langId == 0)
380                         {
381                             // possible fallback code
382                             langId = mLangLookup[a][b][i]->mnLang;
383                         }
384                         ++i;
385                     }
386                 }
387             }
388             break;
389             case 3:
390             {
391                 a = language[0] - 'a';
392                 b = language[1] - 'a';
393                 if (mLangLookup[a][b])
394                 {
395                     while (mLangLookup[a][b][i])
396                     {
397                         if (mLangLookup[a][b][i]->maLangStr[2] != language[2])
398                         {
399                             ++i;
400                             continue;
401                         }
402                         if (region && (strncmp(mLangLookup[a][b][i]->maCountry, region, regionLength) == 0))
403                         {
404                             langId = mLangLookup[a][b][i]->mnLang;
405                             break;
406                         }
407                         else if (langId == 0)
408                         {
409                             // possible fallback code
410                             langId = mLangLookup[a][b][i]->mnLang;
411                         }
412                         ++i;
413                     }
414                 }
415             }
416             break;
417             default:
418                 break;
419         }
420         if (langId == 0) langId = 0x409;
421         return langId;
422     }
findEntryById(unsigned short langId)423     const IsoLangEntry * findEntryById(unsigned short langId) const
424     {
425         static const int maxIndex = sizeof(LANG_ENTRIES)/sizeof(IsoLangEntry);
426         int window = mSeedPosition;
427         int guess = mSeedPosition - 1;
428         while (LANG_ENTRIES[guess].mnLang != langId)
429         {
430             window /= 2;
431             if (window == 0) return NULL;
432             guess += (LANG_ENTRIES[guess].mnLang > langId)? -window : window;
433             while (guess >= maxIndex)
434             {
435                 window /= 2;
436                 guess -= window;
437                 assert(window);
438             }
439         }
440         return &LANG_ENTRIES[guess];
441     }
442 
443     CLASS_NEW_DELETE;
444 
445 private:
446     const IsoLangEntry ** mLangLookup[26][26];
447     int mSeedPosition;
448 };
449 
450 } // namespace graphite2
451