// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 **********************************************************************
 *   Copyright (C) 1996-2016, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 *
 * Provides functionality for mapping between
 * LCID and Posix IDs or ICU locale to codepage
 *
 * Note: All classes and code in this file are
 *       intended for internal use only.
 *
 * Methods of interest:
 *   unsigned long convertToLCID(const char*);
 *   const char* convertToPosix(unsigned long);
 *
 * Kathleen Wilson, 4/30/96
 *
 *  Date        Name        Description
 *  3/11/97     aliu        Fixed off-by-one bug in assignment operator. Added
 *                          setId() method and safety check against
 *                          MAX_ID_LENGTH.
 * 04/23/99     stephen     Added C wrapper for convertToPosix.
 * 09/18/00     george      Removed the memory leaks.
 * 08/23/01     george      Convert to C
 */
29 
30 #include "locmap.h"
31 #include "cstring.h"
32 #include "cmemory.h"
33 #include "unicode/uloc.h"
34 
35 #if U_PLATFORM_HAS_WIN32_API && UCONFIG_USE_WINDOWS_LCID_MAPPING_API
36 #include <windows.h>
37 #include <winnls.h> // LCIDToLocaleName and LocaleNameToLCID
38 #endif
39 
/*
 * Note:
 * The mapping from Win32 locale ID numbers to POSIX locale strings should
 * be the faster one.
 *
 * Windows LCIDs are defined at https://msdn.microsoft.com/en-us/library/cc233965.aspx
 * [MS-LCID] Windows Language Code Identifier (LCID) Reference
 */
48 
/*
////////////////////////////////////////////////
//
// Internal Classes for LCID <--> POSIX Mapping
//
/////////////////////////////////////////////////
*/
56 
/**
 * One row of an LCID <--> POSIX table: a host (Windows) locale identifier
 * paired with the POSIX locale ID string it corresponds to.
 */
typedef struct ILcidPosixElement
{
    const uint32_t hostID;          /* LCID in host format, such as 0x044d */
    const char * const posixID;     /* POSIX ID, such as "de" or "de_CH" */
} ILcidPosixElement;
62 
/**
 * A table of ILcidPosixElement rows together with its length.
 * ILCID_POSIX_MAP builds one of these from a locmap_<id> array.
 */
typedef struct ILcidPosixMap
{
    const uint32_t numRegions;                          /* number of rows in regionMaps */
    const struct ILcidPosixElement* const regionMaps;   /* first row of the table */
} ILcidPosixMap;
68 
69 
/*
/////////////////////////////////////////////////
//
// Easy macros to make the LCID <--> POSIX Mapping
//
/////////////////////////////////////////////////
*/
77 
/**
 * The standard one language/one country mapping for LCID.
 * Defines a two-row table named locmap_<languageID>. The first element must
 * be the language (LANGUAGE_LCID(hostID) paired with the string "languageID"),
 * and the following element is the language with the country (hostID paired
 * with the string "posixID").
 *
 * The expansion already ends with a semicolon, so invocations are written
 * without one. LANGUAGE_LCID is not defined in this part of the file;
 * presumably it reduces hostID to its language-only part -- confirm in locmap.h.
 *
 * @param hostID LCID in host format such as 0x044d
 * @param languageID posix ID of just the language such as 'de'
 * @param posixID posix ID of the language_TERRITORY such as 'de_CH'
 */
#define ILCID_POSIX_ELEMENT_ARRAY(hostID, languageID, posixID) \
static const ILcidPosixElement locmap_ ## languageID [] = { \
    {LANGUAGE_LCID(hostID), #languageID},     /* parent locale */ \
    {hostID, #posixID}, \
};
91 
/**
 * Define a subtable by ID.
 * Expands to the declaration of the array locmap_<id> up to and including
 * the '=', so each invocation must be followed by a brace-enclosed list of
 * {hostID, posixID} rows and a terminating semicolon.
 * @param id the POSIX ID, either a language or language_TERRITORY
 */
#define ILCID_POSIX_SUBTABLE(id) \
static const ILcidPosixElement locmap_ ## id [] =
98 
99 
/**
 * Create the map for the posixID. This macro supposes that the language string
 * name is the same as the global variable name, and that the first element
 * in the ILcidPosixElement is just the language.
 * Expands to an ILcidPosixMap initializer: the length of locmap_<_posixID>
 * followed by the array itself.
 * @param _posixID the full POSIX ID for this entry.
 */
#define ILCID_POSIX_MAP(_posixID) \
    {UPRV_LENGTHOF(locmap_ ## _posixID), locmap_ ## _posixID}
108 
/*
////////////////////////////////////////////
//
// Create the table of LCID to POSIX Mapping
// None of it should be dynamically created.
//
// Keep static locale variables inside the function so that
// they can be created properly during static init.
//
// Note: This table should be updated periodically. Check the [MS-LCID] Windows Language Code Identifier
//       (LCID) Reference defined at https://msdn.microsoft.com/en-us/library/cc233965.aspx
//
//       Microsoft is moving away from LCID in favor of locale name as of Vista.  This table needs to be
//       maintained for support of older Windows versions.
//       Update: Windows 7 (091130)
//
// Note: Microsoft assigns a different LCID if a locale has a sorting variant. POSIX IDs below may contain
//       @collation=XXX, but no other keywords are allowed (at least for now). When uprv_convertToLCID() is
//       called from uloc_getLCID(), keywords other than collation are already removed. If we really need
//       to support other keywords in this mapping data, we must update the implementation.
////////////////////////////////////////////
*/
131 
// TODO: For Windows, ideally this table would be a list of exceptions rather than a complete list, as
// LocaleNameToLCID and LCIDToLocaleName provide 90% of these.
134 
135 ILCID_POSIX_ELEMENT_ARRAY(0x0436, af, af_ZA)
136 
ILCID_POSIX_SUBTABLE(ar)137 ILCID_POSIX_SUBTABLE(ar) {
138     {0x01,   "ar"},
139     {0x3801, "ar_AE"},
140     {0x3c01, "ar_BH"},
141     {0x1401, "ar_DZ"},
142     {0x0c01, "ar_EG"},
143     {0x0801, "ar_IQ"},
144     {0x2c01, "ar_JO"},
145     {0x3401, "ar_KW"},
146     {0x3001, "ar_LB"},
147     {0x1001, "ar_LY"},
148     {0x1801, "ar_MA"},
149     {0x1801, "ar_MO"},
150     {0x2001, "ar_OM"},
151     {0x4001, "ar_QA"},
152     {0x0401, "ar_SA"},
153     {0x2801, "ar_SY"},
154     {0x1c01, "ar_TN"},
155     {0x2401, "ar_YE"}
156 };
157 
158 ILCID_POSIX_ELEMENT_ARRAY(0x044d, as, as_IN)
159 ILCID_POSIX_ELEMENT_ARRAY(0x045e, am, am_ET)
160 ILCID_POSIX_ELEMENT_ARRAY(0x047a, arn,arn_CL)
161 
ILCID_POSIX_SUBTABLE(az)162 ILCID_POSIX_SUBTABLE(az) {
163     {0x2c,   "az"},
164     {0x082c, "az_Cyrl_AZ"},  /* Cyrillic based */
165     {0x742c, "az_Cyrl"},  /* Cyrillic based */
166     {0x042c, "az_Latn_AZ"}, /* Latin based */
167     {0x782c, "az_Latn"}, /* Latin based */
168     {0x042c, "az_AZ"} /* Latin based */
169 };
170 
171 ILCID_POSIX_ELEMENT_ARRAY(0x046d, ba, ba_RU)
172 ILCID_POSIX_ELEMENT_ARRAY(0x0423, be, be_BY)
173 
174 /*ILCID_POSIX_SUBTABLE(ber) {
175     {0x5f,   "ber"},
176     {0x045f, "ber_Arab_DZ"},
177     {0x045f, "ber_Arab"},
178     {0x085f, "ber_Latn_DZ"},
179     {0x085f, "ber_Latn"}
180 };*/
181 
182 ILCID_POSIX_ELEMENT_ARRAY(0x0402, bg, bg_BG)
183 
ILCID_POSIX_SUBTABLE(bin)184 ILCID_POSIX_SUBTABLE(bin) {
185     {0x66, "bin"},
186     {0x0466, "bin_NG"}
187 };
188 
ILCID_POSIX_SUBTABLE(bn)189 ILCID_POSIX_SUBTABLE(bn) {
190     {0x45,   "bn"},
191     {0x0845, "bn_BD"},
192     {0x0445, "bn_IN"}
193 };
194 
ILCID_POSIX_SUBTABLE(bo)195 ILCID_POSIX_SUBTABLE(bo) {
196     {0x51,   "bo"},
197     {0x0851, "bo_BT"},
198     {0x0451, "bo_CN"},
199     {0x0c51, "dz_BT"}
200 };
201 
202 ILCID_POSIX_ELEMENT_ARRAY(0x047e, br, br_FR)
203 
ILCID_POSIX_SUBTABLE(ca)204 ILCID_POSIX_SUBTABLE(ca) {
205     {0x03,   "ca"},
206     {0x0403, "ca_ES"},
207     {0x0803, "ca_ES_VALENCIA"}
208 };
209 
210 ILCID_POSIX_ELEMENT_ARRAY(0x0483, co, co_FR)
211 
ILCID_POSIX_SUBTABLE(chr)212 ILCID_POSIX_SUBTABLE(chr) {
213     {0x05c,  "chr"},
214     {0x7c5c, "chr_Cher"},
215     {0x045c, "chr_Cher_US"},
216     {0x045c, "chr_US"}
217 };
218 
219 // ICU has chosen different names for these.
ILCID_POSIX_SUBTABLE(ckb)220 ILCID_POSIX_SUBTABLE(ckb) {
221     {0x92,   "ckb"},
222     {0x7c92, "ckb_Arab"},
223     {0x0492, "ckb_Arab_IQ"}
224 };
225 
226 /* Declared as cs_CZ to get around compiler errors on z/OS, which defines cs as a function */
227 ILCID_POSIX_ELEMENT_ARRAY(0x0405, cs, cs_CZ)
228 
229 ILCID_POSIX_ELEMENT_ARRAY(0x0452, cy, cy_GB)
230 ILCID_POSIX_ELEMENT_ARRAY(0x0406, da, da_DK)
231 
232 // Windows doesn't know POSIX or BCP47 Unicode phonebook sort names
ILCID_POSIX_SUBTABLE(de)233 ILCID_POSIX_SUBTABLE(de) {
234     {0x07,   "de"},
235     {0x0c07, "de_AT"},
236     {0x0807, "de_CH"},
237     {0x0407, "de_DE"},
238     {0x1407, "de_LI"},
239     {0x1007, "de_LU"},
240     {0x10407,"de_DE@collation=phonebook"},  /*This is really de_DE_PHONEBOOK on Windows*/
241     {0x10407,"de@collation=phonebook"}  /*This is really de_DE_PHONEBOOK on Windows*/
242 };
243 
244 ILCID_POSIX_ELEMENT_ARRAY(0x0465, dv, dv_MV)
245 ILCID_POSIX_ELEMENT_ARRAY(0x0408, el, el_GR)
246 
247 // Windows uses an empty string for 'invariant'
ILCID_POSIX_SUBTABLE(en)248 ILCID_POSIX_SUBTABLE(en) {
249     {0x09,   "en"},
250     {0x0c09, "en_AU"},
251     {0x2809, "en_BZ"},
252     {0x1009, "en_CA"},
253     {0x0809, "en_GB"},
254     {0x3c09, "en_HK"},
255     {0x3809, "en_ID"},
256     {0x1809, "en_IE"},
257     {0x4009, "en_IN"},
258     {0x2009, "en_JM"},
259     {0x4409, "en_MY"},
260     {0x1409, "en_NZ"},
261     {0x3409, "en_PH"},
262     {0x4809, "en_SG"},
263     {0x2C09, "en_TT"},
264     {0x0409, "en_US"},
265     {0x007f, "en_US_POSIX"}, /* duplicate for round-tripping */
266     {0x2409, "en_029"},
267     {0x1c09, "en_ZA"},
268     {0x3009, "en_ZW"},
269     {0x2409, "en_VI"},  /* Virgin Islands AKA Caribbean Islands (en_CB). On Windows8+ This is 0x1000 or dynamically assigned */
270     {0x0409, "en_AS"},  /* Alias for en_US. Leave last.  On Windows8+ This is 0x1000 or dynamically assigned */
271     {0x0409, "en_GU"},  /* Alias for en_US. Leave last.  On Windows8+ This is 0x1000 or dynamically assigned */
272     {0x0409, "en_MH"},  /* Alias for en_US. Leave last.  On Windows8+ This is 0x1000 or dynamically assigned */
273     {0x0409, "en_MP"},  /* Alias for en_US. Leave last.  On Windows8+ This is 0x1000 or dynamically assigned */
274     {0x0409, "en_UM"}   /* Alias for en_US. Leave last.  On Windows8+ This is 0x1000 or dynamically assigned */
275 };
276 
ILCID_POSIX_SUBTABLE(en_US_POSIX)277 ILCID_POSIX_SUBTABLE(en_US_POSIX) {
278     {0x007f, "en_US_POSIX"} /* duplicate for roundtripping */
279 };
280 
281 // Windows doesn't know POSIX or BCP47 Unicode traditional sort names
ILCID_POSIX_SUBTABLE(es)282 ILCID_POSIX_SUBTABLE(es) {
283     {0x0a,   "es"},
284     {0x2c0a, "es_AR"},
285     {0x400a, "es_BO"},
286     {0x340a, "es_CL"},
287     {0x240a, "es_CO"},
288     {0x140a, "es_CR"},
289     {0x5c0a, "es_CU"},
290     {0x1c0a, "es_DO"},
291     {0x300a, "es_EC"},
292     {0x0c0a, "es_ES"},      /*Modern sort.*/
293     {0x100a, "es_GT"},
294     {0x480a, "es_HN"},
295     {0x080a, "es_MX"},
296     {0x4c0a, "es_NI"},
297     {0x180a, "es_PA"},
298     {0x280a, "es_PE"},
299     {0x500a, "es_PR"},
300     {0x3c0a, "es_PY"},
301     {0x440a, "es_SV"},
302     {0x540a, "es_US"},
303     {0x380a, "es_UY"},
304     {0x200a, "es_VE"},
305     {0x580a, "es_419"},
306     {0x040a, "es_ES@collation=traditional"},
307     {0x040a, "es@collation=traditional"}        // Windows will treat this as es-ES@collation=traditional
308 };
309 
310 ILCID_POSIX_ELEMENT_ARRAY(0x0425, et, et_EE)
311 ILCID_POSIX_ELEMENT_ARRAY(0x042d, eu, eu_ES)
312 
313 /* ISO-639 doesn't distinguish between Persian and Dari.*/
ILCID_POSIX_SUBTABLE(fa)314 ILCID_POSIX_SUBTABLE(fa) {
315     {0x29,   "fa"},
316     {0x0429, "fa_IR"},  /* Persian/Farsi (Iran) */
317     {0x048c, "fa_AF"}   /* Persian/Dari (Afghanistan) */
318 };
319 
320 
321 /* duplicate for roundtripping */
ILCID_POSIX_SUBTABLE(fa_AF)322 ILCID_POSIX_SUBTABLE(fa_AF) {
323     {0x8c,   "fa_AF"},  /* Persian/Dari (Afghanistan) */
324     {0x048c, "fa_AF"}   /* Persian/Dari (Afghanistan) */
325 };
326 
ILCID_POSIX_SUBTABLE(ff)327 ILCID_POSIX_SUBTABLE(ff) {
328     {0x67,   "ff"},
329     {0x7c67, "ff_Latn"},
330     {0x0867, "ff_Latn_SN"},
331     {0x0467, "ff_NG"}
332 };
333 
334 ILCID_POSIX_ELEMENT_ARRAY(0x040b, fi, fi_FI)
335 ILCID_POSIX_ELEMENT_ARRAY(0x0464, fil,fil_PH)
336 ILCID_POSIX_ELEMENT_ARRAY(0x0438, fo, fo_FO)
337 
ILCID_POSIX_SUBTABLE(fr)338 ILCID_POSIX_SUBTABLE(fr) {
339     {0x0c,   "fr"},
340     {0x080c, "fr_BE"},
341     {0x0c0c, "fr_CA"},
342     {0x240c, "fr_CD"},
343     {0x240c, "fr_CG"},
344     {0x100c, "fr_CH"},
345     {0x300c, "fr_CI"},
346     {0x2c0c, "fr_CM"},
347     {0x040c, "fr_FR"},
348     {0x3c0c, "fr_HT"},
349     {0x140c, "fr_LU"},
350     {0x380c, "fr_MA"},
351     {0x180c, "fr_MC"},
352     {0x340c, "fr_ML"},
353     {0x200c, "fr_RE"},
354     {0x280c, "fr_SN"},
355     {0xe40c, "fr_015"},
356     {0x1c0c, "fr_029"}
357 };
358 
359 ILCID_POSIX_ELEMENT_ARRAY(0x0467, fuv, fuv_NG)
360 
361 ILCID_POSIX_ELEMENT_ARRAY(0x0462, fy, fy_NL)
362 
ILCID_POSIX_SUBTABLE(ga)363 ILCID_POSIX_SUBTABLE(ga) { /* Gaelic (Ireland) */
364     {0x3c,   "ga"},
365     {0x083c, "ga_IE"},
366     {0x043c, "gd_GB"}
367 };
368 
ILCID_POSIX_SUBTABLE(gd)369 ILCID_POSIX_SUBTABLE(gd) { /* Gaelic (Scotland) */
370     {0x91,   "gd"},
371     {0x0491, "gd_GB"}
372 };
373 
374 ILCID_POSIX_ELEMENT_ARRAY(0x0456, gl, gl_ES)
375 ILCID_POSIX_ELEMENT_ARRAY(0x0447, gu, gu_IN)
376 ILCID_POSIX_ELEMENT_ARRAY(0x0474, gn, gn_PY)
377 ILCID_POSIX_ELEMENT_ARRAY(0x0484, gsw,gsw_FR)
378 
ILCID_POSIX_SUBTABLE(ha)379 ILCID_POSIX_SUBTABLE(ha) {
380     {0x68,   "ha"},
381     {0x7c68, "ha_Latn"},
382     {0x0468, "ha_Latn_NG"},
383 };
384 
385 ILCID_POSIX_ELEMENT_ARRAY(0x0475, haw,haw_US)
386 ILCID_POSIX_ELEMENT_ARRAY(0x040d, he, he_IL)
387 ILCID_POSIX_ELEMENT_ARRAY(0x0439, hi, hi_IN)
388 
389 /* This LCID is really four different locales.*/
ILCID_POSIX_SUBTABLE(hr)390 ILCID_POSIX_SUBTABLE(hr) {
391     {0x1a,   "hr"},
392     {0x141a, "bs_Latn_BA"},  /* Bosnian, Bosnia and Herzegovina */
393     {0x681a, "bs_Latn"},  /* Bosnian, Bosnia and Herzegovina */
394     {0x141a, "bs_BA"},  /* Bosnian, Bosnia and Herzegovina */
395     {0x781a, "bs"},     /* Bosnian */
396     {0x201a, "bs_Cyrl_BA"},  /* Bosnian, Bosnia and Herzegovina */
397     {0x641a, "bs_Cyrl"},  /* Bosnian, Bosnia and Herzegovina */
398     {0x101a, "hr_BA"},  /* Croatian in Bosnia */
399     {0x041a, "hr_HR"},  /* Croatian*/
400     {0x2c1a, "sr_Latn_ME"},
401     {0x241a, "sr_Latn_RS"},
402     {0x181a, "sr_Latn_BA"}, /* Serbo-Croatian in Bosnia */
403     {0x081a, "sr_Latn_CS"}, /* Serbo-Croatian*/
404     {0x701a, "sr_Latn"},    /* It's 0x1a or 0x081a, pick one to make the test program happy. */
405     {0x1c1a, "sr_Cyrl_BA"}, /* Serbo-Croatian in Bosnia */
406     {0x0c1a, "sr_Cyrl_CS"}, /* Serbian*/
407     {0x301a, "sr_Cyrl_ME"},
408     {0x281a, "sr_Cyrl_RS"},
409     {0x6c1a, "sr_Cyrl"},    /* It's 0x1a or 0x0c1a, pick one to make the test program happy. */
410     {0x7c1a, "sr"}          /* In CLDR sr is sr_Cyrl. */
411 };
412 
ILCID_POSIX_SUBTABLE(hsb)413 ILCID_POSIX_SUBTABLE(hsb) {
414     {0x2E,   "hsb"},
415     {0x042E, "hsb_DE"},
416     {0x082E, "dsb_DE"},
417     {0x7C2E, "dsb"},
418 };
419 
420 ILCID_POSIX_ELEMENT_ARRAY(0x040e, hu, hu_HU)
421 ILCID_POSIX_ELEMENT_ARRAY(0x042b, hy, hy_AM)
422 
ILCID_POSIX_SUBTABLE(ibb)423 ILCID_POSIX_SUBTABLE(ibb) {
424     {0x69, "ibb"},
425     {0x0469, "ibb_NG"}
426 };
427 
428 ILCID_POSIX_ELEMENT_ARRAY(0x0421, id, id_ID)
429 ILCID_POSIX_ELEMENT_ARRAY(0x0470, ig, ig_NG)
430 ILCID_POSIX_ELEMENT_ARRAY(0x0478, ii, ii_CN)
431 ILCID_POSIX_ELEMENT_ARRAY(0x040f, is, is_IS)
432 
ILCID_POSIX_SUBTABLE(it)433 ILCID_POSIX_SUBTABLE(it) {
434     {0x10,   "it"},
435     {0x0810, "it_CH"},
436     {0x0410, "it_IT"}
437 };
438 
ILCID_POSIX_SUBTABLE(iu)439 ILCID_POSIX_SUBTABLE(iu) {
440     {0x5d,   "iu"},
441     {0x045d, "iu_Cans_CA"},
442     {0x785d, "iu_Cans"},
443     {0x085d, "iu_Latn_CA"},
444     {0x7c5d, "iu_Latn"}
445 };
446 
447 ILCID_POSIX_ELEMENT_ARRAY(0x040d, iw, iw_IL)    /*Left in for compatibility*/
448 ILCID_POSIX_ELEMENT_ARRAY(0x0411, ja, ja_JP)
449 ILCID_POSIX_ELEMENT_ARRAY(0x0437, ka, ka_GE)
450 ILCID_POSIX_ELEMENT_ARRAY(0x043f, kk, kk_KZ)
451 ILCID_POSIX_ELEMENT_ARRAY(0x046f, kl, kl_GL)
452 ILCID_POSIX_ELEMENT_ARRAY(0x0453, km, km_KH)
453 ILCID_POSIX_ELEMENT_ARRAY(0x044b, kn, kn_IN)
454 
ILCID_POSIX_SUBTABLE(ko)455 ILCID_POSIX_SUBTABLE(ko) {
456     {0x12,   "ko"},
457     {0x0812, "ko_KP"},
458     {0x0412, "ko_KR"}
459 };
460 
461 ILCID_POSIX_ELEMENT_ARRAY(0x0457, kok, kok_IN)
462 ILCID_POSIX_ELEMENT_ARRAY(0x0471, kr,  kr_NG)
463 
ILCID_POSIX_SUBTABLE(ks)464 ILCID_POSIX_SUBTABLE(ks) {         /* We could add PK and CN too */
465     {0x60,   "ks"},
466     {0x0460, "ks_Arab_IN"},
467     {0x0860, "ks_Deva_IN"}
468 };
469 
470 ILCID_POSIX_ELEMENT_ARRAY(0x0440, ky, ky_KG)   /* Kyrgyz is spoken in Kyrgyzstan */
471 
ILCID_POSIX_SUBTABLE(la)472 ILCID_POSIX_SUBTABLE(la) {
473     {0x76,   "la"},
474     {0x0476, "la_001"},
475     {0x0476, "la_IT"}       /*Left in for compatibility*/
476 };
477 
478 ILCID_POSIX_ELEMENT_ARRAY(0x046e, lb, lb_LU)
479 ILCID_POSIX_ELEMENT_ARRAY(0x0454, lo, lo_LA)
480 ILCID_POSIX_ELEMENT_ARRAY(0x0427, lt, lt_LT)
481 ILCID_POSIX_ELEMENT_ARRAY(0x0426, lv, lv_LV)
482 ILCID_POSIX_ELEMENT_ARRAY(0x0481, mi, mi_NZ)
483 ILCID_POSIX_ELEMENT_ARRAY(0x042f, mk, mk_MK)
484 ILCID_POSIX_ELEMENT_ARRAY(0x044c, ml, ml_IN)
485 
ILCID_POSIX_SUBTABLE(mn)486 ILCID_POSIX_SUBTABLE(mn) {
487     {0x50,   "mn"},
488     {0x0450, "mn_MN"},
489     {0x7c50, "mn_Mong"},
490     {0x0850, "mn_Mong_CN"},
491     {0x0850, "mn_CN"},
492     {0x7850, "mn_Cyrl"},
493     {0x0c50, "mn_Mong_MN"}
494 };
495 
496 ILCID_POSIX_ELEMENT_ARRAY(0x0458, mni,mni_IN)
497 ILCID_POSIX_ELEMENT_ARRAY(0x047c, moh,moh_CA)
498 ILCID_POSIX_ELEMENT_ARRAY(0x044e, mr, mr_IN)
499 
ILCID_POSIX_SUBTABLE(ms)500 ILCID_POSIX_SUBTABLE(ms) {
501     {0x3e,   "ms"},
502     {0x083e, "ms_BN"},   /* Brunei Darussalam*/
503     {0x043e, "ms_MY"}    /* Malaysia*/
504 };
505 
506 ILCID_POSIX_ELEMENT_ARRAY(0x043a, mt, mt_MT)
507 ILCID_POSIX_ELEMENT_ARRAY(0x0455, my, my_MM)
508 
ILCID_POSIX_SUBTABLE(ne)509 ILCID_POSIX_SUBTABLE(ne) {
510     {0x61,   "ne"},
511     {0x0861, "ne_IN"},   /* India*/
512     {0x0461, "ne_NP"}    /* Nepal*/
513 };
514 
ILCID_POSIX_SUBTABLE(nl)515 ILCID_POSIX_SUBTABLE(nl) {
516     {0x13,   "nl"},
517     {0x0813, "nl_BE"},
518     {0x0413, "nl_NL"}
519 };
520 
521 /* The "no" locale split into nb and nn.  By default in ICU, "no" is nb.*/
522 // TODO: Not all of these are needed on Windows, but I don't know how ICU treats preferred ones here.
ILCID_POSIX_SUBTABLE(no)523 ILCID_POSIX_SUBTABLE(no) {
524     {0x14,   "no"},     /* really nb_NO - actually Windows differentiates between neutral (no region) and specific (with region) */
525     {0x7c14, "nb"},     /* really nb */
526     {0x0414, "nb_NO"},  /* really nb_NO. Keep first in the 414 list. */
527     {0x0414, "no_NO"},  /* really nb_NO */
528     {0x0814, "nn_NO"},  /* really nn_NO. Keep first in the 814 list.  */
529     {0x7814, "nn"},     /* It's 0x14 or 0x814, pick one to make the test program happy. */
530     {0x0814, "no_NO_NY"}/* really nn_NO */
531 };
532 
533 ILCID_POSIX_ELEMENT_ARRAY(0x046c, nso,nso_ZA)   /* TODO: Verify the ISO-639 code */
534 ILCID_POSIX_ELEMENT_ARRAY(0x0482, oc, oc_FR)
535 
ILCID_POSIX_SUBTABLE(om)536 ILCID_POSIX_SUBTABLE(om) { /* TODO: Verify the country */
537     {0x72,   "om"},
538     {0x0472, "om_ET"},
539     {0x0472, "gaz_ET"}
540 };
541 
542 /* Declared as or_IN to get around compiler errors*/
ILCID_POSIX_SUBTABLE(or_IN)543 ILCID_POSIX_SUBTABLE(or_IN) {
544     {0x48,   "or"},
545     {0x0448, "or_IN"},
546 };
547 
ILCID_POSIX_SUBTABLE(pa)548 ILCID_POSIX_SUBTABLE(pa) {
549     {0x46,   "pa"},
550     {0x0446, "pa_IN"},
551     {0x0846, "pa_Arab_PK"},
552     {0x0846, "pa_PK"}
553 };
554 
ILCID_POSIX_SUBTABLE(pap)555 ILCID_POSIX_SUBTABLE(pap) {
556     {0x79, "pap"},
557     {0x0479, "pap_029"},
558     {0x0479, "pap_AN"}     /*Left in for compatibility*/
559 };
560 
561 ILCID_POSIX_ELEMENT_ARRAY(0x0415, pl, pl_PL)
562 ILCID_POSIX_ELEMENT_ARRAY(0x0463, ps, ps_AF)
563 
ILCID_POSIX_SUBTABLE(pt)564 ILCID_POSIX_SUBTABLE(pt) {
565     {0x16,   "pt"},
566     {0x0416, "pt_BR"},
567     {0x0816, "pt_PT"}
568 };
569 
ILCID_POSIX_SUBTABLE(qu)570 ILCID_POSIX_SUBTABLE(qu) {
571     {0x6b,   "qu"},
572     {0x046b, "qu_BO"},
573     {0x086b, "qu_EC"},
574     {0x0C6b, "qu_PE"},
575     {0x046b, "quz_BO"},
576     {0x086b, "quz_EC"},
577     {0x0C6b, "quz_PE"}
578 };
579 
ILCID_POSIX_SUBTABLE(quc)580 ILCID_POSIX_SUBTABLE(quc) {
581     {0x93,   "quc"},
582     {0x0493, "quc_CO"},
583     /*
584         "quc_Latn_GT" is an exceptional case. Language ID of "quc"
585         is 0x93, but LCID of "quc_Latn_GT" is 0x486, which should be
586         under the group of "qut". "qut" is a retired ISO 639-3 language
587         code for West Central Quiche, and merged to "quc".
588         It looks Windows previously reserved "qut" for K'iche', but,
589         decided to use "quc" when adding a locale for K'iche' (Guatemala).
590 
591         This data structure used here assumes language ID bits in
592         LCID is unique for alphabetic language code. But this is not true
593         for "quc_Latn_GT". If we don't have the data below, LCID look up
594         by alphabetic locale ID (POSIX) will fail. The same entry is found
595         under "qut" below, which is required for reverse look up.
596     */
597     {0x0486, "quc_Latn_GT"}
598 };
599 
ILCID_POSIX_SUBTABLE(qut)600 ILCID_POSIX_SUBTABLE(qut) {
601     {0x86,   "qut"},
602     {0x0486, "qut_GT"},
603     /*
604         See the note in "quc" above.
605     */
606     {0x0486, "quc_Latn_GT"}
607 };
608 
609 ILCID_POSIX_ELEMENT_ARRAY(0x0417, rm, rm_CH)
610 
ILCID_POSIX_SUBTABLE(ro)611 ILCID_POSIX_SUBTABLE(ro) {
612     {0x18,   "ro"},
613     {0x0418, "ro_RO"},
614     {0x0818, "ro_MD"}
615 };
616 
617 // TODO: This is almost certainly 'wrong'.  0 in Windows is a synonym for LOCALE_USER_DEFAULT.
618 // More likely this is a similar concept to the Windows 0x7f Invariant locale ""
619 // (Except that it's not invariant in ICU)
ILCID_POSIX_SUBTABLE(root)620 ILCID_POSIX_SUBTABLE(root) {
621     {0x00,   "root"}
622 };
623 
ILCID_POSIX_SUBTABLE(ru)624 ILCID_POSIX_SUBTABLE(ru) {
625     {0x19,   "ru"},
626     {0x0419, "ru_RU"},
627     {0x0819, "ru_MD"}
628 };
629 
630 ILCID_POSIX_ELEMENT_ARRAY(0x0487, rw, rw_RW)
631 ILCID_POSIX_ELEMENT_ARRAY(0x044f, sa, sa_IN)
632 ILCID_POSIX_ELEMENT_ARRAY(0x0485, sah,sah_RU)
633 
ILCID_POSIX_SUBTABLE(sd)634 ILCID_POSIX_SUBTABLE(sd) {
635     {0x59,   "sd"},
636     {0x0459, "sd_Deva_IN"},
637     {0x0459, "sd_IN"},
638     {0x0859, "sd_Arab_PK"},
639     {0x0859, "sd_PK"},
640     {0x7c59, "sd_Arab"}
641 };
642 
ILCID_POSIX_SUBTABLE(se)643 ILCID_POSIX_SUBTABLE(se) {
644     {0x3b,   "se"},
645     {0x0c3b, "se_FI"},
646     {0x043b, "se_NO"},
647     {0x083b, "se_SE"},
648     {0x783b, "sma"},
649     {0x183b, "sma_NO"},
650     {0x1c3b, "sma_SE"},
651     {0x7c3b, "smj"},
652     {0x703b, "smn"},
653     {0x743b, "sms"},
654     {0x103b, "smj_NO"},
655     {0x143b, "smj_SE"},
656     {0x243b, "smn_FI"},
657     {0x203b, "sms_FI"},
658 };
659 
660 ILCID_POSIX_ELEMENT_ARRAY(0x045b, si, si_LK)
661 ILCID_POSIX_ELEMENT_ARRAY(0x041b, sk, sk_SK)
662 ILCID_POSIX_ELEMENT_ARRAY(0x0424, sl, sl_SI)
663 
ILCID_POSIX_SUBTABLE(so)664 ILCID_POSIX_SUBTABLE(so) {
665     {0x77,   "so"},
666     {0x0477, "so_SO"}
667 };
668 
669 ILCID_POSIX_ELEMENT_ARRAY(0x041c, sq, sq_AL)
670 ILCID_POSIX_ELEMENT_ARRAY(0x0430, st, st_ZA)
671 
ILCID_POSIX_SUBTABLE(sv)672 ILCID_POSIX_SUBTABLE(sv) {
673     {0x1d,   "sv"},
674     {0x081d, "sv_FI"},
675     {0x041d, "sv_SE"}
676 };
677 
678 ILCID_POSIX_ELEMENT_ARRAY(0x0441, sw, sw_KE)
679 ILCID_POSIX_ELEMENT_ARRAY(0x045A, syr, syr_SY)
680 
ILCID_POSIX_SUBTABLE(ta)681 ILCID_POSIX_SUBTABLE(ta) {
682     {0x49,   "ta"},
683     {0x0449, "ta_IN"},
684     {0x0849, "ta_LK"}
685 };
686 
687 ILCID_POSIX_ELEMENT_ARRAY(0x044a, te, te_IN)
688 
689 /* Cyrillic based by default */
ILCID_POSIX_SUBTABLE(tg)690 ILCID_POSIX_SUBTABLE(tg) {
691     {0x28,   "tg"},
692     {0x7c28, "tg_Cyrl"},
693     {0x0428, "tg_Cyrl_TJ"}
694 };
695 
696 ILCID_POSIX_ELEMENT_ARRAY(0x041e, th, th_TH)
697 
ILCID_POSIX_SUBTABLE(ti)698 ILCID_POSIX_SUBTABLE(ti) {
699     {0x73,   "ti"},
700     {0x0873, "ti_ER"},
701     {0x0473, "ti_ET"}
702 };
703 
704 ILCID_POSIX_ELEMENT_ARRAY(0x0442, tk, tk_TM)
705 
ILCID_POSIX_SUBTABLE(tn)706 ILCID_POSIX_SUBTABLE(tn) {
707     {0x32,   "tn"},
708     {0x0832, "tn_BW"},
709     {0x0432, "tn_ZA"}
710 };
711 
712 ILCID_POSIX_ELEMENT_ARRAY(0x041f, tr, tr_TR)
713 ILCID_POSIX_ELEMENT_ARRAY(0x0431, ts, ts_ZA)
714 ILCID_POSIX_ELEMENT_ARRAY(0x0444, tt, tt_RU)
715 
ILCID_POSIX_SUBTABLE(tzm)716 ILCID_POSIX_SUBTABLE(tzm) {
717     {0x5f,   "tzm"},
718     {0x7c5f, "tzm_Latn"},
719     {0x085f, "tzm_Latn_DZ"},
720     {0x105f, "tzm_Tfng_MA"},
721     {0x045f, "tzm_Arab_MA"},
722     {0x045f, "tmz"}
723 };
724 
ILCID_POSIX_SUBTABLE(ug)725 ILCID_POSIX_SUBTABLE(ug) {
726     {0x80,   "ug"},
727     {0x0480, "ug_CN"},
728     {0x0480, "ug_Arab_CN"}
729 };
730 
731 ILCID_POSIX_ELEMENT_ARRAY(0x0422, uk, uk_UA)
732 
ILCID_POSIX_SUBTABLE(ur)733 ILCID_POSIX_SUBTABLE(ur) {
734     {0x20,   "ur"},
735     {0x0820, "ur_IN"},
736     {0x0420, "ur_PK"}
737 };
738 
ILCID_POSIX_SUBTABLE(uz)739 ILCID_POSIX_SUBTABLE(uz) {
740     {0x43,   "uz"},
741     {0x0843, "uz_Cyrl_UZ"},  /* Cyrillic based */
742     {0x7843, "uz_Cyrl"},  /* Cyrillic based */
743     {0x0843, "uz_UZ"},  /* Cyrillic based */
744     {0x0443, "uz_Latn_UZ"}, /* Latin based */
745     {0x7c43, "uz_Latn"} /* Latin based */
746 };
747 
ILCID_POSIX_SUBTABLE(ve)748 ILCID_POSIX_SUBTABLE(ve) { /* TODO: Verify the country */
749     {0x33,   "ve"},
750     {0x0433, "ve_ZA"},
751     {0x0433, "ven_ZA"}
752 };
753 
754 ILCID_POSIX_ELEMENT_ARRAY(0x042a, vi, vi_VN)
755 ILCID_POSIX_ELEMENT_ARRAY(0x0488, wo, wo_SN)
756 ILCID_POSIX_ELEMENT_ARRAY(0x0434, xh, xh_ZA)
757 
ILCID_POSIX_SUBTABLE(yi)758 ILCID_POSIX_SUBTABLE(yi) {
759     {0x003d, "yi"},
760     {0x043d, "yi_001"}
761 };
762 
763 ILCID_POSIX_ELEMENT_ARRAY(0x046a, yo, yo_NG)
764 
765 // Windows & ICU tend to different names for some of these
766 // TODO: Windows probably does not need all of these entries, but I don't know how the precedence works.
ILCID_POSIX_SUBTABLE(zh)767 ILCID_POSIX_SUBTABLE(zh) {
768     {0x0004, "zh_Hans"},
769     {0x7804, "zh"},
770     {0x0804, "zh_CN"},
771     {0x0804, "zh_Hans_CN"},
772     {0x0c04, "zh_Hant_HK"},
773     {0x0c04, "zh_HK"},
774     {0x1404, "zh_Hant_MO"},
775     {0x1404, "zh_MO"},
776     {0x1004, "zh_Hans_SG"},
777     {0x1004, "zh_SG"},
778     {0x0404, "zh_Hant_TW"},
779     {0x7c04, "zh_Hant"},
780     {0x0404, "zh_TW"},
781     {0x30404,"zh_Hant_TW"},     /* Bopomofo order */
782     {0x30404,"zh_TW"},          /* Bopomofo order */
783     {0x20004,"zh@collation=stroke"},
784     {0x20404,"zh_Hant@collation=stroke"},
785     {0x20404,"zh_Hant_TW@collation=stroke"},
786     {0x20404,"zh_TW@collation=stroke"},
787     {0x20804,"zh_Hans@collation=stroke"},
788     {0x20804,"zh_Hans_CN@collation=stroke"},
789     {0x20804,"zh_CN@collation=stroke"}
790     // TODO: Alternate collations for other LCIDs are missing, eg: 0x50804
791 };
792 
793 ILCID_POSIX_ELEMENT_ARRAY(0x0435, zu, zu_ZA)
794 
795 /* This must be static and grouped by LCID. */
796 static const ILcidPosixMap gPosixIDmap[] = {
797     ILCID_POSIX_MAP(af),    /*  af  Afrikaans                 0x36 */
798     ILCID_POSIX_MAP(am),    /*  am  Amharic                   0x5e */
799     ILCID_POSIX_MAP(ar),    /*  ar  Arabic                    0x01 */
800     ILCID_POSIX_MAP(arn),   /*  arn Araucanian/Mapudungun     0x7a */
801     ILCID_POSIX_MAP(as),    /*  as  Assamese                  0x4d */
802     ILCID_POSIX_MAP(az),    /*  az  Azerbaijani               0x2c */
803     ILCID_POSIX_MAP(ba),    /*  ba  Bashkir                   0x6d */
804     ILCID_POSIX_MAP(be),    /*  be  Belarusian                0x23 */
805 /*    ILCID_POSIX_MAP(ber),     ber Berber/Tamazight          0x5f */
806     ILCID_POSIX_MAP(bg),    /*  bg  Bulgarian                 0x02 */
807     ILCID_POSIX_MAP(bin),   /*  bin Edo                       0x66 */
808     ILCID_POSIX_MAP(bn),    /*  bn  Bengali; Bangla           0x45 */
809     ILCID_POSIX_MAP(bo),    /*  bo  Tibetan                   0x51 */
810     ILCID_POSIX_MAP(br),    /*  br  Breton                    0x7e */
811     ILCID_POSIX_MAP(ca),    /*  ca  Catalan                   0x03 */
812     ILCID_POSIX_MAP(chr),   /*  chr Cherokee                  0x5c */
813     ILCID_POSIX_MAP(ckb),   /*  ckb Sorani (Central Kurdish)  0x92 */
814     ILCID_POSIX_MAP(co),    /*  co  Corsican                  0x83 */
815     ILCID_POSIX_MAP(cs),    /*  cs  Czech                     0x05 */
816     ILCID_POSIX_MAP(cy),    /*  cy  Welsh                     0x52 */
817     ILCID_POSIX_MAP(da),    /*  da  Danish                    0x06 */
818     ILCID_POSIX_MAP(de),    /*  de  German                    0x07 */
819     ILCID_POSIX_MAP(dv),    /*  dv  Divehi                    0x65 */
820     ILCID_POSIX_MAP(el),    /*  el  Greek                     0x08 */
821     ILCID_POSIX_MAP(en),    /*  en  English                   0x09 */
822     ILCID_POSIX_MAP(en_US_POSIX), /*    invariant             0x7f */
823     ILCID_POSIX_MAP(es),    /*  es  Spanish                   0x0a */
824     ILCID_POSIX_MAP(et),    /*  et  Estonian                  0x25 */
825     ILCID_POSIX_MAP(eu),    /*  eu  Basque                    0x2d */
826     ILCID_POSIX_MAP(fa),    /*  fa  Persian/Farsi             0x29 */
827     ILCID_POSIX_MAP(fa_AF), /*  fa  Persian/Dari              0x8c */
828     ILCID_POSIX_MAP(ff),    /*  ff  Fula                      0x67 */
829     ILCID_POSIX_MAP(fi),    /*  fi  Finnish                   0x0b */
830     ILCID_POSIX_MAP(fil),   /*  fil Filipino                  0x64 */
831     ILCID_POSIX_MAP(fo),    /*  fo  Faroese                   0x38 */
832     ILCID_POSIX_MAP(fr),    /*  fr  French                    0x0c */
833     ILCID_POSIX_MAP(fuv),   /*  fuv Fulfulde - Nigeria        0x67 */
834     ILCID_POSIX_MAP(fy),    /*  fy  Frisian                   0x62 */
835     ILCID_POSIX_MAP(ga),    /*  *   Gaelic (Ireland,Scotland) 0x3c */
836     ILCID_POSIX_MAP(gd),    /*  gd  Gaelic (United Kingdom)   0x91 */
837     ILCID_POSIX_MAP(gl),    /*  gl  Galician                  0x56 */
838     ILCID_POSIX_MAP(gn),    /*  gn  Guarani                   0x74 */
839     ILCID_POSIX_MAP(gsw),   /*  gsw Alemanic/Alsatian/Swiss German 0x84 */
840     ILCID_POSIX_MAP(gu),    /*  gu  Gujarati                  0x47 */
841     ILCID_POSIX_MAP(ha),    /*  ha  Hausa                     0x68 */
842     ILCID_POSIX_MAP(haw),   /*  haw Hawaiian                  0x75 */
843     ILCID_POSIX_MAP(he),    /*  he  Hebrew (formerly iw)      0x0d */
844     ILCID_POSIX_MAP(hi),    /*  hi  Hindi                     0x39 */
845     ILCID_POSIX_MAP(hr),    /*  *   Croatian and others       0x1a */
846     ILCID_POSIX_MAP(hsb),   /*  hsb Upper Sorbian             0x2e */
847     ILCID_POSIX_MAP(hu),    /*  hu  Hungarian                 0x0e */
848     ILCID_POSIX_MAP(hy),    /*  hy  Armenian                  0x2b */
849     ILCID_POSIX_MAP(ibb),   /*  ibb Ibibio - Nigeria          0x69 */
850     ILCID_POSIX_MAP(id),    /*  id  Indonesian (formerly in)  0x21 */
851     ILCID_POSIX_MAP(ig),    /*  ig  Igbo                      0x70 */
852     ILCID_POSIX_MAP(ii),    /*  ii  Sichuan Yi                0x78 */
853     ILCID_POSIX_MAP(is),    /*  is  Icelandic                 0x0f */
854     ILCID_POSIX_MAP(it),    /*  it  Italian                   0x10 */
855     ILCID_POSIX_MAP(iu),    /*  iu  Inuktitut                 0x5d */
856     ILCID_POSIX_MAP(iw),    /*  iw  Hebrew                    0x0d */
857     ILCID_POSIX_MAP(ja),    /*  ja  Japanese                  0x11 */
858     ILCID_POSIX_MAP(ka),    /*  ka  Georgian                  0x37 */
859     ILCID_POSIX_MAP(kk),    /*  kk  Kazakh                    0x3f */
860     ILCID_POSIX_MAP(kl),    /*  kl  Kalaallisut               0x6f */
861     ILCID_POSIX_MAP(km),    /*  km  Khmer                     0x53 */
862     ILCID_POSIX_MAP(kn),    /*  kn  Kannada                   0x4b */
863     ILCID_POSIX_MAP(ko),    /*  ko  Korean                    0x12 */
864     ILCID_POSIX_MAP(kok),   /*  kok Konkani                   0x57 */
865     ILCID_POSIX_MAP(kr),    /*  kr  Kanuri                    0x71 */
866     ILCID_POSIX_MAP(ks),    /*  ks  Kashmiri                  0x60 */
867     ILCID_POSIX_MAP(ky),    /*  ky  Kyrgyz                    0x40 */
868     ILCID_POSIX_MAP(lb),    /*  lb  Luxembourgish             0x6e */
869     ILCID_POSIX_MAP(la),    /*  la  Latin                     0x76 */
870     ILCID_POSIX_MAP(lo),    /*  lo  Lao                       0x54 */
871     ILCID_POSIX_MAP(lt),    /*  lt  Lithuanian                0x27 */
872     ILCID_POSIX_MAP(lv),    /*  lv  Latvian, Lettish          0x26 */
873     ILCID_POSIX_MAP(mi),    /*  mi  Maori                     0x81 */
874     ILCID_POSIX_MAP(mk),    /*  mk  Macedonian                0x2f */
875     ILCID_POSIX_MAP(ml),    /*  ml  Malayalam                 0x4c */
876     ILCID_POSIX_MAP(mn),    /*  mn  Mongolian                 0x50 */
877     ILCID_POSIX_MAP(mni),   /*  mni Manipuri                  0x58 */
878     ILCID_POSIX_MAP(moh),   /*  moh Mohawk                    0x7c */
879     ILCID_POSIX_MAP(mr),    /*  mr  Marathi                   0x4e */
880     ILCID_POSIX_MAP(ms),    /*  ms  Malay                     0x3e */
881     ILCID_POSIX_MAP(mt),    /*  mt  Maltese                   0x3a */
882     ILCID_POSIX_MAP(my),    /*  my  Burmese                   0x55 */
883 /*    ILCID_POSIX_MAP(nb),    //  no  Norwegian                 0x14 */
884     ILCID_POSIX_MAP(ne),    /*  ne  Nepali                    0x61 */
885     ILCID_POSIX_MAP(nl),    /*  nl  Dutch                     0x13 */
886 /*    ILCID_POSIX_MAP(nn),    //  no  Norwegian                 0x14 */
887     ILCID_POSIX_MAP(no),    /*  *   Norwegian                 0x14 */
888     ILCID_POSIX_MAP(nso),   /*  nso Sotho, Northern (Sepedi dialect) 0x6c */
889     ILCID_POSIX_MAP(oc),    /*  oc  Occitan                   0x82 */
890     ILCID_POSIX_MAP(om),    /*  om  Oromo                     0x72 */
891     ILCID_POSIX_MAP(or_IN), /*  or  Oriya                     0x48 */
892     ILCID_POSIX_MAP(pa),    /*  pa  Punjabi                   0x46 */
893     ILCID_POSIX_MAP(pap),   /*  pap Papiamentu                0x79 */
894     ILCID_POSIX_MAP(pl),    /*  pl  Polish                    0x15 */
895     ILCID_POSIX_MAP(ps),    /*  ps  Pashto                    0x63 */
896     ILCID_POSIX_MAP(pt),    /*  pt  Portuguese                0x16 */
897     ILCID_POSIX_MAP(qu),    /*  qu  Quechua                   0x6B */
898     ILCID_POSIX_MAP(quc),   /*  quc K'iche                    0x93 */
899     ILCID_POSIX_MAP(qut),   /*  qut K'iche                    0x86 */
900     ILCID_POSIX_MAP(rm),    /*  rm  Raeto-Romance/Romansh     0x17 */
901     ILCID_POSIX_MAP(ro),    /*  ro  Romanian                  0x18 */
902     ILCID_POSIX_MAP(root),  /*  root                          0x00 */
903     ILCID_POSIX_MAP(ru),    /*  ru  Russian                   0x19 */
904     ILCID_POSIX_MAP(rw),    /*  rw  Kinyarwanda               0x87 */
905     ILCID_POSIX_MAP(sa),    /*  sa  Sanskrit                  0x4f */
906     ILCID_POSIX_MAP(sah),   /*  sah Yakut                     0x85 */
907     ILCID_POSIX_MAP(sd),    /*  sd  Sindhi                    0x59 */
908     ILCID_POSIX_MAP(se),    /*  se  Sami                      0x3b */
909 /*    ILCID_POSIX_MAP(sh),    //  sh  Serbo-Croatian            0x1a */
910     ILCID_POSIX_MAP(si),    /*  si  Sinhalese                 0x5b */
911     ILCID_POSIX_MAP(sk),    /*  sk  Slovak                    0x1b */
912     ILCID_POSIX_MAP(sl),    /*  sl  Slovenian                 0x24 */
913     ILCID_POSIX_MAP(so),    /*  so  Somali                    0x77 */
914     ILCID_POSIX_MAP(sq),    /*  sq  Albanian                  0x1c */
915 /*    ILCID_POSIX_MAP(sr),    //  sr  Serbian                   0x1a */
916     ILCID_POSIX_MAP(st),    /*  st  Sutu                      0x30 */
917     ILCID_POSIX_MAP(sv),    /*  sv  Swedish                   0x1d */
918     ILCID_POSIX_MAP(sw),    /*  sw  Swahili                   0x41 */
919     ILCID_POSIX_MAP(syr),   /*  syr Syriac                    0x5A */
920     ILCID_POSIX_MAP(ta),    /*  ta  Tamil                     0x49 */
921     ILCID_POSIX_MAP(te),    /*  te  Telugu                    0x4a */
922     ILCID_POSIX_MAP(tg),    /*  tg  Tajik                     0x28 */
923     ILCID_POSIX_MAP(th),    /*  th  Thai                      0x1e */
924     ILCID_POSIX_MAP(ti),    /*  ti  Tigrigna                  0x73 */
925     ILCID_POSIX_MAP(tk),    /*  tk  Turkmen                   0x42 */
926     ILCID_POSIX_MAP(tn),    /*  tn  Tswana                    0x32 */
927     ILCID_POSIX_MAP(tr),    /*  tr  Turkish                   0x1f */
928     ILCID_POSIX_MAP(ts),    /*  ts  Tsonga                    0x31 */
929     ILCID_POSIX_MAP(tt),    /*  tt  Tatar                     0x44 */
930     ILCID_POSIX_MAP(tzm),   /*  tzm Tamazight                 0x5f */
931     ILCID_POSIX_MAP(ug),    /*  ug  Uighur                    0x80 */
932     ILCID_POSIX_MAP(uk),    /*  uk  Ukrainian                 0x22 */
933     ILCID_POSIX_MAP(ur),    /*  ur  Urdu                      0x20 */
934     ILCID_POSIX_MAP(uz),    /*  uz  Uzbek                     0x43 */
935     ILCID_POSIX_MAP(ve),    /*  ve  Venda                     0x33 */
936     ILCID_POSIX_MAP(vi),    /*  vi  Vietnamese                0x2a */
937     ILCID_POSIX_MAP(wo),    /*  wo  Wolof                     0x88 */
938     ILCID_POSIX_MAP(xh),    /*  xh  Xhosa                     0x34 */
939     ILCID_POSIX_MAP(yi),    /*  yi  Yiddish                   0x3d */
940     ILCID_POSIX_MAP(yo),    /*  yo  Yoruba                    0x6a */
941     ILCID_POSIX_MAP(zh),    /*  zh  Chinese                   0x04 */
942     ILCID_POSIX_MAP(zu),    /*  zu  Zulu                      0x35 */
943 };
944 
/* Number of entries in gPosixIDmap. Used in this file as the loop bound in
   uprv_convertToPosix and as the search/scan bound in uprv_convertToLCID. */
static const uint32_t gLocaleCount = UPRV_LENGTHOF(gPosixIDmap);
946 
/**
 * Measures the common prefix of two NUL-terminated IDs.
 *
 * File-local helper used by getHostID() to decide how much of a requested
 * POSIX ID a table entry covers.
 *
 * @param id1 first NUL-terminated ID.
 * @param id2 second NUL-terminated ID.
 * @return the number of leading characters that are identical in both IDs;
 *         a terminating NUL is never counted.
 */
static int32_t
idCmp(const char* id1, const char* id2)
{
    int32_t common = 0;

    /* Advance while id1 has not ended and both strings still agree. */
    for (; id1[common] != 0 && id1[common] == id2[common]; ++common) {
    }
    return common;
}
963 
964 /**
965  * Searches for a Windows LCID
966  *
967  * @param posixID the Posix style locale id.
968  * @param status gets set to U_ILLEGAL_ARGUMENT_ERROR when the Posix ID has
969  *               no equivalent Windows LCID.
970  * @return the LCID
971  */
972 static uint32_t
getHostID(const ILcidPosixMap * this_0,const char * posixID,UErrorCode * status)973 getHostID(const ILcidPosixMap *this_0, const char* posixID, UErrorCode* status)
974 {
975     int32_t bestIdx = 0;
976     int32_t bestIdxDiff = 0;
977     int32_t posixIDlen = (int32_t)uprv_strlen(posixID);
978     uint32_t idx;
979 
980     for (idx = 0; idx < this_0->numRegions; idx++ ) {
981         int32_t sameChars = idCmp(posixID, this_0->regionMaps[idx].posixID);
982         if (sameChars > bestIdxDiff && this_0->regionMaps[idx].posixID[sameChars] == 0) {
983             if (posixIDlen == sameChars) {
984                 /* Exact match */
985                 return this_0->regionMaps[idx].hostID;
986             }
987             bestIdxDiff = sameChars;
988             bestIdx = idx;
989         }
990     }
991     /* We asked for something unusual, like en_ZZ, and we try to return the number for the same language. */
992     /* We also have to make sure that sid and si and similar string subsets don't match. */
993     if ((posixID[bestIdxDiff] == '_' || posixID[bestIdxDiff] == '@')
994         && this_0->regionMaps[bestIdx].posixID[bestIdxDiff] == 0)
995     {
996         *status = U_USING_FALLBACK_WARNING;
997         return this_0->regionMaps[bestIdx].hostID;
998     }
999 
1000     /*no match found */
1001     *status = U_ILLEGAL_ARGUMENT_ERROR;
1002     return this_0->regionMaps->hostID;
1003 }
1004 
1005 static const char*
getPosixID(const ILcidPosixMap * this_0,uint32_t hostID)1006 getPosixID(const ILcidPosixMap *this_0, uint32_t hostID)
1007 {
1008     uint32_t i;
1009     for (i = 0; i < this_0->numRegions; i++)
1010     {
1011         if (this_0->regionMaps[i].hostID == hostID)
1012         {
1013             return this_0->regionMaps[i].posixID;
1014         }
1015     }
1016 
1017     /* If you get here, then no matching region was found,
1018        so return the language id with the wild card region. */
1019     return this_0->regionMaps[0].posixID;
1020 }
1021 
1022 /*
1023 //////////////////////////////////////
1024 //
1025 // LCID --> POSIX
1026 //
1027 /////////////////////////////////////
1028 */
1029 #if U_PLATFORM_HAS_WIN32_API && UCONFIG_USE_WINDOWS_LCID_MAPPING_API
/*
 * Various language tags need to be changed:
 * quz -> qu
 * prs -> fa
 *
 * FIX_LANGUAGE_ID_TAG(buffer, len) rewrites the leading language subtag of
 * the NUL-terminated string in `buffer` in place and keeps whatever follows
 * the first three characters (for example "quz_PE" becomes "qu_PE").
 * Nothing happens when `len` is less than 3 or the prefix is neither "quz"
 * nor "prs".
 *
 * The tail is shifted down by one character with an explicit forward copy:
 * source and destination are the same array, and strcat() has undefined
 * behavior for overlapping objects.
 *
 * The expansion is a single do { } while (0) statement, so the macro is safe
 * as the body of an if/else. `buffer` is evaluated more than once and must
 * therefore be free of side effects.
 */
#define FIX_LANGUAGE_ID_TAG(buffer, len) \
    do { \
        if ((len) >= 3) { \
            char *fixTagDst_ = NULL; \
            if ((buffer)[0] == 'q' && (buffer)[1] == 'u' && (buffer)[2] == 'z') { \
                fixTagDst_ = (buffer) + 2; \
            } else if ((buffer)[0] == 'p' && (buffer)[1] == 'r' && (buffer)[2] == 's') { \
                (buffer)[0] = 'f'; \
                (buffer)[1] = 'a'; \
                fixTagDst_ = (buffer) + 2; \
            } \
            if (fixTagDst_ != NULL) { \
                const char *fixTagSrc_ = fixTagDst_ + 1; \
                while ((*fixTagDst_++ = *fixTagSrc_++) != 0) { \
                } \
            } \
        } \
    } while (0)
1045 
1046 #endif
1047 
1048 U_CAPI int32_t
uprv_convertToPosix(uint32_t hostid,char * posixID,int32_t posixIDCapacity,UErrorCode * status)1049 uprv_convertToPosix(uint32_t hostid, char *posixID, int32_t posixIDCapacity, UErrorCode* status)
1050 {
1051     uint16_t langID;
1052     uint32_t localeIndex;
1053     UBool bLookup = TRUE;
1054     const char *pPosixID = NULL;
1055 
1056 #if U_PLATFORM_HAS_WIN32_API && UCONFIG_USE_WINDOWS_LCID_MAPPING_API
1057     static_assert(ULOC_FULLNAME_CAPACITY > LOCALE_NAME_MAX_LENGTH, "Windows locale names have smaller length than ICU locale names.");
1058 
1059     char locName[LOCALE_NAME_MAX_LENGTH] = {};
1060 
1061     // Note: Windows primary lang ID 0x92 in LCID is used for Central Kurdish and
1062     // GetLocaleInfo() maps such LCID to "ku". However, CLDR uses "ku" for
1063     // Northern Kurdish and "ckb" for Central Kurdish. For this reason, we cannot
1064     // use the Windows API to resolve locale ID for this specific case.
1065     if ((hostid & 0x3FF) != 0x92) {
1066         int32_t tmpLen = 0;
1067         char16_t windowsLocaleName[LOCALE_NAME_MAX_LENGTH] = {};
1068 
1069         // Note: LOCALE_ALLOW_NEUTRAL_NAMES was enabled in Windows7+, prior versions did not handle neutral (no-region) locale names.
1070         tmpLen = LCIDToLocaleName(hostid, (PWSTR)windowsLocaleName, UPRV_LENGTHOF(windowsLocaleName), LOCALE_ALLOW_NEUTRAL_NAMES);
1071         if (tmpLen > 1) {
1072             int32_t i = 0;
1073             // Only need to look up in table if have _, eg for de-de_phoneb type alternate sort.
1074             bLookup = FALSE;
1075             for (i = 0; i < UPRV_LENGTHOF(locName); i++)
1076             {
1077                 locName[i] = (char)(windowsLocaleName[i]);
1078 
1079                 // Windows locale name may contain sorting variant, such as "es-ES_tradnl".
1080                 // In such cases, we need special mapping data found in the hardcoded table
1081                 // in this source file.
1082                 if (windowsLocaleName[i] == L'_')
1083                 {
1084                     // Keep the base locale, without variant
1085                     // TODO: Should these be mapped from _phoneb to @collation=phonebook, etc.?
1086                     locName[i] = '\0';
1087                     tmpLen = i;
1088                     bLookup = TRUE;
1089                     break;
1090                 }
1091                 else if (windowsLocaleName[i] == L'-')
1092                 {
1093                     // Windows names use -, ICU uses _
1094                     locName[i] = '_';
1095                 }
1096                 else if (windowsLocaleName[i] == L'\0')
1097                 {
1098                     // No point in doing more work than necessary
1099                     break;
1100                 }
1101             }
1102             // TODO: Need to understand this better, why isn't it an alias?
1103             FIX_LANGUAGE_ID_TAG(locName, tmpLen);
1104             pPosixID = locName;
1105         }
1106     }
1107 #endif
1108 
1109     if (bLookup) {
1110         const char *pCandidate = NULL;
1111         langID = LANGUAGE_LCID(hostid);
1112 
1113         for (localeIndex = 0; localeIndex < gLocaleCount; localeIndex++) {
1114             if (langID == gPosixIDmap[localeIndex].regionMaps->hostID) {
1115                 pCandidate = getPosixID(&gPosixIDmap[localeIndex], hostid);
1116                 break;
1117             }
1118         }
1119 
1120         /* On Windows, when locale name has a variant, we still look up the hardcoded table.
1121            If a match in the hardcoded table is longer than the Windows locale name without
1122            variant, we use the one as the result */
1123         if (pCandidate && (pPosixID == NULL || uprv_strlen(pCandidate) > uprv_strlen(pPosixID))) {
1124             pPosixID = pCandidate;
1125         }
1126     }
1127 
1128     if (pPosixID) {
1129         int32_t resLen = static_cast<int32_t>(uprv_strlen(pPosixID));
1130         int32_t copyLen = resLen <= posixIDCapacity ? resLen : posixIDCapacity;
1131         uprv_memcpy(posixID, pPosixID, copyLen);
1132         if (resLen < posixIDCapacity) {
1133             posixID[resLen] = 0;
1134             if (*status == U_STRING_NOT_TERMINATED_WARNING) {
1135                 *status = U_ZERO_ERROR;
1136             }
1137         } else if (resLen == posixIDCapacity) {
1138             *status = U_STRING_NOT_TERMINATED_WARNING;
1139         } else {
1140             *status = U_BUFFER_OVERFLOW_ERROR;
1141         }
1142         return resLen;
1143     }
1144 
1145     /* no match found */
1146     *status = U_ILLEGAL_ARGUMENT_ERROR;
1147     return -1;
1148 }
1149 
1150 /*
1151 //////////////////////////////////////
1152 //
1153 // POSIX --> LCID
1154 // This should only be called from uloc_getLCID.
1155 // The locale ID must be in canonical form.
1156 //
1157 /////////////////////////////////////
1158 */
1159 U_CAPI uint32_t
uprv_convertToLCIDPlatform(const char * localeID,UErrorCode * status)1160 uprv_convertToLCIDPlatform(const char* localeID, UErrorCode* status)
1161 {
1162     if (U_FAILURE(*status)) {
1163         return 0;
1164     }
1165 
1166     // The purpose of this function is to leverage the Windows platform name->lcid
1167     // conversion functionality when available.
1168 #if U_PLATFORM_HAS_WIN32_API && UCONFIG_USE_WINDOWS_LCID_MAPPING_API
1169     int32_t len;
1170     char collVal[ULOC_KEYWORDS_CAPACITY] = {};
1171     char baseName[ULOC_FULLNAME_CAPACITY] = {};
1172     const char * mylocaleID = localeID;
1173 
1174     // Check any for keywords.
1175     if (uprv_strchr(localeID, '@'))
1176     {
1177         len = uloc_getKeywordValue(localeID, "collation", collVal, UPRV_LENGTHOF(collVal) - 1, status);
1178         if (U_SUCCESS(*status) && len > 0)
1179         {
1180             // If it contains the keyword collation, return 0 so that the LCID lookup table will be used.
1181             return 0;
1182         }
1183         else
1184         {
1185             // If the locale ID contains keywords other than collation, just use the base name.
1186             len = uloc_getBaseName(localeID, baseName, UPRV_LENGTHOF(baseName) - 1, status);
1187 
1188             if (U_SUCCESS(*status) && len > 0)
1189             {
1190                 baseName[len] = 0;
1191                 mylocaleID = baseName;
1192             }
1193         }
1194     }
1195 
1196     char asciiBCP47Tag[LOCALE_NAME_MAX_LENGTH] = {};
1197     // this will change it from de_DE@collation=phonebook to de-DE-u-co-phonebk form
1198     (void)uloc_toLanguageTag(mylocaleID, asciiBCP47Tag, UPRV_LENGTHOF(asciiBCP47Tag), FALSE, status);
1199 
1200     if (U_SUCCESS(*status))
1201     {
1202         // Need it to be UTF-16, not 8-bit
1203         wchar_t bcp47Tag[LOCALE_NAME_MAX_LENGTH] = {};
1204         int32_t i;
1205         for (i = 0; i < UPRV_LENGTHOF(bcp47Tag); i++)
1206         {
1207             if (asciiBCP47Tag[i] == '\0')
1208             {
1209                 break;
1210             }
1211             else
1212             {
1213                 // Copy the character
1214                 bcp47Tag[i] = static_cast<wchar_t>(asciiBCP47Tag[i]);
1215             }
1216         }
1217 
1218         if (i < (UPRV_LENGTHOF(bcp47Tag) - 1))
1219         {
1220             // Ensure it's null terminated
1221             bcp47Tag[i] = L'\0';
1222             LCID lcid = LocaleNameToLCID(bcp47Tag, LOCALE_ALLOW_NEUTRAL_NAMES);
1223             if (lcid > 0)
1224             {
1225                 // Found LCID from windows, return that one, unless its completely ambiguous
1226                 // LOCALE_USER_DEFAULT and transients are OK because they will round trip
1227                 // for this process.
1228                 if (lcid != LOCALE_CUSTOM_UNSPECIFIED)
1229                 {
1230                     return lcid;
1231                 }
1232             }
1233         }
1234     }
1235 #else
1236     (void) localeID; // Suppress unused variable warning.
1237 #endif
1238 
1239     // Nothing found, or not implemented.
1240     return 0;
1241 }
1242 
1243 U_CAPI uint32_t
uprv_convertToLCID(const char * langID,const char * posixID,UErrorCode * status)1244 uprv_convertToLCID(const char *langID, const char* posixID, UErrorCode* status)
1245 {
1246     // This function does the table lookup when native platform name->lcid conversion isn't available,
1247     // or for locales that don't follow patterns the platform expects.
1248     uint32_t   low    = 0;
1249     uint32_t   high   = gLocaleCount;
1250     uint32_t   mid;
1251     uint32_t   oldmid = 0;
1252     int32_t    compVal;
1253 
1254     uint32_t   value         = 0;
1255     uint32_t   fallbackValue = (uint32_t)-1;
1256     UErrorCode myStatus;
1257     uint32_t   idx;
1258 
1259     /* Check for incomplete id. */
1260     if (!langID || !posixID || uprv_strlen(langID) < 2 || uprv_strlen(posixID) < 2) {
1261         return 0;
1262     }
1263 
1264     /*Binary search for the map entry for normal cases */
1265 
1266     while (high > low)  /*binary search*/{
1267 
1268         mid = (high+low) >> 1; /*Finds median*/
1269 
1270         if (mid == oldmid)
1271             break;
1272 
1273         compVal = uprv_strcmp(langID, gPosixIDmap[mid].regionMaps->posixID);
1274         if (compVal < 0){
1275             high = mid;
1276         }
1277         else if (compVal > 0){
1278             low = mid;
1279         }
1280         else /*we found it*/{
1281             return getHostID(&gPosixIDmap[mid], posixID, status);
1282         }
1283         oldmid = mid;
1284     }
1285 
1286     /*
1287      * Sometimes we can't do a binary search on posixID because some LCIDs
1288      * go to different locales.  We hit one of those special cases.
1289      */
1290     for (idx = 0; idx < gLocaleCount; idx++ ) {
1291         myStatus = U_ZERO_ERROR;
1292         value = getHostID(&gPosixIDmap[idx], posixID, &myStatus);
1293         if (myStatus == U_ZERO_ERROR) {
1294             return value;
1295         }
1296         else if (myStatus == U_USING_FALLBACK_WARNING) {
1297             fallbackValue = value;
1298         }
1299     }
1300 
1301     if (fallbackValue != (uint32_t)-1) {
1302         *status = U_USING_FALLBACK_WARNING;
1303         return fallbackValue;
1304     }
1305 
1306     /* no match found */
1307     *status = U_ILLEGAL_ARGUMENT_ERROR;
1308     return 0;   /* return international (root) */
1309 }
1310