// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 **********************************************************************
 *   Copyright (C) 1996-2016, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 *
 * Provides functionality for mapping between
 * LCID and Posix IDs or ICU locale to codepage
 *
 * Note: All classes and code in this file are
 *       intended for internal use only.
 *
 * Methods of interest:
 *   unsigned long convertToLCID(const char*);
 *   const char* convertToPosix(unsigned long);
 *
 * Kathleen Wilson, 4/30/96
 *
 *  Date        Name        Description
 *  3/11/97     aliu        Fixed off-by-one bug in assignment operator. Added
 *                          setId() method and safety check against
 *                          MAX_ID_LENGTH.
 * 04/23/99     stephen     Added C wrapper for convertToPosix.
 * 09/18/00     george      Removed the memory leaks.
 * 08/23/01     george      Convert to C
 */
29 
30 #include "locmap.h"
31 #include "bytesinkutil.h"
32 #include "charstr.h"
33 #include "cstring.h"
34 #include "cmemory.h"
35 #include "ulocimp.h"
36 #include "unicode/uloc.h"
37 
38 #if U_PLATFORM_HAS_WIN32_API && UCONFIG_USE_WINDOWS_LCID_MAPPING_API
39 #include <windows.h>
40 #include <winnls.h> // LCIDToLocaleName and LocaleNameToLCID
41 #endif
42 
/*
 * Note:
 * The mapping from Win32 locale ID numbers to POSIX locale strings should
 * be the faster one.
 *
 * Windows LCIDs are defined at https://msdn.microsoft.com/en-us/library/cc233965.aspx
 * [MS-LCID] Windows Language Code Identifier (LCID) Reference
 */
51 
/*
////////////////////////////////////////////////
//
// Internal Classes for LCID <--> POSIX Mapping
//
/////////////////////////////////////////////////
*/
59 
/**
 * One row of an LCID <--> POSIX mapping table.
 * Both members are const, so rows can only be created by static
 * initialization.
 */
typedef struct ILcidPosixElement
{
    const uint32_t hostID;          /* LCID in host format, e.g. 0x0409 */
    const char * const posixID;     /* POSIX locale ID string, e.g. "en_US" */
} ILcidPosixElement;
65 
/**
 * Descriptor for one per-language table: a pointer to the table's rows
 * together with the number of rows it contains.
 */
typedef struct ILcidPosixMap
{
    const uint32_t numRegions;                          /* number of rows in regionMaps */
    const struct ILcidPosixElement* const regionMaps;   /* first row of the table */
} ILcidPosixMap;
71 
72 
/*
/////////////////////////////////////////////////
//
// Easy macros to make the LCID <--> POSIX Mapping
//
/////////////////////////////////////////////////
*/
80 
/**
 * The standard one language/one country mapping for LCID.
 * Defines a complete two-row table named locmap_<languageID>: the first
 * row is the language alone (its LCID is derived with LANGUAGE_LCID()),
 * the second row is the language with the country.
 * The expansion already ends with ';', so invocations are written
 * without a trailing semicolon.
 * @param hostID LCID in host format such as 0x044d
 * @param languageID posix ID of just the language such as 'de'
 *                   (also used as the suffix of the table's variable name)
 * @param posixID posix ID of the language_TERRITORY such as 'de_CH'
 */
#define ILCID_POSIX_ELEMENT_ARRAY(hostID, languageID, posixID) \
static const ILcidPosixElement locmap_ ## languageID [] = { \
    {LANGUAGE_LCID(hostID), #languageID},     /* parent locale */ \
    {hostID, #posixID}, \
};
94 
/**
 * Define a subtable by ID.
 * Expands to the declaration head of a table named locmap_<id>, up to and
 * including the '='; the caller supplies the brace-enclosed row list and
 * the terminating ';'.
 * @param id the POSIX ID, either a language or language_TERRITORY
 */
#define ILCID_POSIX_SUBTABLE(id) \
static const ILcidPosixElement locmap_ ## id [] =
101 
102 
/**
 * Create the map for the posixID. This macro supposes that the language string
 * name is the same as the global variable name, and that the first element
 * in the ILcidPosixElement is just the language.
 * Expands to an ILcidPosixMap initializer: {row count, pointer to rows}.
 * @param _posixID the suffix of the locmap_<...> table to reference; usually
 *                 the language code, but it must match whatever name the
 *                 table was defined with.
 */
#define ILCID_POSIX_MAP(_posixID) \
    {UPRV_LENGTHOF(locmap_ ## _posixID), locmap_ ## _posixID}
111 
/*
////////////////////////////////////////////
//
// Create the table of LCID to POSIX Mapping
// None of it should be dynamically created.
//
// Keep static locale variables inside the function so that
// it can be created properly during static init.
//
// Note: This table should be updated periodically. Check the [MS-LCID] Windows Language Code Identifier
//       (LCID) Reference defined at https://msdn.microsoft.com/en-us/library/cc233965.aspx
//
//       Microsoft is moving away from LCID in favor of locale name as of Vista.  This table needs to be
//       maintained for support of older Windows versions.
//       Update: Windows 7 (091130)
//
// Note: Microsoft assigns a different LCID if a locale has a sorting variant. POSIX IDs below may contain
//       @collation=XXX, but no other keywords are allowed (at least for now). When uprv_convertToLCID() is
//       called from uloc_getLCID(), keywords other than collation are already removed. If we really need
//       to support other keywords in this mapping data, we must update the implementation.
////////////////////////////////////////////
*/

// TODO: For Windows ideally this table would be a list of exceptions rather than a complete list as
// LocaleNameToLCID and LCIDToLocaleName provide 90% of these.
137 
138 ILCID_POSIX_ELEMENT_ARRAY(0x0436, af, af_ZA)
139 
ILCID_POSIX_SUBTABLE(ar)140 ILCID_POSIX_SUBTABLE(ar) {
141     {0x01,   "ar"},
142     {0x3801, "ar_AE"},
143     {0x3c01, "ar_BH"},
144     {0x1401, "ar_DZ"},
145     {0x0c01, "ar_EG"},
146     {0x0801, "ar_IQ"},
147     {0x2c01, "ar_JO"},
148     {0x3401, "ar_KW"},
149     {0x3001, "ar_LB"},
150     {0x1001, "ar_LY"},
151     {0x1801, "ar_MA"},
152     {0x1801, "ar_MO"},
153     {0x2001, "ar_OM"},
154     {0x4001, "ar_QA"},
155     {0x0401, "ar_SA"},
156     {0x2801, "ar_SY"},
157     {0x1c01, "ar_TN"},
158     {0x2401, "ar_YE"}
159 };
160 
161 ILCID_POSIX_ELEMENT_ARRAY(0x044d, as, as_IN)
162 ILCID_POSIX_ELEMENT_ARRAY(0x045e, am, am_ET)
163 ILCID_POSIX_ELEMENT_ARRAY(0x047a, arn,arn_CL)
164 
ILCID_POSIX_SUBTABLE(az)165 ILCID_POSIX_SUBTABLE(az) {
166     {0x2c,   "az"},
167     {0x082c, "az_Cyrl_AZ"},  /* Cyrillic based */
168     {0x742c, "az_Cyrl"},  /* Cyrillic based */
169     {0x042c, "az_Latn_AZ"}, /* Latin based */
170     {0x782c, "az_Latn"}, /* Latin based */
171     {0x042c, "az_AZ"} /* Latin based */
172 };
173 
174 ILCID_POSIX_ELEMENT_ARRAY(0x046d, ba, ba_RU)
175 ILCID_POSIX_ELEMENT_ARRAY(0x0423, be, be_BY)
176 
/*ILCID_POSIX_SUBTABLE(ber) {
    {0x5f,   "ber"},
    {0x045f, "ber_Arab_DZ"},
    {0x045f, "ber_Arab"},
    {0x085f, "ber_Latn_DZ"},
    {0x085f, "ber_Latn"}
};*/
184 
185 ILCID_POSIX_ELEMENT_ARRAY(0x0402, bg, bg_BG)
186 
ILCID_POSIX_SUBTABLE(bin)187 ILCID_POSIX_SUBTABLE(bin) {
188     {0x66, "bin"},
189     {0x0466, "bin_NG"}
190 };
191 
ILCID_POSIX_SUBTABLE(bn)192 ILCID_POSIX_SUBTABLE(bn) {
193     {0x45,   "bn"},
194     {0x0845, "bn_BD"},
195     {0x0445, "bn_IN"}
196 };
197 
ILCID_POSIX_SUBTABLE(bo)198 ILCID_POSIX_SUBTABLE(bo) {
199     {0x51,   "bo"},
200     {0x0851, "bo_BT"},
201     {0x0451, "bo_CN"},
202     {0x0c51, "dz_BT"}
203 };
204 
205 ILCID_POSIX_ELEMENT_ARRAY(0x047e, br, br_FR)
206 
ILCID_POSIX_SUBTABLE(ca)207 ILCID_POSIX_SUBTABLE(ca) {
208     {0x03,   "ca"},
209     {0x0403, "ca_ES"},
210     {0x0803, "ca_ES_VALENCIA"}
211 };
212 
213 ILCID_POSIX_ELEMENT_ARRAY(0x0483, co, co_FR)
214 
ILCID_POSIX_SUBTABLE(chr)215 ILCID_POSIX_SUBTABLE(chr) {
216     {0x05c,  "chr"},
217     {0x7c5c, "chr_Cher"},
218     {0x045c, "chr_Cher_US"},
219     {0x045c, "chr_US"}
220 };
221 
222 // ICU has chosen different names for these.
ILCID_POSIX_SUBTABLE(ckb)223 ILCID_POSIX_SUBTABLE(ckb) {
224     {0x92,   "ckb"},
225     {0x7c92, "ckb_Arab"},
226     {0x0492, "ckb_Arab_IQ"}
227 };
228 
229 /* Declared as cs_CZ to get around compiler errors on z/OS, which defines cs as a function */
230 ILCID_POSIX_ELEMENT_ARRAY(0x0405, cs, cs_CZ)
231 
232 ILCID_POSIX_ELEMENT_ARRAY(0x0452, cy, cy_GB)
233 ILCID_POSIX_ELEMENT_ARRAY(0x0406, da, da_DK)
234 
235 // Windows doesn't know POSIX or BCP47 Unicode phonebook sort names
ILCID_POSIX_SUBTABLE(de)236 ILCID_POSIX_SUBTABLE(de) {
237     {0x07,   "de"},
238     {0x0c07, "de_AT"},
239     {0x0807, "de_CH"},
240     {0x0407, "de_DE"},
241     {0x1407, "de_LI"},
242     {0x1007, "de_LU"},
243     {0x10407,"de_DE@collation=phonebook"},  /*This is really de_DE_PHONEBOOK on Windows*/
244     {0x10407,"de@collation=phonebook"}  /*This is really de_DE_PHONEBOOK on Windows*/
245 };
246 
247 ILCID_POSIX_ELEMENT_ARRAY(0x0465, dv, dv_MV)
248 ILCID_POSIX_ELEMENT_ARRAY(0x0408, el, el_GR)
249 
250 // Windows uses an empty string for 'invariant'
ILCID_POSIX_SUBTABLE(en)251 ILCID_POSIX_SUBTABLE(en) {
252     {0x09,   "en"},
253     {0x0c09, "en_AU"},
254     {0x2809, "en_BZ"},
255     {0x1009, "en_CA"},
256     {0x0809, "en_GB"},
257     {0x3c09, "en_HK"},
258     {0x3809, "en_ID"},
259     {0x1809, "en_IE"},
260     {0x4009, "en_IN"},
261     {0x2009, "en_JM"},
262     {0x4409, "en_MY"},
263     {0x1409, "en_NZ"},
264     {0x3409, "en_PH"},
265     {0x4809, "en_SG"},
266     {0x2C09, "en_TT"},
267     {0x0409, "en_US"},
268     {0x007f, "en_US_POSIX"}, /* duplicate for round-tripping */
269     {0x2409, "en_029"},
270     {0x1c09, "en_ZA"},
271     {0x3009, "en_ZW"},
272     {0x2409, "en_VI"},  /* Virgin Islands AKA Caribbean Islands (en_CB). On Windows8+ This is 0x1000 or dynamically assigned */
273     {0x0409, "en_AS"},  /* Alias for en_US. Leave last.  On Windows8+ This is 0x1000 or dynamically assigned */
274     {0x0409, "en_GU"},  /* Alias for en_US. Leave last.  On Windows8+ This is 0x1000 or dynamically assigned */
275     {0x0409, "en_MH"},  /* Alias for en_US. Leave last.  On Windows8+ This is 0x1000 or dynamically assigned */
276     {0x0409, "en_MP"},  /* Alias for en_US. Leave last.  On Windows8+ This is 0x1000 or dynamically assigned */
277     {0x0409, "en_UM"}   /* Alias for en_US. Leave last.  On Windows8+ This is 0x1000 or dynamically assigned */
278 };
279 
ILCID_POSIX_SUBTABLE(en_US_POSIX)280 ILCID_POSIX_SUBTABLE(en_US_POSIX) {
281     {0x007f, "en_US_POSIX"} /* duplicate for roundtripping */
282 };
283 
284 // Windows doesn't know POSIX or BCP47 Unicode traditional sort names
ILCID_POSIX_SUBTABLE(es)285 ILCID_POSIX_SUBTABLE(es) {
286     {0x0a,   "es"},
287     {0x2c0a, "es_AR"},
288     {0x400a, "es_BO"},
289     {0x340a, "es_CL"},
290     {0x240a, "es_CO"},
291     {0x140a, "es_CR"},
292     {0x5c0a, "es_CU"},
293     {0x1c0a, "es_DO"},
294     {0x300a, "es_EC"},
295     {0x0c0a, "es_ES"},      /*Modern sort.*/
296     {0x100a, "es_GT"},
297     {0x480a, "es_HN"},
298     {0x080a, "es_MX"},
299     {0x4c0a, "es_NI"},
300     {0x180a, "es_PA"},
301     {0x280a, "es_PE"},
302     {0x500a, "es_PR"},
303     {0x3c0a, "es_PY"},
304     {0x440a, "es_SV"},
305     {0x540a, "es_US"},
306     {0x380a, "es_UY"},
307     {0x200a, "es_VE"},
308     {0x580a, "es_419"},
309     {0x040a, "es_ES@collation=traditional"},
310     {0x040a, "es@collation=traditional"}        // Windows will treat this as es-ES@collation=traditional
311 };
312 
313 ILCID_POSIX_ELEMENT_ARRAY(0x0425, et, et_EE)
314 ILCID_POSIX_ELEMENT_ARRAY(0x042d, eu, eu_ES)
315 
316 /* ISO-639 doesn't distinguish between Persian and Dari.*/
ILCID_POSIX_SUBTABLE(fa)317 ILCID_POSIX_SUBTABLE(fa) {
318     {0x29,   "fa"},
319     {0x0429, "fa_IR"},  /* Persian/Farsi (Iran) */
320     {0x048c, "fa_AF"}   /* Persian/Dari (Afghanistan) */
321 };
322 
323 
324 /* duplicate for roundtripping */
ILCID_POSIX_SUBTABLE(fa_AF)325 ILCID_POSIX_SUBTABLE(fa_AF) {
326     {0x8c,   "fa_AF"},  /* Persian/Dari (Afghanistan) */
327     {0x048c, "fa_AF"}   /* Persian/Dari (Afghanistan) */
328 };
329 
ILCID_POSIX_SUBTABLE(ff)330 ILCID_POSIX_SUBTABLE(ff) {
331     {0x67,   "ff"},
332     {0x7c67, "ff_Latn"},
333     {0x0867, "ff_Latn_SN"},
334     {0x0467, "ff_NG"}
335 };
336 
337 ILCID_POSIX_ELEMENT_ARRAY(0x040b, fi, fi_FI)
338 ILCID_POSIX_ELEMENT_ARRAY(0x0464, fil,fil_PH)
339 ILCID_POSIX_ELEMENT_ARRAY(0x0438, fo, fo_FO)
340 
ILCID_POSIX_SUBTABLE(fr)341 ILCID_POSIX_SUBTABLE(fr) {
342     {0x0c,   "fr"},
343     {0x080c, "fr_BE"},
344     {0x0c0c, "fr_CA"},
345     {0x240c, "fr_CD"},
346     {0x240c, "fr_CG"},
347     {0x100c, "fr_CH"},
348     {0x300c, "fr_CI"},
349     {0x2c0c, "fr_CM"},
350     {0x040c, "fr_FR"},
351     {0x3c0c, "fr_HT"},
352     {0x140c, "fr_LU"},
353     {0x380c, "fr_MA"},
354     {0x180c, "fr_MC"},
355     {0x340c, "fr_ML"},
356     {0x200c, "fr_RE"},
357     {0x280c, "fr_SN"},
358     {0xe40c, "fr_015"},
359     {0x1c0c, "fr_029"}
360 };
361 
362 ILCID_POSIX_ELEMENT_ARRAY(0x0467, fuv, fuv_NG)
363 
364 ILCID_POSIX_ELEMENT_ARRAY(0x0462, fy, fy_NL)
365 
ILCID_POSIX_SUBTABLE(ga)366 ILCID_POSIX_SUBTABLE(ga) { /* Gaelic (Ireland) */
367     {0x3c,   "ga"},
368     {0x083c, "ga_IE"},
369     {0x043c, "gd_GB"}
370 };
371 
ILCID_POSIX_SUBTABLE(gd)372 ILCID_POSIX_SUBTABLE(gd) { /* Gaelic (Scotland) */
373     {0x91,   "gd"},
374     {0x0491, "gd_GB"}
375 };
376 
377 ILCID_POSIX_ELEMENT_ARRAY(0x0456, gl, gl_ES)
378 ILCID_POSIX_ELEMENT_ARRAY(0x0447, gu, gu_IN)
379 ILCID_POSIX_ELEMENT_ARRAY(0x0474, gn, gn_PY)
380 ILCID_POSIX_ELEMENT_ARRAY(0x0484, gsw,gsw_FR)
381 
ILCID_POSIX_SUBTABLE(ha)382 ILCID_POSIX_SUBTABLE(ha) {
383     {0x68,   "ha"},
384     {0x7c68, "ha_Latn"},
385     {0x0468, "ha_Latn_NG"},
386 };
387 
388 ILCID_POSIX_ELEMENT_ARRAY(0x0475, haw,haw_US)
389 ILCID_POSIX_ELEMENT_ARRAY(0x040d, he, he_IL)
390 ILCID_POSIX_ELEMENT_ARRAY(0x0439, hi, hi_IN)
391 
392 /* This LCID is really four different locales.*/
ILCID_POSIX_SUBTABLE(hr)393 ILCID_POSIX_SUBTABLE(hr) {
394     {0x1a,   "hr"},
395     {0x141a, "bs_Latn_BA"},  /* Bosnian, Bosnia and Herzegovina */
396     {0x681a, "bs_Latn"},  /* Bosnian, Bosnia and Herzegovina */
397     {0x141a, "bs_BA"},  /* Bosnian, Bosnia and Herzegovina */
398     {0x781a, "bs"},     /* Bosnian */
399     {0x201a, "bs_Cyrl_BA"},  /* Bosnian, Bosnia and Herzegovina */
400     {0x641a, "bs_Cyrl"},  /* Bosnian, Bosnia and Herzegovina */
401     {0x101a, "hr_BA"},  /* Croatian in Bosnia */
402     {0x041a, "hr_HR"},  /* Croatian*/
403     {0x2c1a, "sr_Latn_ME"},
404     {0x241a, "sr_Latn_RS"},
405     {0x181a, "sr_Latn_BA"}, /* Serbo-Croatian in Bosnia */
406     {0x081a, "sr_Latn_CS"}, /* Serbo-Croatian*/
407     {0x701a, "sr_Latn"},    /* It's 0x1a or 0x081a, pick one to make the test program happy. */
408     {0x1c1a, "sr_Cyrl_BA"}, /* Serbo-Croatian in Bosnia */
409     {0x0c1a, "sr_Cyrl_CS"}, /* Serbian*/
410     {0x301a, "sr_Cyrl_ME"},
411     {0x281a, "sr_Cyrl_RS"},
412     {0x6c1a, "sr_Cyrl"},    /* It's 0x1a or 0x0c1a, pick one to make the test program happy. */
413     {0x7c1a, "sr"}          /* In CLDR sr is sr_Cyrl. */
414 };
415 
ILCID_POSIX_SUBTABLE(hsb)416 ILCID_POSIX_SUBTABLE(hsb) {
417     {0x2E,   "hsb"},
418     {0x042E, "hsb_DE"},
419     {0x082E, "dsb_DE"},
420     {0x7C2E, "dsb"},
421 };
422 
423 ILCID_POSIX_ELEMENT_ARRAY(0x040e, hu, hu_HU)
424 ILCID_POSIX_ELEMENT_ARRAY(0x042b, hy, hy_AM)
425 
ILCID_POSIX_SUBTABLE(ibb)426 ILCID_POSIX_SUBTABLE(ibb) {
427     {0x69, "ibb"},
428     {0x0469, "ibb_NG"}
429 };
430 
431 ILCID_POSIX_ELEMENT_ARRAY(0x0421, id, id_ID)
432 ILCID_POSIX_ELEMENT_ARRAY(0x0470, ig, ig_NG)
433 ILCID_POSIX_ELEMENT_ARRAY(0x0478, ii, ii_CN)
434 ILCID_POSIX_ELEMENT_ARRAY(0x040f, is, is_IS)
435 
ILCID_POSIX_SUBTABLE(it)436 ILCID_POSIX_SUBTABLE(it) {
437     {0x10,   "it"},
438     {0x0810, "it_CH"},
439     {0x0410, "it_IT"}
440 };
441 
ILCID_POSIX_SUBTABLE(iu)442 ILCID_POSIX_SUBTABLE(iu) {
443     {0x5d,   "iu"},
444     {0x045d, "iu_Cans_CA"},
445     {0x785d, "iu_Cans"},
446     {0x085d, "iu_Latn_CA"},
447     {0x7c5d, "iu_Latn"}
448 };
449 
450 ILCID_POSIX_ELEMENT_ARRAY(0x040d, iw, iw_IL)    /*Left in for compatibility*/
451 ILCID_POSIX_ELEMENT_ARRAY(0x0411, ja, ja_JP)
452 ILCID_POSIX_ELEMENT_ARRAY(0x0437, ka, ka_GE)
453 ILCID_POSIX_ELEMENT_ARRAY(0x043f, kk, kk_KZ)
454 ILCID_POSIX_ELEMENT_ARRAY(0x046f, kl, kl_GL)
455 ILCID_POSIX_ELEMENT_ARRAY(0x0453, km, km_KH)
456 ILCID_POSIX_ELEMENT_ARRAY(0x044b, kn, kn_IN)
457 
ILCID_POSIX_SUBTABLE(ko)458 ILCID_POSIX_SUBTABLE(ko) {
459     {0x12,   "ko"},
460     {0x0812, "ko_KP"},
461     {0x0412, "ko_KR"}
462 };
463 
464 ILCID_POSIX_ELEMENT_ARRAY(0x0457, kok, kok_IN)
465 ILCID_POSIX_ELEMENT_ARRAY(0x0471, kr,  kr_NG)
466 
ILCID_POSIX_SUBTABLE(ks)467 ILCID_POSIX_SUBTABLE(ks) {         /* We could add PK and CN too */
468     {0x60,   "ks"},
469     {0x0460, "ks_Arab_IN"},
470     {0x0860, "ks_Deva_IN"}
471 };
472 
473 ILCID_POSIX_ELEMENT_ARRAY(0x0440, ky, ky_KG)   /* Kyrgyz is spoken in Kyrgyzstan */
474 
ILCID_POSIX_SUBTABLE(la)475 ILCID_POSIX_SUBTABLE(la) {
476     {0x76,   "la"},
477     {0x0476, "la_001"},
478     {0x0476, "la_IT"}       /*Left in for compatibility*/
479 };
480 
481 ILCID_POSIX_ELEMENT_ARRAY(0x046e, lb, lb_LU)
482 ILCID_POSIX_ELEMENT_ARRAY(0x0454, lo, lo_LA)
483 ILCID_POSIX_ELEMENT_ARRAY(0x0427, lt, lt_LT)
484 ILCID_POSIX_ELEMENT_ARRAY(0x0426, lv, lv_LV)
485 ILCID_POSIX_ELEMENT_ARRAY(0x0481, mi, mi_NZ)
486 ILCID_POSIX_ELEMENT_ARRAY(0x042f, mk, mk_MK)
487 ILCID_POSIX_ELEMENT_ARRAY(0x044c, ml, ml_IN)
488 
ILCID_POSIX_SUBTABLE(mn)489 ILCID_POSIX_SUBTABLE(mn) {
490     {0x50,   "mn"},
491     {0x0450, "mn_MN"},
492     {0x7c50, "mn_Mong"},
493     {0x0850, "mn_Mong_CN"},
494     {0x0850, "mn_CN"},
495     {0x7850, "mn_Cyrl"},
496     {0x0c50, "mn_Mong_MN"}
497 };
498 
499 ILCID_POSIX_ELEMENT_ARRAY(0x0458, mni,mni_IN)
500 ILCID_POSIX_ELEMENT_ARRAY(0x047c, moh,moh_CA)
501 ILCID_POSIX_ELEMENT_ARRAY(0x044e, mr, mr_IN)
502 
ILCID_POSIX_SUBTABLE(ms)503 ILCID_POSIX_SUBTABLE(ms) {
504     {0x3e,   "ms"},
505     {0x083e, "ms_BN"},   /* Brunei Darussalam*/
506     {0x043e, "ms_MY"}    /* Malaysia*/
507 };
508 
509 ILCID_POSIX_ELEMENT_ARRAY(0x043a, mt, mt_MT)
510 ILCID_POSIX_ELEMENT_ARRAY(0x0455, my, my_MM)
511 
ILCID_POSIX_SUBTABLE(ne)512 ILCID_POSIX_SUBTABLE(ne) {
513     {0x61,   "ne"},
514     {0x0861, "ne_IN"},   /* India*/
515     {0x0461, "ne_NP"}    /* Nepal*/
516 };
517 
ILCID_POSIX_SUBTABLE(nl)518 ILCID_POSIX_SUBTABLE(nl) {
519     {0x13,   "nl"},
520     {0x0813, "nl_BE"},
521     {0x0413, "nl_NL"}
522 };
523 
524 /* The "no" locale split into nb and nn.  By default in ICU, "no" is nb.*/
525 // TODO: Not all of these are needed on Windows, but I don't know how ICU treats preferred ones here.
ILCID_POSIX_SUBTABLE(no)526 ILCID_POSIX_SUBTABLE(no) {
527     {0x14,   "no"},     /* really nb_NO - actually Windows differentiates between neutral (no region) and specific (with region) */
528     {0x7c14, "nb"},     /* really nb */
529     {0x0414, "nb_NO"},  /* really nb_NO. Keep first in the 414 list. */
530     {0x0414, "no_NO"},  /* really nb_NO */
531     {0x0814, "nn_NO"},  /* really nn_NO. Keep first in the 814 list.  */
532     {0x7814, "nn"},     /* It's 0x14 or 0x814, pick one to make the test program happy. */
533     {0x0814, "no_NO_NY"}/* really nn_NO */
534 };
535 
536 ILCID_POSIX_ELEMENT_ARRAY(0x046c, nso,nso_ZA)   /* TODO: Verify the ISO-639 code */
537 ILCID_POSIX_ELEMENT_ARRAY(0x0482, oc, oc_FR)
538 
ILCID_POSIX_SUBTABLE(om)539 ILCID_POSIX_SUBTABLE(om) { /* TODO: Verify the country */
540     {0x72,   "om"},
541     {0x0472, "om_ET"},
542     {0x0472, "gaz_ET"}
543 };
544 
545 /* Declared as or_IN to get around compiler errors*/
ILCID_POSIX_SUBTABLE(or_IN)546 ILCID_POSIX_SUBTABLE(or_IN) {
547     {0x48,   "or"},
548     {0x0448, "or_IN"},
549 };
550 
ILCID_POSIX_SUBTABLE(pa)551 ILCID_POSIX_SUBTABLE(pa) {
552     {0x46,   "pa"},
553     {0x0446, "pa_IN"},
554     {0x0846, "pa_Arab_PK"},
555     {0x0846, "pa_PK"}
556 };
557 
ILCID_POSIX_SUBTABLE(pap)558 ILCID_POSIX_SUBTABLE(pap) {
559     {0x79, "pap"},
560     {0x0479, "pap_029"},
561     {0x0479, "pap_AN"}     /*Left in for compatibility*/
562 };
563 
564 ILCID_POSIX_ELEMENT_ARRAY(0x0415, pl, pl_PL)
565 ILCID_POSIX_ELEMENT_ARRAY(0x0463, ps, ps_AF)
566 
ILCID_POSIX_SUBTABLE(pt)567 ILCID_POSIX_SUBTABLE(pt) {
568     {0x16,   "pt"},
569     {0x0416, "pt_BR"},
570     {0x0816, "pt_PT"}
571 };
572 
ILCID_POSIX_SUBTABLE(qu)573 ILCID_POSIX_SUBTABLE(qu) {
574     {0x6b,   "qu"},
575     {0x046b, "qu_BO"},
576     {0x086b, "qu_EC"},
577     {0x0C6b, "qu_PE"},
578     {0x046b, "quz_BO"},
579     {0x086b, "quz_EC"},
580     {0x0C6b, "quz_PE"}
581 };
582 
ILCID_POSIX_SUBTABLE(quc)583 ILCID_POSIX_SUBTABLE(quc) {
584     {0x93,   "quc"},
585     {0x0493, "quc_CO"},
586     /*
587         "quc_Latn_GT" is an exceptional case. Language ID of "quc"
588         is 0x93, but LCID of "quc_Latn_GT" is 0x486, which should be
589         under the group of "qut". "qut" is a retired ISO 639-3 language
590         code for West Central Quiche, and merged to "quc".
591         It looks Windows previously reserved "qut" for K'iche', but,
592         decided to use "quc" when adding a locale for K'iche' (Guatemala).
593 
594         This data structure used here assumes language ID bits in
595         LCID is unique for alphabetic language code. But this is not true
596         for "quc_Latn_GT". If we don't have the data below, LCID look up
597         by alphabetic locale ID (POSIX) will fail. The same entry is found
598         under "qut" below, which is required for reverse look up.
599     */
600     {0x0486, "quc_Latn_GT"}
601 };
602 
ILCID_POSIX_SUBTABLE(qut)603 ILCID_POSIX_SUBTABLE(qut) {
604     {0x86,   "qut"},
605     {0x0486, "qut_GT"},
606     /*
607         See the note in "quc" above.
608     */
609     {0x0486, "quc_Latn_GT"}
610 };
611 
612 ILCID_POSIX_ELEMENT_ARRAY(0x0417, rm, rm_CH)
613 
ILCID_POSIX_SUBTABLE(ro)614 ILCID_POSIX_SUBTABLE(ro) {
615     {0x18,   "ro"},
616     {0x0418, "ro_RO"},
617     {0x0818, "ro_MD"}
618 };
619 
620 // TODO: This is almost certainly 'wrong'.  0 in Windows is a synonym for LOCALE_USER_DEFAULT.
621 // More likely this is a similar concept to the Windows 0x7f Invariant locale ""
622 // (Except that it's not invariant in ICU)
ILCID_POSIX_SUBTABLE(root)623 ILCID_POSIX_SUBTABLE(root) {
624     {0x00,   "root"}
625 };
626 
ILCID_POSIX_SUBTABLE(ru)627 ILCID_POSIX_SUBTABLE(ru) {
628     {0x19,   "ru"},
629     {0x0419, "ru_RU"},
630     {0x0819, "ru_MD"}
631 };
632 
633 ILCID_POSIX_ELEMENT_ARRAY(0x0487, rw, rw_RW)
634 ILCID_POSIX_ELEMENT_ARRAY(0x044f, sa, sa_IN)
635 ILCID_POSIX_ELEMENT_ARRAY(0x0485, sah,sah_RU)
636 
ILCID_POSIX_SUBTABLE(sd)637 ILCID_POSIX_SUBTABLE(sd) {
638     {0x59,   "sd"},
639     {0x0459, "sd_Deva_IN"},
640     {0x0459, "sd_IN"},
641     {0x0859, "sd_Arab_PK"},
642     {0x0859, "sd_PK"},
643     {0x7c59, "sd_Arab"}
644 };
645 
ILCID_POSIX_SUBTABLE(se)646 ILCID_POSIX_SUBTABLE(se) {
647     {0x3b,   "se"},
648     {0x0c3b, "se_FI"},
649     {0x043b, "se_NO"},
650     {0x083b, "se_SE"},
651     {0x783b, "sma"},
652     {0x183b, "sma_NO"},
653     {0x1c3b, "sma_SE"},
654     {0x7c3b, "smj"},
655     {0x703b, "smn"},
656     {0x743b, "sms"},
657     {0x103b, "smj_NO"},
658     {0x143b, "smj_SE"},
659     {0x243b, "smn_FI"},
660     {0x203b, "sms_FI"},
661 };
662 
663 ILCID_POSIX_ELEMENT_ARRAY(0x045b, si, si_LK)
664 ILCID_POSIX_ELEMENT_ARRAY(0x041b, sk, sk_SK)
665 ILCID_POSIX_ELEMENT_ARRAY(0x0424, sl, sl_SI)
666 
ILCID_POSIX_SUBTABLE(so)667 ILCID_POSIX_SUBTABLE(so) {
668     {0x77,   "so"},
669     {0x0477, "so_SO"}
670 };
671 
672 ILCID_POSIX_ELEMENT_ARRAY(0x041c, sq, sq_AL)
673 ILCID_POSIX_ELEMENT_ARRAY(0x0430, st, st_ZA)
674 
ILCID_POSIX_SUBTABLE(sv)675 ILCID_POSIX_SUBTABLE(sv) {
676     {0x1d,   "sv"},
677     {0x081d, "sv_FI"},
678     {0x041d, "sv_SE"}
679 };
680 
681 ILCID_POSIX_ELEMENT_ARRAY(0x0441, sw, sw_KE)
682 ILCID_POSIX_ELEMENT_ARRAY(0x045A, syr, syr_SY)
683 
ILCID_POSIX_SUBTABLE(ta)684 ILCID_POSIX_SUBTABLE(ta) {
685     {0x49,   "ta"},
686     {0x0449, "ta_IN"},
687     {0x0849, "ta_LK"}
688 };
689 
690 ILCID_POSIX_ELEMENT_ARRAY(0x044a, te, te_IN)
691 
692 /* Cyrillic based by default */
ILCID_POSIX_SUBTABLE(tg)693 ILCID_POSIX_SUBTABLE(tg) {
694     {0x28,   "tg"},
695     {0x7c28, "tg_Cyrl"},
696     {0x0428, "tg_Cyrl_TJ"}
697 };
698 
699 ILCID_POSIX_ELEMENT_ARRAY(0x041e, th, th_TH)
700 
ILCID_POSIX_SUBTABLE(ti)701 ILCID_POSIX_SUBTABLE(ti) {
702     {0x73,   "ti"},
703     {0x0873, "ti_ER"},
704     {0x0473, "ti_ET"}
705 };
706 
707 ILCID_POSIX_ELEMENT_ARRAY(0x0442, tk, tk_TM)
708 
ILCID_POSIX_SUBTABLE(tn)709 ILCID_POSIX_SUBTABLE(tn) {
710     {0x32,   "tn"},
711     {0x0832, "tn_BW"},
712     {0x0432, "tn_ZA"}
713 };
714 
715 ILCID_POSIX_ELEMENT_ARRAY(0x041f, tr, tr_TR)
716 ILCID_POSIX_ELEMENT_ARRAY(0x0431, ts, ts_ZA)
717 ILCID_POSIX_ELEMENT_ARRAY(0x0444, tt, tt_RU)
718 
ILCID_POSIX_SUBTABLE(tzm)719 ILCID_POSIX_SUBTABLE(tzm) {
720     {0x5f,   "tzm"},
721     {0x7c5f, "tzm_Latn"},
722     {0x085f, "tzm_Latn_DZ"},
723     {0x105f, "tzm_Tfng_MA"},
724     {0x045f, "tzm_Arab_MA"},
725     {0x045f, "tmz"}
726 };
727 
ILCID_POSIX_SUBTABLE(ug)728 ILCID_POSIX_SUBTABLE(ug) {
729     {0x80,   "ug"},
730     {0x0480, "ug_CN"},
731     {0x0480, "ug_Arab_CN"}
732 };
733 
734 ILCID_POSIX_ELEMENT_ARRAY(0x0422, uk, uk_UA)
735 
ILCID_POSIX_SUBTABLE(ur)736 ILCID_POSIX_SUBTABLE(ur) {
737     {0x20,   "ur"},
738     {0x0820, "ur_IN"},
739     {0x0420, "ur_PK"}
740 };
741 
ILCID_POSIX_SUBTABLE(uz)742 ILCID_POSIX_SUBTABLE(uz) {
743     {0x43,   "uz"},
744     {0x0843, "uz_Cyrl_UZ"},  /* Cyrillic based */
745     {0x7843, "uz_Cyrl"},  /* Cyrillic based */
746     {0x0843, "uz_UZ"},  /* Cyrillic based */
747     {0x0443, "uz_Latn_UZ"}, /* Latin based */
748     {0x7c43, "uz_Latn"} /* Latin based */
749 };
750 
ILCID_POSIX_SUBTABLE(ve)751 ILCID_POSIX_SUBTABLE(ve) { /* TODO: Verify the country */
752     {0x33,   "ve"},
753     {0x0433, "ve_ZA"},
754     {0x0433, "ven_ZA"}
755 };
756 
757 ILCID_POSIX_ELEMENT_ARRAY(0x042a, vi, vi_VN)
758 ILCID_POSIX_ELEMENT_ARRAY(0x0488, wo, wo_SN)
759 ILCID_POSIX_ELEMENT_ARRAY(0x0434, xh, xh_ZA)
760 
ILCID_POSIX_SUBTABLE(yi)761 ILCID_POSIX_SUBTABLE(yi) {
762     {0x003d, "yi"},
763     {0x043d, "yi_001"}
764 };
765 
766 ILCID_POSIX_ELEMENT_ARRAY(0x046a, yo, yo_NG)
767 
768 // Windows & ICU tend to different names for some of these
769 // TODO: Windows probably does not need all of these entries, but I don't know how the precedence works.
ILCID_POSIX_SUBTABLE(zh)770 ILCID_POSIX_SUBTABLE(zh) {
771     {0x0004, "zh_Hans"},
772     {0x7804, "zh"},
773     {0x0804, "zh_CN"},
774     {0x0804, "zh_Hans_CN"},
775     {0x0c04, "zh_Hant_HK"},
776     {0x0c04, "zh_HK"},
777     {0x1404, "zh_Hant_MO"},
778     {0x1404, "zh_MO"},
779     {0x1004, "zh_Hans_SG"},
780     {0x1004, "zh_SG"},
781     {0x0404, "zh_Hant_TW"},
782     {0x7c04, "zh_Hant"},
783     {0x0404, "zh_TW"},
784     {0x30404,"zh_Hant_TW"},     /* Bopomofo order */
785     {0x30404,"zh_TW"},          /* Bopomofo order */
786     {0x20004,"zh@collation=stroke"},
787     {0x20404,"zh_Hant@collation=stroke"},
788     {0x20404,"zh_Hant_TW@collation=stroke"},
789     {0x20404,"zh_TW@collation=stroke"},
790     {0x20804,"zh_Hans@collation=stroke"},
791     {0x20804,"zh_Hans_CN@collation=stroke"},
792     {0x20804,"zh_CN@collation=stroke"}
793     // TODO: Alternate collations for other LCIDs are missing, eg: 0x50804
794 };
795 
796 ILCID_POSIX_ELEMENT_ARRAY(0x0435, zu, zu_ZA)
797 
798 /* This must be static and grouped by LCID. */
799 static const ILcidPosixMap gPosixIDmap[] = {
800     ILCID_POSIX_MAP(af),    /*  af  Afrikaans                 0x36 */
801     ILCID_POSIX_MAP(am),    /*  am  Amharic                   0x5e */
802     ILCID_POSIX_MAP(ar),    /*  ar  Arabic                    0x01 */
803     ILCID_POSIX_MAP(arn),   /*  arn Araucanian/Mapudungun     0x7a */
804     ILCID_POSIX_MAP(as),    /*  as  Assamese                  0x4d */
805     ILCID_POSIX_MAP(az),    /*  az  Azerbaijani               0x2c */
806     ILCID_POSIX_MAP(ba),    /*  ba  Bashkir                   0x6d */
807     ILCID_POSIX_MAP(be),    /*  be  Belarusian                0x23 */
808 /*    ILCID_POSIX_MAP(ber),     ber Berber/Tamazight          0x5f */
809     ILCID_POSIX_MAP(bg),    /*  bg  Bulgarian                 0x02 */
810     ILCID_POSIX_MAP(bin),   /*  bin Edo                       0x66 */
811     ILCID_POSIX_MAP(bn),    /*  bn  Bengali; Bangla           0x45 */
812     ILCID_POSIX_MAP(bo),    /*  bo  Tibetan                   0x51 */
813     ILCID_POSIX_MAP(br),    /*  br  Breton                    0x7e */
814     ILCID_POSIX_MAP(ca),    /*  ca  Catalan                   0x03 */
815     ILCID_POSIX_MAP(chr),   /*  chr Cherokee                  0x5c */
816     ILCID_POSIX_MAP(ckb),   /*  ckb Sorani (Central Kurdish)  0x92 */
817     ILCID_POSIX_MAP(co),    /*  co  Corsican                  0x83 */
818     ILCID_POSIX_MAP(cs),    /*  cs  Czech                     0x05 */
819     ILCID_POSIX_MAP(cy),    /*  cy  Welsh                     0x52 */
820     ILCID_POSIX_MAP(da),    /*  da  Danish                    0x06 */
821     ILCID_POSIX_MAP(de),    /*  de  German                    0x07 */
822     ILCID_POSIX_MAP(dv),    /*  dv  Divehi                    0x65 */
823     ILCID_POSIX_MAP(el),    /*  el  Greek                     0x08 */
824     ILCID_POSIX_MAP(en),    /*  en  English                   0x09 */
825     ILCID_POSIX_MAP(en_US_POSIX), /*    invariant             0x7f */
826     ILCID_POSIX_MAP(es),    /*  es  Spanish                   0x0a */
827     ILCID_POSIX_MAP(et),    /*  et  Estonian                  0x25 */
828     ILCID_POSIX_MAP(eu),    /*  eu  Basque                    0x2d */
829     ILCID_POSIX_MAP(fa),    /*  fa  Persian/Farsi             0x29 */
830     ILCID_POSIX_MAP(fa_AF), /*  fa  Persian/Dari              0x8c */
831     ILCID_POSIX_MAP(ff),    /*  ff  Fula                      0x67 */
832     ILCID_POSIX_MAP(fi),    /*  fi  Finnish                   0x0b */
833     ILCID_POSIX_MAP(fil),   /*  fil Filipino                  0x64 */
834     ILCID_POSIX_MAP(fo),    /*  fo  Faroese                   0x38 */
835     ILCID_POSIX_MAP(fr),    /*  fr  French                    0x0c */
836     ILCID_POSIX_MAP(fuv),   /*  fuv Fulfulde - Nigeria        0x67 */
837     ILCID_POSIX_MAP(fy),    /*  fy  Frisian                   0x62 */
838     ILCID_POSIX_MAP(ga),    /*  *   Gaelic (Ireland,Scotland) 0x3c */
839     ILCID_POSIX_MAP(gd),    /*  gd  Gaelic (United Kingdom)   0x91 */
840     ILCID_POSIX_MAP(gl),    /*  gl  Galician                  0x56 */
841     ILCID_POSIX_MAP(gn),    /*  gn  Guarani                   0x74 */
842     ILCID_POSIX_MAP(gsw),   /*  gsw Alemanic/Alsatian/Swiss German 0x84 */
843     ILCID_POSIX_MAP(gu),    /*  gu  Gujarati                  0x47 */
844     ILCID_POSIX_MAP(ha),    /*  ha  Hausa                     0x68 */
845     ILCID_POSIX_MAP(haw),   /*  haw Hawaiian                  0x75 */
846     ILCID_POSIX_MAP(he),    /*  he  Hebrew (formerly iw)      0x0d */
847     ILCID_POSIX_MAP(hi),    /*  hi  Hindi                     0x39 */
848     ILCID_POSIX_MAP(hr),    /*  *   Croatian and others       0x1a */
849     ILCID_POSIX_MAP(hsb),   /*  hsb Upper Sorbian             0x2e */
850     ILCID_POSIX_MAP(hu),    /*  hu  Hungarian                 0x0e */
851     ILCID_POSIX_MAP(hy),    /*  hy  Armenian                  0x2b */
852     ILCID_POSIX_MAP(ibb),   /*  ibb Ibibio - Nigeria          0x69 */
853     ILCID_POSIX_MAP(id),    /*  id  Indonesian (formerly in)  0x21 */
854     ILCID_POSIX_MAP(ig),    /*  ig  Igbo                      0x70 */
855     ILCID_POSIX_MAP(ii),    /*  ii  Sichuan Yi                0x78 */
856     ILCID_POSIX_MAP(is),    /*  is  Icelandic                 0x0f */
857     ILCID_POSIX_MAP(it),    /*  it  Italian                   0x10 */
858     ILCID_POSIX_MAP(iu),    /*  iu  Inuktitut                 0x5d */
859     ILCID_POSIX_MAP(iw),    /*  iw  Hebrew                    0x0d */
860     ILCID_POSIX_MAP(ja),    /*  ja  Japanese                  0x11 */
861     ILCID_POSIX_MAP(ka),    /*  ka  Georgian                  0x37 */
862     ILCID_POSIX_MAP(kk),    /*  kk  Kazakh                    0x3f */
863     ILCID_POSIX_MAP(kl),    /*  kl  Kalaallisut               0x6f */
864     ILCID_POSIX_MAP(km),    /*  km  Khmer                     0x53 */
865     ILCID_POSIX_MAP(kn),    /*  kn  Kannada                   0x4b */
866     ILCID_POSIX_MAP(ko),    /*  ko  Korean                    0x12 */
867     ILCID_POSIX_MAP(kok),   /*  kok Konkani                   0x57 */
868     ILCID_POSIX_MAP(kr),    /*  kr  Kanuri                    0x71 */
869     ILCID_POSIX_MAP(ks),    /*  ks  Kashmiri                  0x60 */
870     ILCID_POSIX_MAP(ky),    /*  ky  Kyrgyz                    0x40 */
871     ILCID_POSIX_MAP(lb),    /*  lb  Luxembourgish             0x6e */
872     ILCID_POSIX_MAP(la),    /*  la  Latin                     0x76 */
873     ILCID_POSIX_MAP(lo),    /*  lo  Lao                       0x54 */
874     ILCID_POSIX_MAP(lt),    /*  lt  Lithuanian                0x27 */
875     ILCID_POSIX_MAP(lv),    /*  lv  Latvian, Lettish          0x26 */
876     ILCID_POSIX_MAP(mi),    /*  mi  Maori                     0x81 */
877     ILCID_POSIX_MAP(mk),    /*  mk  Macedonian                0x2f */
878     ILCID_POSIX_MAP(ml),    /*  ml  Malayalam                 0x4c */
879     ILCID_POSIX_MAP(mn),    /*  mn  Mongolian                 0x50 */
880     ILCID_POSIX_MAP(mni),   /*  mni Manipuri                  0x58 */
881     ILCID_POSIX_MAP(moh),   /*  moh Mohawk                    0x7c */
882     ILCID_POSIX_MAP(mr),    /*  mr  Marathi                   0x4e */
883     ILCID_POSIX_MAP(ms),    /*  ms  Malay                     0x3e */
884     ILCID_POSIX_MAP(mt),    /*  mt  Maltese                   0x3a */
885     ILCID_POSIX_MAP(my),    /*  my  Burmese                   0x55 */
886 /*    ILCID_POSIX_MAP(nb),    //  no  Norwegian                 0x14 */
887     ILCID_POSIX_MAP(ne),    /*  ne  Nepali                    0x61 */
888     ILCID_POSIX_MAP(nl),    /*  nl  Dutch                     0x13 */
889 /*    ILCID_POSIX_MAP(nn),    //  no  Norwegian                 0x14 */
890     ILCID_POSIX_MAP(no),    /*  *   Norwegian                 0x14 */
891     ILCID_POSIX_MAP(nso),   /*  nso Sotho, Northern (Sepedi dialect) 0x6c */
892     ILCID_POSIX_MAP(oc),    /*  oc  Occitan                   0x82 */
893     ILCID_POSIX_MAP(om),    /*  om  Oromo                     0x72 */
894     ILCID_POSIX_MAP(or_IN), /*  or  Oriya                     0x48 */
895     ILCID_POSIX_MAP(pa),    /*  pa  Punjabi                   0x46 */
896     ILCID_POSIX_MAP(pap),   /*  pap Papiamentu                0x79 */
897     ILCID_POSIX_MAP(pl),    /*  pl  Polish                    0x15 */
898     ILCID_POSIX_MAP(ps),    /*  ps  Pashto                    0x63 */
899     ILCID_POSIX_MAP(pt),    /*  pt  Portuguese                0x16 */
900     ILCID_POSIX_MAP(qu),    /*  qu  Quechua                   0x6B */
901     ILCID_POSIX_MAP(quc),   /*  quc K'iche                    0x93 */
902     ILCID_POSIX_MAP(qut),   /*  qut K'iche                    0x86 */
903     ILCID_POSIX_MAP(rm),    /*  rm  Raeto-Romance/Romansh     0x17 */
904     ILCID_POSIX_MAP(ro),    /*  ro  Romanian                  0x18 */
905     ILCID_POSIX_MAP(root),  /*  root                          0x00 */
906     ILCID_POSIX_MAP(ru),    /*  ru  Russian                   0x19 */
907     ILCID_POSIX_MAP(rw),    /*  rw  Kinyarwanda               0x87 */
908     ILCID_POSIX_MAP(sa),    /*  sa  Sanskrit                  0x4f */
909     ILCID_POSIX_MAP(sah),   /*  sah Yakut                     0x85 */
910     ILCID_POSIX_MAP(sd),    /*  sd  Sindhi                    0x59 */
911     ILCID_POSIX_MAP(se),    /*  se  Sami                      0x3b */
912 /*    ILCID_POSIX_MAP(sh),    //  sh  Serbo-Croatian            0x1a */
913     ILCID_POSIX_MAP(si),    /*  si  Sinhalese                 0x5b */
914     ILCID_POSIX_MAP(sk),    /*  sk  Slovak                    0x1b */
915     ILCID_POSIX_MAP(sl),    /*  sl  Slovenian                 0x24 */
916     ILCID_POSIX_MAP(so),    /*  so  Somali                    0x77 */
917     ILCID_POSIX_MAP(sq),    /*  sq  Albanian                  0x1c */
918 /*    ILCID_POSIX_MAP(sr),    //  sr  Serbian                   0x1a */
919     ILCID_POSIX_MAP(st),    /*  st  Sutu                      0x30 */
920     ILCID_POSIX_MAP(sv),    /*  sv  Swedish                   0x1d */
921     ILCID_POSIX_MAP(sw),    /*  sw  Swahili                   0x41 */
922     ILCID_POSIX_MAP(syr),   /*  syr Syriac                    0x5A */
923     ILCID_POSIX_MAP(ta),    /*  ta  Tamil                     0x49 */
924     ILCID_POSIX_MAP(te),    /*  te  Telugu                    0x4a */
925     ILCID_POSIX_MAP(tg),    /*  tg  Tajik                     0x28 */
926     ILCID_POSIX_MAP(th),    /*  th  Thai                      0x1e */
927     ILCID_POSIX_MAP(ti),    /*  ti  Tigrigna                  0x73 */
928     ILCID_POSIX_MAP(tk),    /*  tk  Turkmen                   0x42 */
929     ILCID_POSIX_MAP(tn),    /*  tn  Tswana                    0x32 */
930     ILCID_POSIX_MAP(tr),    /*  tr  Turkish                   0x1f */
931     ILCID_POSIX_MAP(ts),    /*  ts  Tsonga                    0x31 */
932     ILCID_POSIX_MAP(tt),    /*  tt  Tatar                     0x44 */
933     ILCID_POSIX_MAP(tzm),   /*  tzm Tamazight                 0x5f */
934     ILCID_POSIX_MAP(ug),    /*  ug  Uighur                    0x80 */
935     ILCID_POSIX_MAP(uk),    /*  uk  Ukrainian                 0x22 */
936     ILCID_POSIX_MAP(ur),    /*  ur  Urdu                      0x20 */
937     ILCID_POSIX_MAP(uz),    /*  uz  Uzbek                     0x43 */
938     ILCID_POSIX_MAP(ve),    /*  ve  Venda                     0x33 */
939     ILCID_POSIX_MAP(vi),    /*  vi  Vietnamese                0x2a */
940     ILCID_POSIX_MAP(wo),    /*  wo  Wolof                     0x88 */
941     ILCID_POSIX_MAP(xh),    /*  xh  Xhosa                     0x34 */
942     ILCID_POSIX_MAP(yi),    /*  yi  Yiddish                   0x3d */
943     ILCID_POSIX_MAP(yo),    /*  yo  Yoruba                    0x6a */
944     ILCID_POSIX_MAP(zh),    /*  zh  Chinese                   0x04 */
945     ILCID_POSIX_MAP(zu),    /*  zu  Zulu                      0x35 */
946 };
947 
948 static const uint32_t gLocaleCount = UPRV_LENGTHOF(gPosixIDmap);
949 
/**
 * Measures how many leading characters two NUL-terminated IDs share.
 *
 * The comparison is byte-wise and case-sensitive. It stops at the first
 * position where the two strings differ, or at the end of id1 (which, since
 * every earlier character matched, is then also the end of id2).
 *
 * @param id1 first ID
 * @param id2 second ID
 * @return length of the common prefix; 0 if the very first characters differ
 *         or id1 is empty
 */
static int32_t
idCmp(const char* id1, const char* id2)
{
    int32_t common = 0;
    for (; id1[common] != 0 && id1[common] == id2[common]; ++common) {
        /* nothing to do: the loop header does the counting */
    }
    return common;
}
966 
967 /**
968  * Searches for a Windows LCID
969  *
970  * @param posixID the Posix style locale id.
971  * @param status gets set to U_ILLEGAL_ARGUMENT_ERROR when the Posix ID has
972  *               no equivalent Windows LCID.
973  * @return the LCID
974  */
975 static uint32_t
getHostID(const ILcidPosixMap * this_0,const char * posixID,UErrorCode * status)976 getHostID(const ILcidPosixMap *this_0, const char* posixID, UErrorCode* status)
977 {
978     int32_t bestIdx = 0;
979     int32_t bestIdxDiff = 0;
980     int32_t posixIDlen = (int32_t)uprv_strlen(posixID);
981     uint32_t idx;
982 
983     for (idx = 0; idx < this_0->numRegions; idx++ ) {
984         int32_t sameChars = idCmp(posixID, this_0->regionMaps[idx].posixID);
985         if (sameChars > bestIdxDiff && this_0->regionMaps[idx].posixID[sameChars] == 0) {
986             if (posixIDlen == sameChars) {
987                 /* Exact match */
988                 return this_0->regionMaps[idx].hostID;
989             }
990             bestIdxDiff = sameChars;
991             bestIdx = idx;
992         }
993     }
994     /* We asked for something unusual, like en_ZZ, and we try to return the number for the same language. */
995     /* We also have to make sure that sid and si and similar string subsets don't match. */
996     if ((posixID[bestIdxDiff] == '_' || posixID[bestIdxDiff] == '@')
997         && this_0->regionMaps[bestIdx].posixID[bestIdxDiff] == 0)
998     {
999         *status = U_USING_FALLBACK_WARNING;
1000         return this_0->regionMaps[bestIdx].hostID;
1001     }
1002 
1003     /*no match found */
1004     *status = U_ILLEGAL_ARGUMENT_ERROR;
1005     return this_0->regionMaps->hostID;
1006 }
1007 
1008 static const char*
getPosixID(const ILcidPosixMap * this_0,uint32_t hostID)1009 getPosixID(const ILcidPosixMap *this_0, uint32_t hostID)
1010 {
1011     uint32_t i;
1012     for (i = 0; i < this_0->numRegions; i++)
1013     {
1014         if (this_0->regionMaps[i].hostID == hostID)
1015         {
1016             return this_0->regionMaps[i].posixID;
1017         }
1018     }
1019 
1020     /* If you get here, then no matching region was found,
1021        so return the language id with the wild card region. */
1022     return this_0->regionMaps[0].posixID;
1023 }
1024 
1025 /*
1026 //////////////////////////////////////
1027 //
1028 // LCID --> POSIX
1029 //
1030 /////////////////////////////////////
1031 */
#if U_PLATFORM_HAS_WIN32_API && UCONFIG_USE_WINDOWS_LCID_MAPPING_API
/*
 * Various language tags needs to be changed:
 * quz -> qu
 * prs -> fa
 *
 * `buffer` must be a writable, NUL-terminated character array and is edited
 * in place; `len` is only used to make sure at least three characters are
 * available before the language subtag is inspected.
 *
 * Dropping the third character shifts the remainder of the string (including
 * its terminating NUL) one position to the left. Source and destination of
 * that shift overlap, so it is done with uprv_memmove; strcat-style copying
 * between overlapping objects is undefined behavior.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves like a
 * single statement and must be followed by a semicolon.
 */
#define FIX_LANGUAGE_ID_TAG(buffer, len) \
    do { \
        if ((len) >= 3) { \
            if ((buffer)[0] == 'q' && (buffer)[1] == 'u' && (buffer)[2] == 'z') { \
                uprv_memmove((buffer) + 2, (buffer) + 3, uprv_strlen((buffer) + 3) + 1); \
            } else if ((buffer)[0] == 'p' && (buffer)[1] == 'r' && (buffer)[2] == 's') { \
                (buffer)[0] = 'f'; (buffer)[1] = 'a'; \
                uprv_memmove((buffer) + 2, (buffer) + 3, uprv_strlen((buffer) + 3) + 1); \
            } \
        } \
    } while (0)

#endif
1050 
1051 U_CAPI int32_t
uprv_convertToPosix(uint32_t hostid,char * posixID,int32_t posixIDCapacity,UErrorCode * status)1052 uprv_convertToPosix(uint32_t hostid, char *posixID, int32_t posixIDCapacity, UErrorCode* status)
1053 {
1054     uint16_t langID;
1055     uint32_t localeIndex;
1056     UBool bLookup = TRUE;
1057     const char *pPosixID = NULL;
1058 
1059 #if U_PLATFORM_HAS_WIN32_API && UCONFIG_USE_WINDOWS_LCID_MAPPING_API
1060     static_assert(ULOC_FULLNAME_CAPACITY > LOCALE_NAME_MAX_LENGTH, "Windows locale names have smaller length than ICU locale names.");
1061 
1062     char locName[LOCALE_NAME_MAX_LENGTH] = {};
1063 
1064     // Note: Windows primary lang ID 0x92 in LCID is used for Central Kurdish and
1065     // GetLocaleInfo() maps such LCID to "ku". However, CLDR uses "ku" for
1066     // Northern Kurdish and "ckb" for Central Kurdish. For this reason, we cannot
1067     // use the Windows API to resolve locale ID for this specific case.
1068     if ((hostid & 0x3FF) != 0x92) {
1069         int32_t tmpLen = 0;
1070         char16_t windowsLocaleName[LOCALE_NAME_MAX_LENGTH] = {};
1071 
1072         // Note: LOCALE_ALLOW_NEUTRAL_NAMES was enabled in Windows7+, prior versions did not handle neutral (no-region) locale names.
1073         tmpLen = LCIDToLocaleName(hostid, (PWSTR)windowsLocaleName, UPRV_LENGTHOF(windowsLocaleName), LOCALE_ALLOW_NEUTRAL_NAMES);
1074         if (tmpLen > 1) {
1075             int32_t i = 0;
1076             // Only need to look up in table if have _, eg for de-de_phoneb type alternate sort.
1077             bLookup = FALSE;
1078             for (i = 0; i < UPRV_LENGTHOF(locName); i++)
1079             {
1080                 locName[i] = (char)(windowsLocaleName[i]);
1081 
1082                 // Windows locale name may contain sorting variant, such as "es-ES_tradnl".
1083                 // In such cases, we need special mapping data found in the hardcoded table
1084                 // in this source file.
1085                 if (windowsLocaleName[i] == L'_')
1086                 {
1087                     // Keep the base locale, without variant
1088                     // TODO: Should these be mapped from _phoneb to @collation=phonebook, etc.?
1089                     locName[i] = '\0';
1090                     tmpLen = i;
1091                     bLookup = TRUE;
1092                     break;
1093                 }
1094                 else if (windowsLocaleName[i] == L'-')
1095                 {
1096                     // Windows names use -, ICU uses _
1097                     locName[i] = '_';
1098                 }
1099                 else if (windowsLocaleName[i] == L'\0')
1100                 {
1101                     // No point in doing more work than necessary
1102                     break;
1103                 }
1104             }
1105             // TODO: Need to understand this better, why isn't it an alias?
1106             FIX_LANGUAGE_ID_TAG(locName, tmpLen);
1107             pPosixID = locName;
1108         }
1109     }
1110 #endif
1111 
1112     if (bLookup) {
1113         const char *pCandidate = NULL;
1114         langID = LANGUAGE_LCID(hostid);
1115 
1116         for (localeIndex = 0; localeIndex < gLocaleCount; localeIndex++) {
1117             if (langID == gPosixIDmap[localeIndex].regionMaps->hostID) {
1118                 pCandidate = getPosixID(&gPosixIDmap[localeIndex], hostid);
1119                 break;
1120             }
1121         }
1122 
1123         /* On Windows, when locale name has a variant, we still look up the hardcoded table.
1124            If a match in the hardcoded table is longer than the Windows locale name without
1125            variant, we use the one as the result */
1126         if (pCandidate && (pPosixID == NULL || uprv_strlen(pCandidate) > uprv_strlen(pPosixID))) {
1127             pPosixID = pCandidate;
1128         }
1129     }
1130 
1131     if (pPosixID) {
1132         int32_t resLen = static_cast<int32_t>(uprv_strlen(pPosixID));
1133         int32_t copyLen = resLen <= posixIDCapacity ? resLen : posixIDCapacity;
1134         uprv_memcpy(posixID, pPosixID, copyLen);
1135         if (resLen < posixIDCapacity) {
1136             posixID[resLen] = 0;
1137             if (*status == U_STRING_NOT_TERMINATED_WARNING) {
1138                 *status = U_ZERO_ERROR;
1139             }
1140         } else if (resLen == posixIDCapacity) {
1141             *status = U_STRING_NOT_TERMINATED_WARNING;
1142         } else {
1143             *status = U_BUFFER_OVERFLOW_ERROR;
1144         }
1145         return resLen;
1146     }
1147 
1148     /* no match found */
1149     *status = U_ILLEGAL_ARGUMENT_ERROR;
1150     return -1;
1151 }
1152 
1153 /*
1154 //////////////////////////////////////
1155 //
1156 // POSIX --> LCID
1157 // This should only be called from uloc_getLCID.
1158 // The locale ID must be in canonical form.
1159 //
1160 /////////////////////////////////////
1161 */
1162 U_CAPI uint32_t
uprv_convertToLCIDPlatform(const char * localeID,UErrorCode * status)1163 uprv_convertToLCIDPlatform(const char* localeID, UErrorCode* status)
1164 {
1165     if (U_FAILURE(*status)) {
1166         return 0;
1167     }
1168 
1169     // The purpose of this function is to leverage the Windows platform name->lcid
1170     // conversion functionality when available.
1171 #if U_PLATFORM_HAS_WIN32_API && UCONFIG_USE_WINDOWS_LCID_MAPPING_API
1172     int32_t len;
1173     char baseName[ULOC_FULLNAME_CAPACITY] = {};
1174     const char * mylocaleID = localeID;
1175 
1176     // Check any for keywords.
1177     if (uprv_strchr(localeID, '@'))
1178     {
1179         icu::CharString collVal;
1180         {
1181             icu::CharStringByteSink sink(&collVal);
1182             ulocimp_getKeywordValue(localeID, "collation", sink, status);
1183         }
1184         if (U_SUCCESS(*status) && !collVal.isEmpty())
1185         {
1186             // If it contains the keyword collation, return 0 so that the LCID lookup table will be used.
1187             return 0;
1188         }
1189         else
1190         {
1191             // If the locale ID contains keywords other than collation, just use the base name.
1192             len = uloc_getBaseName(localeID, baseName, UPRV_LENGTHOF(baseName) - 1, status);
1193 
1194             if (U_SUCCESS(*status) && len > 0)
1195             {
1196                 baseName[len] = 0;
1197                 mylocaleID = baseName;
1198             }
1199         }
1200     }
1201 
1202     char asciiBCP47Tag[LOCALE_NAME_MAX_LENGTH] = {};
1203     // this will change it from de_DE@collation=phonebook to de-DE-u-co-phonebk form
1204     (void)uloc_toLanguageTag(mylocaleID, asciiBCP47Tag, UPRV_LENGTHOF(asciiBCP47Tag), FALSE, status);
1205 
1206     if (U_SUCCESS(*status))
1207     {
1208         // Need it to be UTF-16, not 8-bit
1209         wchar_t bcp47Tag[LOCALE_NAME_MAX_LENGTH] = {};
1210         int32_t i;
1211         for (i = 0; i < UPRV_LENGTHOF(bcp47Tag); i++)
1212         {
1213             if (asciiBCP47Tag[i] == '\0')
1214             {
1215                 break;
1216             }
1217             else
1218             {
1219                 // Copy the character
1220                 bcp47Tag[i] = static_cast<wchar_t>(asciiBCP47Tag[i]);
1221             }
1222         }
1223 
1224         if (i < (UPRV_LENGTHOF(bcp47Tag) - 1))
1225         {
1226             // Ensure it's null terminated
1227             bcp47Tag[i] = L'\0';
1228             LCID lcid = LocaleNameToLCID(bcp47Tag, LOCALE_ALLOW_NEUTRAL_NAMES);
1229             if (lcid > 0)
1230             {
1231                 // Found LCID from windows, return that one, unless its completely ambiguous
1232                 // LOCALE_USER_DEFAULT and transients are OK because they will round trip
1233                 // for this process.
1234                 if (lcid != LOCALE_CUSTOM_UNSPECIFIED)
1235                 {
1236                     return lcid;
1237                 }
1238             }
1239         }
1240     }
1241 #else
1242     (void) localeID; // Suppress unused variable warning.
1243 #endif
1244 
1245     // Nothing found, or not implemented.
1246     return 0;
1247 }
1248 
1249 U_CAPI uint32_t
uprv_convertToLCID(const char * langID,const char * posixID,UErrorCode * status)1250 uprv_convertToLCID(const char *langID, const char* posixID, UErrorCode* status)
1251 {
1252     // This function does the table lookup when native platform name->lcid conversion isn't available,
1253     // or for locales that don't follow patterns the platform expects.
1254     uint32_t   low    = 0;
1255     uint32_t   high   = gLocaleCount;
1256     uint32_t   mid;
1257     uint32_t   oldmid = 0;
1258     int32_t    compVal;
1259 
1260     uint32_t   value         = 0;
1261     uint32_t   fallbackValue = (uint32_t)-1;
1262     UErrorCode myStatus;
1263     uint32_t   idx;
1264 
1265     /* Check for incomplete id. */
1266     if (!langID || !posixID || uprv_strlen(langID) < 2 || uprv_strlen(posixID) < 2) {
1267         return 0;
1268     }
1269 
1270     /*Binary search for the map entry for normal cases */
1271 
1272     while (high > low)  /*binary search*/{
1273 
1274         mid = (high+low) >> 1; /*Finds median*/
1275 
1276         if (mid == oldmid)
1277             break;
1278 
1279         compVal = uprv_strcmp(langID, gPosixIDmap[mid].regionMaps->posixID);
1280         if (compVal < 0){
1281             high = mid;
1282         }
1283         else if (compVal > 0){
1284             low = mid;
1285         }
1286         else /*we found it*/{
1287             return getHostID(&gPosixIDmap[mid], posixID, status);
1288         }
1289         oldmid = mid;
1290     }
1291 
1292     /*
1293      * Sometimes we can't do a binary search on posixID because some LCIDs
1294      * go to different locales.  We hit one of those special cases.
1295      */
1296     for (idx = 0; idx < gLocaleCount; idx++ ) {
1297         myStatus = U_ZERO_ERROR;
1298         value = getHostID(&gPosixIDmap[idx], posixID, &myStatus);
1299         if (myStatus == U_ZERO_ERROR) {
1300             return value;
1301         }
1302         else if (myStatus == U_USING_FALLBACK_WARNING) {
1303             fallbackValue = value;
1304         }
1305     }
1306 
1307     if (fallbackValue != (uint32_t)-1) {
1308         *status = U_USING_FALLBACK_WARNING;
1309         return fallbackValue;
1310     }
1311 
1312     /* no match found */
1313     *status = U_ILLEGAL_ARGUMENT_ERROR;
1314     return 0;   /* return international (root) */
1315 }
1316