1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (C) 1997-2016, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 *
9 * File ULOC.CPP
10 *
11 * Modification History:
12 *
13 *   Date        Name        Description
14 *   04/01/97    aliu        Creation.
15 *   08/21/98    stephen     JDK 1.2 sync
16 *   12/08/98    rtg         New Locale implementation and C API
17 *   03/15/99    damiba      overhaul.
18 *   04/06/99    stephen     changed setDefault() to realloc and copy
19 *   06/14/99    stephen     Changed calls to ures_open for new params
20 *   07/21/99    stephen     Modified setDefault() to propagate to C++
21 *   05/14/04    alan        7 years later: refactored, cleaned up, fixed bugs,
22 *                           brought canonicalization code into line with spec
23 *****************************************************************************/
24 
25 /*
26    POSIX's locale format, from putil.c: [no spaces]
27 
28      ll [ _CC ] [ . MM ] [ @ VV]
29 
30      l = lang, C = ctry, M = charmap, V = variant
31 */
32 
33 #include "unicode/utypes.h"
34 #include "unicode/ustring.h"
35 #include "unicode/uloc.h"
36 
37 #include "putilimp.h"
38 #include "ustr_imp.h"
39 #include "ulocimp.h"
40 #include "umutex.h"
41 #include "cstring.h"
42 #include "cmemory.h"
43 #include "locmap.h"
44 #include "uarrsort.h"
45 #include "uenumimp.h"
46 #include "uassert.h"
47 #include "charstr.h"
48 
49 #include <stdio.h> /* for sprintf */
50 
51 U_NAMESPACE_USE
52 
53 /* ### Declarations **************************************************/
54 
55 /* Locale stuff from locid.cpp */
/* Setter/getter pair for the default locale ID (C linkage). */
U_CFUNC void locale_set_default(const char *id);
U_CFUNC const char *locale_get_default(void);
/*
 * Keyword extraction entry point (C linkage).  Its parameter list matches
 * the file-local _getKeywords() helper defined later in this file.
 */
U_CFUNC int32_t
locale_getKeywords(const char *localeID,
            char prev,
            char *keywords, int32_t keywordCapacity,
            char *values, int32_t valuesCapacity, int32_t *valLen,
            UBool valuesToo,
            UErrorCode *status);
65 
66 /* ### Data tables **************************************************/
67 
68 /**
69  * Table of language codes, both 2- and 3-letter, with preference
70  * given to 2-letter codes where possible.  Includes 3-letter codes
71  * that lack a 2-letter equivalent.
72  *
73  * This list must be in sorted order.  This list is returned directly
74  * to the user by some API.
75  *
76  * This list must be kept in sync with LANGUAGES_3, with corresponding
77  * entries matched.
78  *
79  * This table should be terminated with a NULL entry, followed by a
80  * second list, and another NULL entry.  The first list is visible to
81  * user code when this array is returned by API.  The second list
82  * contains codes we support, but do not expose through user API.
83  *
84  * Notes
85  *
86  * Tables updated per http://lcweb.loc.gov/standards/iso639-2/ to
87  * include the revisions up to 2001/7/27 *CWB*
88  *
89  * The 3 character codes are the terminology codes like RFC 3066.  This
90  * is compatible with prior ICU codes
91  *
 * "in", "iw", "ji", "jw" and "sh" have been withdrawn.  They are still
 * in the table, but now at the end of it (in the second list), because
 * their 3 character codes are duplicates.  This avoids bad searches
 * going from 3 to 2 character codes.
96  *
97  * The range qaa-qtz is reserved for local use
98  */
99 /* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
100 /* ISO639 table version is 20150505 */
101 /* Subsequent hand addition of selected languages */
/* Layout: primary list, NULL, obsolete-code list, NULL.  Entry i pairs with LANGUAGES_3[i]. */
static const char * const LANGUAGES[] = {
    "aa",  "ab",  "ace", "ach", "ada", "ady", "ae",  "aeb",
    "af",  "afh", "agq", "ain", "ak",  "akk", "akz", "ale",
    "aln", "alt", "am",  "an",  "ang", "anp", "ar",  "arc",
    "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "as",
    "asa", "ase", "ast", "av",  "avk", "awa", "ay",  "az",
    "ba",  "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
    "be",  "bej", "bem", "bew", "bez", "bfd", "bfq", "bg",
    "bgn", "bho", "bi",  "bik", "bin", "bjn", "bkm", "bla",
    "bm",  "bn",  "bo",  "bpy", "bqi", "br",  "bra", "brh",
    "brx", "bs",  "bss", "bua", "bug", "bum", "byn", "byv",
    "ca",  "cad", "car", "cay", "cch", "ccp", "ce",  "ceb", "cgg",
    "ch",  "chb", "chg", "chk", "chm", "chn", "cho", "chp",
    "chr", "chy", "ckb", "co",  "cop", "cps", "cr",  "crh",
    "cs",  "csb", "cu",  "cv",  "cy",
    "da",  "dak", "dar", "dav", "de",  "del", "den", "dgr",
    "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "dv",
    "dyo", "dyu", "dz",  "dzg",
    "ebu", "ee",  "efi", "egl", "egy", "eka", "el",  "elx",
    "en",  "enm", "eo",  "es",  "esu", "et",  "eu",  "ewo",
    "ext",
    "fa",  "fan", "fat", "ff",  "fi",  "fil", "fit", "fj",
    "fo",  "fon", "fr",  "frc", "frm", "fro", "frp", "frr",
    "frs", "fur", "fy",
    "ga",  "gaa", "gag", "gan", "gay", "gba", "gbz", "gd",
    "gez", "gil", "gl",  "glk", "gmh", "gn",  "goh", "gom",
    "gon", "gor", "got", "grb", "grc", "gsw", "gu",  "guc",
    "gur", "guz", "gv",  "gwi",
    "ha",  "hai", "hak", "haw", "he",  "hi",  "hif", "hil",
    "hit", "hmn", "ho",  "hr",  "hsb", "hsn", "ht",  "hu",
    "hup", "hy",  "hz",
    "ia",  "iba", "ibb", "id",  "ie",  "ig",  "ii",  "ik",
    "ilo", "inh", "io",  "is",  "it",  "iu",  "izh",
    "ja",  "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
    "jv",
    "ka",  "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
    "kbl", "kcg", "kde", "kea", "ken", "kfo", "kg",  "kgp",
    "kha", "kho", "khq", "khw", "ki",  "kiu", "kj",  "kk",
    "kkj", "kl",  "kln", "km",  "kmb", "kn",  "ko",  "koi",
    "kok", "kos", "kpe", "kr",  "krc", "kri", "krj", "krl",
    "kru", "ks",  "ksb", "ksf", "ksh", "ku",  "kum", "kut",
    "kv",  "kw",  "ky",
    "la",  "lad", "lag", "lah", "lam", "lb",  "lez", "lfn",
    "lg",  "li",  "lij", "liv", "lkt", "lmo", "ln",  "lo",
    "lol", "loz", "lrc", "lt",  "ltg", "lu",  "lua", "lui",
    "lun", "luo", "lus", "luy", "lv",  "lzh", "lzz",
    "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
    "mdf", "mdh", "mdr", "men", "mer", "mfe", "mg",  "mga",
    "mgh", "mgo", "mh",  "mi",  "mic", "min", "mis", "mk",
    "ml",  "mn",  "mnc", "mni", "mo",
    "moh", "mos", "mr",  "mrj",
    "ms",  "mt",  "mua", "mul", "mus", "mwl", "mwr", "mwv",
    "my",  "mye", "myv", "mzn",
    "na",  "nan", "nap", "naq", "nb",  "nd",  "nds", "ne",
    "new", "ng",  "nia", "niu", "njo", "nl",  "nmg", "nn",
    "nnh", "no",  "nog", "non", "nov", "nqo", "nr",  "nso",
    "nus", "nv",  "nwc", "ny",  "nym", "nyn", "nyo", "nzi",
    "oc",  "oj",  "om",  "or",  "os",  "osa", "ota",
    "pa",  "pag", "pal", "pam", "pap", "pau", "pcd", "pdc",
    "pdt", "peo", "pfl", "phn", "pi",  "pl",  "pms", "pnt",
    "pon", "prg", "pro", "ps",  "pt",
    "qu",  "quc", "qug",
    "raj", "rap", "rar", "rgn", "rif", "rm",  "rn",  "ro",
    "rof", "rom", "rtm", "ru",  "rue", "rug", "rup",
    "rw",  "rwk",
    "sa",  "sad", "sah", "sam", "saq", "sas", "sat", "saz",
    "sba", "sbp", "sc",  "scn", "sco", "sd",  "sdc", "sdh",
    "se",  "see", "seh", "sei", "sel", "ses", "sg",  "sga",
    "sgs", "shi", "shn", "shu", "si",  "sid", "sk",
    "sl",  "sli", "sly", "sm",  "sma", "smj", "smn", "sms",
    "sn",  "snk", "so",  "sog", "sq",  "sr",  "srn", "srr",
    "ss",  "ssy", "st",  "stq", "su",  "suk", "sus", "sux",
    "sv",  "sw",  "swb", "swc", "syc", "syr", "szl",
    "ta",  "tcy", "te",  "tem", "teo", "ter", "tet", "tg",
    "th",  "ti",  "tig", "tiv", "tk",  "tkl", "tkr", "tl",
    "tlh", "tli", "tly", "tmh", "tn",  "to",  "tog", "tpi",
    "tr",  "tru", "trv", "ts",  "tsd", "tsi", "tt",  "ttt",
    "tum", "tvl", "tw",  "twq", "ty",  "tyv", "tzm",
    "udm", "ug",  "uga", "uk",  "umb", "und", "ur",  "uz",
    "vai", "ve",  "vec", "vep", "vi",  "vls", "vmf", "vo",
    "vot", "vro", "vun",
    "wa",  "wae", "wal", "war", "was", "wbp", "wo",  "wuu",
    "xal", "xh",  "xmf", "xog",
    "yao", "yap", "yav", "ybb", "yi",  "yo",  "yrl", "yue",
    "za",  "zap", "zbl", "zea", "zen", "zgh", "zh",  "zu",
    "zun", "zxx", "zza",
NULL, /* end of the first (primary) list */
    "in",  "iw",  "ji",  "jw",  "sh",    /* obsolete language codes */
NULL  /* end of the second (obsolete) list */
};
192 
/*
 * Deprecated 2-letter language codes and their replacements.
 * NOTE(review): the two arrays have the same length and appear to be
 * index-aligned (REPLACEMENT_LANGUAGES[i] replaces DEPRECATED_LANGUAGES[i]);
 * the lookup code is outside this view -- confirm.
 * Both arrays end with two NULL entries.
 */
static const char* const DEPRECATED_LANGUAGES[]={
    "in", "iw", "ji", "jw", NULL, NULL
};
static const char* const REPLACEMENT_LANGUAGES[]={
    "id", "he", "yi", "jv", NULL, NULL
};
199 
200 /**
201  * Table of 3-letter language codes.
202  *
203  * This is a lookup table used to convert 3-letter language codes to
204  * their 2-letter equivalent, where possible.  It must be kept in sync
205  * with LANGUAGES.  For all valid i, LANGUAGES[i] must refer to the
206  * same language as LANGUAGES_3[i].  The commented-out lines are
207  * copied from LANGUAGES to make eyeballing this baby easier.
208  *
209  * Where a 3-letter language code has no 2-letter equivalent, the
210  * 3-letter code occupies both LANGUAGES[i] and LANGUAGES_3[i].
211  *
212  * This table should be terminated with a NULL entry, followed by a
213  * second list, and another NULL entry.  The two lists correspond to
214  * the two lists in LANGUAGES.
215  */
216 /* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
217 /* ISO639 table version is 20150505 */
218 /* Subsequent hand addition of selected languages */
/* Layout mirrors LANGUAGES: primary list, NULL, obsolete-code list, NULL. */
static const char * const LANGUAGES_3[] = {
    "aar", "abk", "ace", "ach", "ada", "ady", "ave", "aeb",
    "afr", "afh", "agq", "ain", "aka", "akk", "akz", "ale",
    "aln", "alt", "amh", "arg", "ang", "anp", "ara", "arc",
    "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "asm",
    "asa", "ase", "ast", "ava", "avk", "awa", "aym", "aze",
    "bak", "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
    "bel", "bej", "bem", "bew", "bez", "bfd", "bfq", "bul",
    "bgn", "bho", "bis", "bik", "bin", "bjn", "bkm", "bla",
    "bam", "ben", "bod", "bpy", "bqi", "bre", "bra", "brh",
    "brx", "bos", "bss", "bua", "bug", "bum", "byn", "byv",
    "cat", "cad", "car", "cay", "cch", "ccp", "che", "ceb", "cgg",
    "cha", "chb", "chg", "chk", "chm", "chn", "cho", "chp",
    "chr", "chy", "ckb", "cos", "cop", "cps", "cre", "crh",
    "ces", "csb", "chu", "chv", "cym",
    "dan", "dak", "dar", "dav", "deu", "del", "den", "dgr",
    "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "div",
    "dyo", "dyu", "dzo", "dzg",
    "ebu", "ewe", "efi", "egl", "egy", "eka", "ell", "elx",
    "eng", "enm", "epo", "spa", "esu", "est", "eus", "ewo",
    "ext",
    "fas", "fan", "fat", "ful", "fin", "fil", "fit", "fij",
    "fao", "fon", "fra", "frc", "frm", "fro", "frp", "frr",
    "frs", "fur", "fry",
    "gle", "gaa", "gag", "gan", "gay", "gba", "gbz", "gla",
    "gez", "gil", "glg", "glk", "gmh", "grn", "goh", "gom",
    "gon", "gor", "got", "grb", "grc", "gsw", "guj", "guc",
    "gur", "guz", "glv", "gwi",
    "hau", "hai", "hak", "haw", "heb", "hin", "hif", "hil",
    "hit", "hmn", "hmo", "hrv", "hsb", "hsn", "hat", "hun",
    "hup", "hye", "her",
    "ina", "iba", "ibb", "ind", "ile", "ibo", "iii", "ipk",
    "ilo", "inh", "ido", "isl", "ita", "iku", "izh",
    "jpn", "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
    "jav",
    "kat", "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
    "kbl", "kcg", "kde", "kea", "ken", "kfo", "kon", "kgp",
    "kha", "kho", "khq", "khw", "kik", "kiu", "kua", "kaz",
    "kkj", "kal", "kln", "khm", "kmb", "kan", "kor", "koi",
    "kok", "kos", "kpe", "kau", "krc", "kri", "krj", "krl",
    "kru", "kas", "ksb", "ksf", "ksh", "kur", "kum", "kut",
    "kom", "cor", "kir",
    "lat", "lad", "lag", "lah", "lam", "ltz", "lez", "lfn",
    "lug", "lim", "lij", "liv", "lkt", "lmo", "lin", "lao",
    "lol", "loz", "lrc", "lit", "ltg", "lub", "lua", "lui",
    "lun", "luo", "lus", "luy", "lav", "lzh", "lzz",
    "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
    "mdf", "mdh", "mdr", "men", "mer", "mfe", "mlg", "mga",
    "mgh", "mgo", "mah", "mri", "mic", "min", "mis", "mkd",
    "mal", "mon", "mnc", "mni", "mol",
    "moh", "mos", "mar", "mrj",
    "msa", "mlt", "mua", "mul", "mus", "mwl", "mwr", "mwv",
    "mya", "mye", "myv", "mzn",
    "nau", "nan", "nap", "naq", "nob", "nde", "nds", "nep",
    "new", "ndo", "nia", "niu", "njo", "nld", "nmg", "nno",
    "nnh", "nor", "nog", "non", "nov", "nqo", "nbl", "nso",
    "nus", "nav", "nwc", "nya", "nym", "nyn", "nyo", "nzi",
    "oci", "oji", "orm", "ori", "oss", "osa", "ota",
    "pan", "pag", "pal", "pam", "pap", "pau", "pcd", "pdc",
    "pdt", "peo", "pfl", "phn", "pli", "pol", "pms", "pnt",
    "pon", "prg", "pro", "pus", "por",
    "que", "quc", "qug",
    "raj", "rap", "rar", "rgn", "rif", "roh", "run", "ron",
    "rof", "rom", "rtm", "rus", "rue", "rug", "rup",
    "kin", "rwk",
    "san", "sad", "sah", "sam", "saq", "sas", "sat", "saz",
    "sba", "sbp", "srd", "scn", "sco", "snd", "sdc", "sdh",
    "sme", "see", "seh", "sei", "sel", "ses", "sag", "sga",
    "sgs", "shi", "shn", "shu", "sin", "sid", "slk",
    "slv", "sli", "sly", "smo", "sma", "smj", "smn", "sms",
    "sna", "snk", "som", "sog", "sqi", "srp", "srn", "srr",
    "ssw", "ssy", "sot", "stq", "sun", "suk", "sus", "sux",
    "swe", "swa", "swb", "swc", "syc", "syr", "szl",
    "tam", "tcy", "tel", "tem", "teo", "ter", "tet", "tgk",
    "tha", "tir", "tig", "tiv", "tuk", "tkl", "tkr", "tgl",
    "tlh", "tli", "tly", "tmh", "tsn", "ton", "tog", "tpi",
    "tur", "tru", "trv", "tso", "tsd", "tsi", "tat", "ttt",
    "tum", "tvl", "twi", "twq", "tah", "tyv", "tzm",
    "udm", "uig", "uga", "ukr", "umb", "und", "urd", "uzb",
    "vai", "ven", "vec", "vep", "vie", "vls", "vmf", "vol",
    "vot", "vro", "vun",
    "wln", "wae", "wal", "war", "was", "wbp", "wol", "wuu",
    "xal", "xho", "xmf", "xog",
    "yao", "yap", "yav", "ybb", "yid", "yor", "yrl", "yue",
    "zha", "zap", "zbl", "zea", "zen", "zgh", "zho", "zul",
    "zun", "zxx", "zza",
NULL, /* end of the first (primary) list */
/*  "in",  "iw",  "ji",  "jw",  "sh",                          */
    "ind", "heb", "yid", "jaw", "srp",
NULL  /* end of the second (obsolete) list */
};
310 
311 /**
312  * Table of 2-letter country codes.
313  *
314  * This list must be in sorted order.  This list is returned directly
315  * to the user by some API.
316  *
317  * This list must be kept in sync with COUNTRIES_3, with corresponding
318  * entries matched.
319  *
320  * This table should be terminated with a NULL entry, followed by a
321  * second list, and another NULL entry.  The first list is visible to
322  * user code when this array is returned by API.  The second list
323  * contains codes we support, but do not expose through user API.
324  *
325  * Notes:
326  *
 * ZR(ZAR) is now CD(COD) and FX(FXX) is PS(PSE), as per
 * http://www.evertype.com/standards/iso3166/iso3166-1-en.html.  The
 * new codes were added and the old ones kept for compatibility.
 * Updated to include the 1999/12/03 revisions *CWB*
331  *
332  * RO(ROM) is now RO(ROU) according to
333  * http://www.iso.org/iso/en/prods-services/iso3166ma/03updates-on-iso-3166/nlv3e-rou.html
334  */
/* Layout: primary list, NULL, obsolete-code list, NULL.  Entry i pairs with COUNTRIES_3[i]. */
static const char * const COUNTRIES[] = {
    "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",
    "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",
    "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",
    "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV",
    "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",
    "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CR",
    "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DJ",  "DK",
    "DM",  "DO",  "DZ",  "EC",  "EE",  "EG",  "EH",  "ER",
    "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",
    "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",
    "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",
    "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",
    "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS",
    "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",
    "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",
    "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",
    "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",
    "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",
    "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",
    "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",
    "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",
    "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",
    "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",
    "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",
    "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV",
    "SX",  "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",
    "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",
    "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",
    "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",
    "WS",  "YE",  "YT",  "ZA",  "ZM",  "ZW",
NULL, /* end of the first (primary) list */
    "AN",  "BU", "CS", "FX", "RO", "SU", "TP", "YD", "YU", "ZR",   /* obsolete country codes */
NULL  /* end of the second (obsolete) list */
};
370 
/*
 * Deprecated 2-letter country codes and their replacements.  The two arrays
 * are written as parallel lists: the commented-out row inside
 * REPLACEMENT_COUNTRIES repeats DEPRECATED_COUNTRIES so that each replacement
 * sits under the code it replaces.  Both arrays end with two NULL entries.
 */
static const char* const DEPRECATED_COUNTRIES[] = {
    "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR", NULL, NULL /* deprecated country list */
};
static const char* const REPLACEMENT_COUNTRIES[] = {
/*  "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR" */
    "CW", "MM", "RS", "DE", "BJ", "FR", "BF", "VU", "ZW", "RU", "TL", "GB", "VN", "YE", "RS", "CD", NULL, NULL  /* replacement country codes */
};
378 
379 /**
380  * Table of 3-letter country codes.
381  *
382  * This is a lookup table used to convert 3-letter country codes to
383  * their 2-letter equivalent.  It must be kept in sync with COUNTRIES.
384  * For all valid i, COUNTRIES[i] must refer to the same country as
385  * COUNTRIES_3[i].  The commented-out lines are copied from COUNTRIES
386  * to make eyeballing this baby easier.
387  *
388  * This table should be terminated with a NULL entry, followed by a
389  * second list, and another NULL entry.  The two lists correspond to
390  * the two lists in COUNTRIES.
391  */
/* Layout mirrors COUNTRIES: primary list, NULL, obsolete-code list, NULL. */
/* Each commented-out row repeats the COUNTRIES entries for the row that follows it. */
static const char * const COUNTRIES_3[] = {
/*  "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",      */
    "AND", "ARE", "AFG", "ATG", "AIA", "ALB", "ARM",
/*  "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",     */
    "AGO", "ATA", "ARG", "ASM", "AUT", "AUS", "ABW", "ALA", "AZE",
/*  "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",     */
    "BIH", "BRB", "BGD", "BEL", "BFA", "BGR", "BHR", "BDI",
/*  "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV",     */
    "BEN", "BLM", "BMU", "BRN", "BOL", "BES", "BRA", "BHS", "BTN", "BVT",
/*  "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",     */
    "BWA", "BLR", "BLZ", "CAN", "CCK", "COD", "CAF", "COG",
/*  "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CR",     */
    "CHE", "CIV", "COK", "CHL", "CMR", "CHN", "COL", "CRI",
/*  "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DJ",  "DK",     */
    "CUB", "CPV", "CUW", "CXR", "CYP", "CZE", "DEU", "DJI", "DNK",
/*  "DM",  "DO",  "DZ",  "EC",  "EE",  "EG",  "EH",  "ER",     */
    "DMA", "DOM", "DZA", "ECU", "EST", "EGY", "ESH", "ERI",
/*  "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",     */
    "ESP", "ETH", "FIN", "FJI", "FLK", "FSM", "FRO", "FRA",
/*  "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",     */
    "GAB", "GBR", "GRD", "GEO", "GUF", "GGY", "GHA", "GIB", "GRL",
/*  "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",     */
    "GMB", "GIN", "GLP", "GNQ", "GRC", "SGS", "GTM", "GUM",
/*  "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",     */
    "GNB", "GUY", "HKG", "HMD", "HND", "HRV", "HTI", "HUN",
/*  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS" */
    "IDN", "IRL", "ISR", "IMN", "IND", "IOT", "IRQ", "IRN", "ISL",
/*  "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",     */
    "ITA", "JEY", "JAM", "JOR", "JPN", "KEN", "KGZ", "KHM", "KIR",
/*  "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",     */
    "COM", "KNA", "PRK", "KOR", "KWT", "CYM", "KAZ", "LAO",
/*  "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",     */
    "LBN", "LCA", "LIE", "LKA", "LBR", "LSO", "LTU", "LUX",
/*  "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",     */
    "LVA", "LBY", "MAR", "MCO", "MDA", "MNE", "MAF", "MDG", "MHL", "MKD",
/*  "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",     */
    "MLI", "MMR", "MNG", "MAC", "MNP", "MTQ", "MRT", "MSR",
/*  "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",     */
    "MLT", "MUS", "MDV", "MWI", "MEX", "MYS", "MOZ", "NAM",
/*  "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",     */
    "NCL", "NER", "NFK", "NGA", "NIC", "NLD", "NOR", "NPL",
/*  "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",     */
    "NRU", "NIU", "NZL", "OMN", "PAN", "PER", "PYF", "PNG",
/*  "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",     */
    "PHL", "PAK", "POL", "SPM", "PCN", "PRI", "PSE", "PRT",
/*  "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",     */
    "PLW", "PRY", "QAT", "REU", "ROU", "SRB", "RUS", "RWA", "SAU",
/*  "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",     */
    "SLB", "SYC", "SDN", "SWE", "SGP", "SHN", "SVN", "SJM",
/*  "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV",     */
    "SVK", "SLE", "SMR", "SEN", "SOM", "SUR", "SSD", "STP", "SLV",
/*  "SX",  "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",     */
    "SXM", "SYR", "SWZ", "TCA", "TCD", "ATF", "TGO", "THA", "TJK",
/*  "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",     */
    "TKL", "TLS", "TKM", "TUN", "TON", "TUR", "TTO", "TUV",
/*  "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",     */
    "TWN", "TZA", "UKR", "UGA", "UMI", "USA", "URY", "UZB",
/*  "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",     */
    "VAT", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT", "WLF",
/*  "WS",  "YE",  "YT",  "ZA",  "ZM",  "ZW",          */
    "WSM", "YEM", "MYT", "ZAF", "ZMB", "ZWE",
NULL, /* end of the first (primary) list */
/*  "AN",  "BU",  "CS",  "FX",  "RO", "SU",  "TP",  "YD",  "YU",  "ZR" */
    "ANT", "BUR", "SCG", "FXX", "ROM", "SUN", "TMP", "YMD", "YUG", "ZAR",
NULL  /* end of the second (obsolete) list */
};
458 
/* One row of a locale-ID canonicalization table: an input ID and the ID it maps to. */
typedef struct CanonicalizationMap {
    const char *id;          /* input ID */
    const char *canonicalID; /* canonicalized output ID */
} CanonicalizationMap;
463 
/**
 * A map to canonicalize locale IDs.  This handles a variety of
 * different semantic kinds of transformations.
 *
 * Each row pairs an input ID (first string) with its canonical
 * replacement (second string).  The array has no terminating sentinel
 * entry.
 */
static const CanonicalizationMap CANONICALIZE_MAP[] = {
    { "art_LOJBAN",     "jbo" }, /* registered name */
    { "hy__AREVELA",    "hy" }, /* Registered IANA variant */
    { "hy__AREVMDA",    "hyw" }, /* Registered IANA variant */
    { "zh_GAN",         "gan" }, /* registered name */
    { "zh_GUOYU",       "zh" }, /* registered name */
    { "zh_HAKKA",       "hak" }, /* registered name */
    { "zh_MIN_NAN",     "nan" }, /* registered name */
    { "zh_WUU",         "wuu" }, /* registered name */
    { "zh_XIANG",       "hsn" }, /* registered name */
    { "zh_YUE",         "yue" }, /* registered name */
};
480 
481 /* ### BCP47 Conversion *******************************************/
/*
 * Test if the locale id looks like it has a BCP47 extension and does not
 * have '@'.  Evaluates to true when `id` is non-NULL, contains no '@', and
 * getShortestSubtagLength(id) reports a single-character subtag.
 *
 * The macro inspects its own argument throughout; it previously measured a
 * caller variable named `localeID` regardless of what was passed in.
 * `id` is evaluated more than once, so pass an expression without side
 * effects.
 */
#define _hasBCP47Extension(id) ((id) && uprv_strstr((id), "@") == NULL && getShortestSubtagLength(id) == 1)
/*
 * Converts the BCP47 id to Unicode id. Does nothing to id if conversion fails.
 *
 * Calls uloc_forLanguageTag(id, buffer, length, NULL, err).  When that
 * returns <= 0, or leaves *err as a failure or as
 * U_STRING_NOT_TERMINATED_WARNING, finalID is set to the unconverted id and
 * the not-terminated warning is promoted to U_BUFFER_OVERFLOW_ERROR.
 * Otherwise finalID is set to buffer.
 */
#define _ConvertBCP47(finalID, id, buffer, length,err) UPRV_BLOCK_MACRO_BEGIN { \
    if (uloc_forLanguageTag(id, buffer, length, NULL, err) <= 0 || \
            U_FAILURE(*err) || *err == U_STRING_NOT_TERMINATED_WARNING) { \
        finalID=id; \
        if (*err == U_STRING_NOT_TERMINATED_WARNING) { *err = U_BUFFER_OVERFLOW_ERROR; } \
    } else { \
        finalID=buffer; \
    } \
} UPRV_BLOCK_MACRO_END
494 /* Gets the size of the shortest subtag in the given localeID. */
getShortestSubtagLength(const char * localeID)495 static int32_t getShortestSubtagLength(const char *localeID) {
496     int32_t localeIDLength = static_cast<int32_t>(uprv_strlen(localeID));
497     int32_t length = localeIDLength;
498     int32_t tmpLength = 0;
499     int32_t i;
500     UBool reset = TRUE;
501 
502     for (i = 0; i < localeIDLength; i++) {
503         if (localeID[i] != '_' && localeID[i] != '-') {
504             if (reset) {
505                 tmpLength = 0;
506                 reset = FALSE;
507             }
508             tmpLength++;
509         } else {
510             if (tmpLength != 0 && tmpLength < length) {
511                 length = tmpLength;
512             }
513             reset = TRUE;
514         }
515     }
516 
517     return length;
518 }
519 
520 /* ### Keywords **************************************************/
/* True for the ASCII digits '0'..'9'.  Note: evaluates c twice. */
#define UPRV_ISDIGIT(c) (((c) >= '0') && ((c) <= '9'))
/* True when c is accepted by uprv_isASCIILetter() or is a digit. */
#define UPRV_ISALPHANUM(c) (uprv_isASCIILetter(c) || UPRV_ISDIGIT(c) )
/* Punctuation/symbols allowed in legacy key values */
#define UPRV_OK_VALUE_PUNCTUATION(c) ((c) == '_' || (c) == '-' || (c) == '+' || (c) == '/')

/* Size in bytes of a canonicalized keyword-name buffer, including the terminating NUL. */
#define ULOC_KEYWORD_BUFFER_LEN 25
/* Capacity of the fixed-size keyword list used by _getKeywords(). */
#define ULOC_MAX_NO_KEYWORDS 25
528 
529 U_CAPI const char * U_EXPORT2
locale_getKeywordsStart(const char * localeID)530 locale_getKeywordsStart(const char *localeID) {
531     const char *result = NULL;
532     if((result = uprv_strchr(localeID, '@')) != NULL) {
533         return result;
534     }
535 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
536     else {
537         /* We do this because the @ sign is variant, and the @ sign used on one
538         EBCDIC machine won't be compiled the same way on other EBCDIC based
539         machines. */
540         static const uint8_t ebcdicSigns[] = { 0x7C, 0x44, 0x66, 0x80, 0xAC, 0xAE, 0xAF, 0xB5, 0xEC, 0xEF, 0x00 };
541         const uint8_t *charToFind = ebcdicSigns;
542         while(*charToFind) {
543             if((result = uprv_strchr(localeID, *charToFind)) != NULL) {
544                 return result;
545             }
546             charToFind++;
547         }
548     }
549 #endif
550     return NULL;
551 }
552 
553 /**
554  * @param buf buffer of size [ULOC_KEYWORD_BUFFER_LEN]
555  * @param keywordName incoming name to be canonicalized
556  * @param status return status (keyword too long)
557  * @return length of the keyword name
558  */
locale_canonKeywordName(char * buf,const char * keywordName,UErrorCode * status)559 static int32_t locale_canonKeywordName(char *buf, const char *keywordName, UErrorCode *status)
560 {
561   int32_t keywordNameLen = 0;
562 
563   for (; *keywordName != 0; keywordName++) {
564     if (!UPRV_ISALPHANUM(*keywordName)) {
565       *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
566       return 0;
567     }
568     if (keywordNameLen < ULOC_KEYWORD_BUFFER_LEN - 1) {
569       buf[keywordNameLen++] = uprv_tolower(*keywordName);
570     } else {
571       /* keyword name too long for internal buffer */
572       *status = U_INTERNAL_PROGRAM_ERROR;
573       return 0;
574     }
575   }
576   if (keywordNameLen == 0) {
577     *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name */
578     return 0;
579   }
580   buf[keywordNameLen] = 0; /* terminate */
581 
582   return keywordNameLen;
583 }
584 
/* One keyword=value pair collected by _getKeywords(). */
typedef struct {
    char keyword[ULOC_KEYWORD_BUFFER_LEN]; /* lowercased keyword name with spaces removed, NUL-terminated */
    int32_t keywordLen;                    /* length of keyword, excluding the NUL */
    const char *valueStart;                /* first non-space character of the value, inside the caller's string */
    int32_t valueLen;                      /* length of the value with trailing spaces excluded */
} KeywordStruct;
591 
592 static int32_t U_CALLCONV
compareKeywordStructs(const void *,const void * left,const void * right)593 compareKeywordStructs(const void * /*context*/, const void *left, const void *right) {
594     const char* leftString = ((const KeywordStruct *)left)->keyword;
595     const char* rightString = ((const KeywordStruct *)right)->keyword;
596     return uprv_strcmp(leftString, rightString);
597 }
598 
599 static int32_t
_getKeywords(const char * localeID,char prev,char * keywords,int32_t keywordCapacity,char * values,int32_t valuesCapacity,int32_t * valLen,UBool valuesToo,UErrorCode * status)600 _getKeywords(const char *localeID,
601              char prev,
602              char *keywords, int32_t keywordCapacity,
603              char *values, int32_t valuesCapacity, int32_t *valLen,
604              UBool valuesToo,
605              UErrorCode *status)
606 {
607     KeywordStruct keywordList[ULOC_MAX_NO_KEYWORDS];
608 
609     int32_t maxKeywords = ULOC_MAX_NO_KEYWORDS;
610     int32_t numKeywords = 0;
611     const char* pos = localeID;
612     const char* equalSign = NULL;
613     const char* semicolon = NULL;
614     int32_t i = 0, j, n;
615     int32_t keywordsLen = 0;
616     int32_t valuesLen = 0;
617 
618     if(prev == '@') { /* start of keyword definition */
619         /* we will grab pairs, trim spaces, lowercase keywords, sort and return */
620         do {
621             UBool duplicate = FALSE;
622             /* skip leading spaces */
623             while(*pos == ' ') {
624                 pos++;
625             }
626             if (!*pos) { /* handle trailing "; " */
627                 break;
628             }
629             if(numKeywords == maxKeywords) {
630                 *status = U_INTERNAL_PROGRAM_ERROR;
631                 return 0;
632             }
633             equalSign = uprv_strchr(pos, '=');
634             semicolon = uprv_strchr(pos, ';');
635             /* lack of '=' [foo@currency] is illegal */
636             /* ';' before '=' [foo@currency;collation=pinyin] is illegal */
637             if(!equalSign || (semicolon && semicolon<equalSign)) {
638                 *status = U_INVALID_FORMAT_ERROR;
639                 return 0;
640             }
641             /* need to normalize both keyword and keyword name */
642             if(equalSign - pos >= ULOC_KEYWORD_BUFFER_LEN) {
643                 /* keyword name too long for internal buffer */
644                 *status = U_INTERNAL_PROGRAM_ERROR;
645                 return 0;
646             }
647             for(i = 0, n = 0; i < equalSign - pos; ++i) {
648                 if (pos[i] != ' ') {
649                     keywordList[numKeywords].keyword[n++] = uprv_tolower(pos[i]);
650                 }
651             }
652 
653             /* zero-length keyword is an error. */
654             if (n == 0) {
655                 *status = U_INVALID_FORMAT_ERROR;
656                 return 0;
657             }
658 
659             keywordList[numKeywords].keyword[n] = 0;
660             keywordList[numKeywords].keywordLen = n;
661             /* now grab the value part. First we skip the '=' */
662             equalSign++;
663             /* then we leading spaces */
664             while(*equalSign == ' ') {
665                 equalSign++;
666             }
667 
668             /* Premature end or zero-length value */
669             if (!*equalSign || equalSign == semicolon) {
670                 *status = U_INVALID_FORMAT_ERROR;
671                 return 0;
672             }
673 
674             keywordList[numKeywords].valueStart = equalSign;
675 
676             pos = semicolon;
677             i = 0;
678             if(pos) {
679                 while(*(pos - i - 1) == ' ') {
680                     i++;
681                 }
682                 keywordList[numKeywords].valueLen = (int32_t)(pos - equalSign - i);
683                 pos++;
684             } else {
685                 i = (int32_t)uprv_strlen(equalSign);
686                 while(i && equalSign[i-1] == ' ') {
687                     i--;
688                 }
689                 keywordList[numKeywords].valueLen = i;
690             }
691             /* If this is a duplicate keyword, then ignore it */
692             for (j=0; j<numKeywords; ++j) {
693                 if (uprv_strcmp(keywordList[j].keyword, keywordList[numKeywords].keyword) == 0) {
694                     duplicate = TRUE;
695                     break;
696                 }
697             }
698             if (!duplicate) {
699                 ++numKeywords;
700             }
701         } while(pos);
702 
703         /* now we have a list of keywords */
704         /* we need to sort it */
705         uprv_sortArray(keywordList, numKeywords, sizeof(KeywordStruct), compareKeywordStructs, NULL, FALSE, status);
706 
707         /* Now construct the keyword part */
708         for(i = 0; i < numKeywords; i++) {
709             if(keywordsLen + keywordList[i].keywordLen + 1< keywordCapacity) {
710                 uprv_strcpy(keywords+keywordsLen, keywordList[i].keyword);
711                 if(valuesToo) {
712                     keywords[keywordsLen + keywordList[i].keywordLen] = '=';
713                 } else {
714                     keywords[keywordsLen + keywordList[i].keywordLen] = 0;
715                 }
716             }
717             keywordsLen += keywordList[i].keywordLen + 1;
718             if(valuesToo) {
719                 if(keywordsLen + keywordList[i].valueLen <= keywordCapacity) {
720                     uprv_strncpy(keywords+keywordsLen, keywordList[i].valueStart, keywordList[i].valueLen);
721                 }
722                 keywordsLen += keywordList[i].valueLen;
723 
724                 if(i < numKeywords - 1) {
725                     if(keywordsLen < keywordCapacity) {
726                         keywords[keywordsLen] = ';';
727                     }
728                     keywordsLen++;
729                 }
730             }
731             if(values) {
732                 if(valuesLen + keywordList[i].valueLen + 1< valuesCapacity) {
733                     uprv_strcpy(values+valuesLen, keywordList[i].valueStart);
734                     values[valuesLen + keywordList[i].valueLen] = 0;
735                 }
736                 valuesLen += keywordList[i].valueLen + 1;
737             }
738         }
739         if(values) {
740             values[valuesLen] = 0;
741             if(valLen) {
742                 *valLen = valuesLen;
743             }
744         }
745         return u_terminateChars(keywords, keywordCapacity, keywordsLen, status);
746     } else {
747         return 0;
748     }
749 }
750 
751 U_CFUNC int32_t
locale_getKeywords(const char * localeID,char prev,char * keywords,int32_t keywordCapacity,char * values,int32_t valuesCapacity,int32_t * valLen,UBool valuesToo,UErrorCode * status)752 locale_getKeywords(const char *localeID,
753                    char prev,
754                    char *keywords, int32_t keywordCapacity,
755                    char *values, int32_t valuesCapacity, int32_t *valLen,
756                    UBool valuesToo,
757                    UErrorCode *status) {
758     return _getKeywords(localeID, prev, keywords, keywordCapacity,
759                         values, valuesCapacity, valLen, valuesToo,
760                         status);
761 }
762 
763 U_CAPI int32_t U_EXPORT2
uloc_getKeywordValue(const char * localeID,const char * keywordName,char * buffer,int32_t bufferCapacity,UErrorCode * status)764 uloc_getKeywordValue(const char* localeID,
765                      const char* keywordName,
766                      char* buffer, int32_t bufferCapacity,
767                      UErrorCode* status)
768 {
769     if (buffer != nullptr) {
770         buffer[0] = '\0';
771     }
772     const char* startSearchHere = NULL;
773     const char* nextSeparator = NULL;
774     char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
775     char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
776     int32_t result = 0;
777 
778     if(status && U_SUCCESS(*status) && localeID) {
779       char tempBuffer[ULOC_FULLNAME_CAPACITY];
780       const char* tmpLocaleID;
781 
782       if (keywordName == NULL || keywordName[0] == 0) {
783         *status = U_ILLEGAL_ARGUMENT_ERROR;
784         return 0;
785       }
786 
787       locale_canonKeywordName(keywordNameBuffer, keywordName, status);
788       if(U_FAILURE(*status)) {
789         return 0;
790       }
791 
792       if (_hasBCP47Extension(localeID)) {
793           _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
794       } else {
795           tmpLocaleID=localeID;
796       }
797 
798       startSearchHere = locale_getKeywordsStart(tmpLocaleID);
799       if(startSearchHere == NULL) {
800           /* no keywords, return at once */
801           return 0;
802       }
803 
804       /* find the first keyword */
805       while(startSearchHere) {
806           const char* keyValueTail;
807           int32_t keyValueLen;
808 
809           startSearchHere++; /* skip @ or ; */
810           nextSeparator = uprv_strchr(startSearchHere, '=');
811           if(!nextSeparator) {
812               *status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value */
813               return 0;
814           }
815           /* strip leading & trailing spaces (TC decided to tolerate these) */
816           while(*startSearchHere == ' ') {
817               startSearchHere++;
818           }
819           keyValueTail = nextSeparator;
820           while (keyValueTail > startSearchHere && *(keyValueTail-1) == ' ') {
821               keyValueTail--;
822           }
823           /* now keyValueTail points to first char after the keyName */
824           /* copy & normalize keyName from locale */
825           if (startSearchHere == keyValueTail) {
826               *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale */
827               return 0;
828           }
829           keyValueLen = 0;
830           while (startSearchHere < keyValueTail) {
831             if (!UPRV_ISALPHANUM(*startSearchHere)) {
832               *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
833               return 0;
834             }
835             if (keyValueLen < ULOC_KEYWORD_BUFFER_LEN - 1) {
836               localeKeywordNameBuffer[keyValueLen++] = uprv_tolower(*startSearchHere++);
837             } else {
838               /* keyword name too long for internal buffer */
839               *status = U_INTERNAL_PROGRAM_ERROR;
840               return 0;
841             }
842           }
843           localeKeywordNameBuffer[keyValueLen] = 0; /* terminate */
844 
845           startSearchHere = uprv_strchr(nextSeparator, ';');
846 
847           if(uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer) == 0) {
848                /* current entry matches the keyword. */
849              nextSeparator++; /* skip '=' */
850               /* First strip leading & trailing spaces (TC decided to tolerate these) */
851               while(*nextSeparator == ' ') {
852                 nextSeparator++;
853               }
854               keyValueTail = (startSearchHere)? startSearchHere: nextSeparator + uprv_strlen(nextSeparator);
855               while(keyValueTail > nextSeparator && *(keyValueTail-1) == ' ') {
856                 keyValueTail--;
857               }
858               /* Now copy the value, but check well-formedness */
859               if (nextSeparator == keyValueTail) {
860                 *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value name in passed-in locale */
861                 return 0;
862               }
863               keyValueLen = 0;
864               while (nextSeparator < keyValueTail) {
865                 if (!UPRV_ISALPHANUM(*nextSeparator) && !UPRV_OK_VALUE_PUNCTUATION(*nextSeparator)) {
866                   *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value */
867                   return 0;
868                 }
869                 if (keyValueLen < bufferCapacity) {
870                   /* Should we lowercase value to return here? Tests expect as-is. */
871                   buffer[keyValueLen++] = *nextSeparator++;
872                 } else { /* keep advancing so we return correct length in case of overflow */
873                   keyValueLen++;
874                   nextSeparator++;
875                 }
876               }
877               result = u_terminateChars(buffer, bufferCapacity, keyValueLen, status);
878               return result;
879           }
880       }
881     }
882     return 0;
883 }
884 
885 U_CAPI int32_t U_EXPORT2
uloc_setKeywordValue(const char * keywordName,const char * keywordValue,char * buffer,int32_t bufferCapacity,UErrorCode * status)886 uloc_setKeywordValue(const char* keywordName,
887                      const char* keywordValue,
888                      char* buffer, int32_t bufferCapacity,
889                      UErrorCode* status)
890 {
891     /* TODO: sorting. removal. */
892     int32_t keywordNameLen;
893     int32_t keywordValueLen;
894     int32_t bufLen;
895     int32_t needLen = 0;
896     char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
897     char keywordValueBuffer[ULOC_KEYWORDS_CAPACITY+1];
898     char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
899     int32_t rc;
900     char* nextSeparator = NULL;
901     char* nextEqualsign = NULL;
902     char* startSearchHere = NULL;
903     char* keywordStart = NULL;
904     CharString updatedKeysAndValues;
905     int32_t updatedKeysAndValuesLen;
906     UBool handledInputKeyAndValue = FALSE;
907     char keyValuePrefix = '@';
908 
909     if(U_FAILURE(*status)) {
910         return -1;
911     }
912     if (keywordName == NULL || keywordName[0] == 0 || bufferCapacity <= 1) {
913         *status = U_ILLEGAL_ARGUMENT_ERROR;
914         return 0;
915     }
916     bufLen = (int32_t)uprv_strlen(buffer);
917     if(bufferCapacity<bufLen) {
918         /* The capacity is less than the length?! Is this NULL terminated? */
919         *status = U_ILLEGAL_ARGUMENT_ERROR;
920         return 0;
921     }
922     keywordNameLen = locale_canonKeywordName(keywordNameBuffer, keywordName, status);
923     if(U_FAILURE(*status)) {
924         return 0;
925     }
926 
927     keywordValueLen = 0;
928     if(keywordValue) {
929         while (*keywordValue != 0) {
930             if (!UPRV_ISALPHANUM(*keywordValue) && !UPRV_OK_VALUE_PUNCTUATION(*keywordValue)) {
931                 *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value */
932                 return 0;
933             }
934             if (keywordValueLen < ULOC_KEYWORDS_CAPACITY) {
935                 /* Should we force lowercase in value to set? */
936                 keywordValueBuffer[keywordValueLen++] = *keywordValue++;
937             } else {
938                 /* keywordValue too long for internal buffer */
939                 *status = U_INTERNAL_PROGRAM_ERROR;
940                 return 0;
941             }
942         }
943     }
944     keywordValueBuffer[keywordValueLen] = 0; /* terminate */
945 
946     startSearchHere = (char*)locale_getKeywordsStart(buffer);
947     if(startSearchHere == NULL || (startSearchHere[1]==0)) {
948         if(keywordValueLen == 0) { /* no keywords = nothing to remove */
949             return bufLen;
950         }
951 
952         needLen = bufLen+1+keywordNameLen+1+keywordValueLen;
953         if(startSearchHere) { /* had a single @ */
954             needLen--; /* already had the @ */
955             /* startSearchHere points at the @ */
956         } else {
957             startSearchHere=buffer+bufLen;
958         }
959         if(needLen >= bufferCapacity) {
960             *status = U_BUFFER_OVERFLOW_ERROR;
961             return needLen; /* no change */
962         }
963         *startSearchHere++ = '@';
964         uprv_strcpy(startSearchHere, keywordNameBuffer);
965         startSearchHere += keywordNameLen;
966         *startSearchHere++ = '=';
967         uprv_strcpy(startSearchHere, keywordValueBuffer);
968         return needLen;
969     } /* end shortcut - no @ */
970 
971     keywordStart = startSearchHere;
972     /* search for keyword */
973     while(keywordStart) {
974         const char* keyValueTail;
975         int32_t keyValueLen;
976 
977         keywordStart++; /* skip @ or ; */
978         nextEqualsign = uprv_strchr(keywordStart, '=');
979         if (!nextEqualsign) {
980             *status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value */
981             return 0;
982         }
983         /* strip leading & trailing spaces (TC decided to tolerate these) */
984         while(*keywordStart == ' ') {
985             keywordStart++;
986         }
987         keyValueTail = nextEqualsign;
988         while (keyValueTail > keywordStart && *(keyValueTail-1) == ' ') {
989             keyValueTail--;
990         }
991         /* now keyValueTail points to first char after the keyName */
992         /* copy & normalize keyName from locale */
993         if (keywordStart == keyValueTail) {
994             *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale */
995             return 0;
996         }
997         keyValueLen = 0;
998         while (keywordStart < keyValueTail) {
999             if (!UPRV_ISALPHANUM(*keywordStart)) {
1000                 *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
1001                 return 0;
1002             }
1003             if (keyValueLen < ULOC_KEYWORD_BUFFER_LEN - 1) {
1004                 localeKeywordNameBuffer[keyValueLen++] = uprv_tolower(*keywordStart++);
1005             } else {
1006                 /* keyword name too long for internal buffer */
1007                 *status = U_INTERNAL_PROGRAM_ERROR;
1008                 return 0;
1009             }
1010         }
1011         localeKeywordNameBuffer[keyValueLen] = 0; /* terminate */
1012 
1013         nextSeparator = uprv_strchr(nextEqualsign, ';');
1014 
1015         /* start processing the value part */
1016         nextEqualsign++; /* skip '=' */
1017         /* First strip leading & trailing spaces (TC decided to tolerate these) */
1018         while(*nextEqualsign == ' ') {
1019             nextEqualsign++;
1020         }
1021         keyValueTail = (nextSeparator)? nextSeparator: nextEqualsign + uprv_strlen(nextEqualsign);
1022         while(keyValueTail > nextEqualsign && *(keyValueTail-1) == ' ') {
1023             keyValueTail--;
1024         }
1025         if (nextEqualsign == keyValueTail) {
1026             *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value in passed-in locale */
1027             return 0;
1028         }
1029 
1030         rc = uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer);
1031         if(rc == 0) {
1032             /* Current entry matches the input keyword. Update the entry */
1033             if(keywordValueLen > 0) { /* updating a value */
1034                 updatedKeysAndValues.append(keyValuePrefix, *status);
1035                 keyValuePrefix = ';'; /* for any subsequent key-value pair */
1036                 updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
1037                 updatedKeysAndValues.append('=', *status);
1038                 updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
1039             } /* else removing this entry, don't emit anything */
1040             handledInputKeyAndValue = TRUE;
1041         } else {
1042            /* input keyword sorts earlier than current entry, add before current entry */
1043             if (rc < 0 && keywordValueLen > 0 && !handledInputKeyAndValue) {
1044                 /* insert new entry at this location */
1045                 updatedKeysAndValues.append(keyValuePrefix, *status);
1046                 keyValuePrefix = ';'; /* for any subsequent key-value pair */
1047                 updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
1048                 updatedKeysAndValues.append('=', *status);
1049                 updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
1050                 handledInputKeyAndValue = TRUE;
1051             }
1052             /* copy the current entry */
1053             updatedKeysAndValues.append(keyValuePrefix, *status);
1054             keyValuePrefix = ';'; /* for any subsequent key-value pair */
1055             updatedKeysAndValues.append(localeKeywordNameBuffer, keyValueLen, *status);
1056             updatedKeysAndValues.append('=', *status);
1057             updatedKeysAndValues.append(nextEqualsign, static_cast<int32_t>(keyValueTail-nextEqualsign), *status);
1058         }
1059         if (!nextSeparator && keywordValueLen > 0 && !handledInputKeyAndValue) {
1060             /* append new entry at the end, it sorts later than existing entries */
1061             updatedKeysAndValues.append(keyValuePrefix, *status);
1062             /* skip keyValuePrefix update, no subsequent key-value pair */
1063             updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
1064             updatedKeysAndValues.append('=', *status);
1065             updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
1066             handledInputKeyAndValue = TRUE;
1067         }
1068         keywordStart = nextSeparator;
1069     } /* end loop searching */
1070 
1071     /* Any error from updatedKeysAndValues.append above would be internal and not due to
1072      * problems with the passed-in locale. So if we did encounter problems with the
1073      * passed-in locale above, those errors took precedence and overrode any error
1074      * status from updatedKeysAndValues.append, and also caused a return of 0. If there
1075      * are errors here they are from updatedKeysAndValues.append; they do cause an
1076      * error return but the passed-in locale is unmodified and the original bufLen is
1077      * returned.
1078      */
1079     if (!handledInputKeyAndValue || U_FAILURE(*status)) {
1080         /* if input key/value specified removal of a keyword not present in locale, or
1081          * there was an error in CharString.append, leave original locale alone. */
1082         return bufLen;
1083     }
1084 
1085     updatedKeysAndValuesLen = updatedKeysAndValues.length();
1086     /* needLen = length of the part before '@' + length of updated key-value part including '@' */
1087     needLen = (int32_t)(startSearchHere - buffer) + updatedKeysAndValuesLen;
1088     if(needLen >= bufferCapacity) {
1089         *status = U_BUFFER_OVERFLOW_ERROR;
1090         return needLen; /* no change */
1091     }
1092     if (updatedKeysAndValuesLen > 0) {
1093         uprv_strncpy(startSearchHere, updatedKeysAndValues.data(), updatedKeysAndValuesLen);
1094     }
1095     buffer[needLen]=0;
1096     return needLen;
1097 }
1098 
1099 /* ### ID parsing implementation **************************************************/
1100 
1101 #define _isPrefixLetter(a) ((a=='x')||(a=='X')||(a=='i')||(a=='I'))
1102 
1103 /*returns TRUE if one of the special prefixes is here (s=string)
1104   'x-' or 'i-' */
1105 #define _isIDPrefix(s) (_isPrefixLetter(s[0])&&_isIDSeparator(s[1]))
1106 
/* A dot also terminates an ID field: in the POSIX form the dot precedes the
 * codepage, and only the variant (after '@') may follow it.
 */
1110 #define _isTerminator(a)  ((a==0)||(a=='.')||(a=='@'))
1111 
1112 /**
1113  * Lookup 'key' in the array 'list'.  The array 'list' should contain
1114  * a NULL entry, followed by more entries, and a second NULL entry.
1115  *
1116  * The 'list' param should be LANGUAGES, LANGUAGES_3, COUNTRIES, or
1117  * COUNTRIES_3.
1118  */
_findIndex(const char * const * list,const char * key)1119 static int16_t _findIndex(const char* const* list, const char* key)
1120 {
1121     const char* const* anchor = list;
1122     int32_t pass = 0;
1123 
1124     /* Make two passes through two NULL-terminated arrays at 'list' */
1125     while (pass++ < 2) {
1126         while (*list) {
1127             if (uprv_strcmp(key, *list) == 0) {
1128                 return (int16_t)(list - anchor);
1129             }
1130             list++;
1131         }
1132         ++list;     /* skip final NULL *CWB*/
1133     }
1134     return -1;
1135 }
1136 
1137 /* count the length of src while copying it to dest; return strlen(src) */
1138 static inline int32_t
_copyCount(char * dest,int32_t destCapacity,const char * src)1139 _copyCount(char *dest, int32_t destCapacity, const char *src) {
1140     const char *anchor;
1141     char c;
1142 
1143     anchor=src;
1144     for(;;) {
1145         if((c=*src)==0) {
1146             return (int32_t)(src-anchor);
1147         }
1148         if(destCapacity<=0) {
1149             return (int32_t)((src-anchor)+uprv_strlen(src));
1150         }
1151         ++src;
1152         *dest++=c;
1153         --destCapacity;
1154     }
1155 }
1156 
1157 U_CFUNC const char*
uloc_getCurrentCountryID(const char * oldID)1158 uloc_getCurrentCountryID(const char* oldID){
1159     int32_t offset = _findIndex(DEPRECATED_COUNTRIES, oldID);
1160     if (offset >= 0) {
1161         return REPLACEMENT_COUNTRIES[offset];
1162     }
1163     return oldID;
1164 }
1165 U_CFUNC const char*
uloc_getCurrentLanguageID(const char * oldID)1166 uloc_getCurrentLanguageID(const char* oldID){
1167     int32_t offset = _findIndex(DEPRECATED_LANGUAGES, oldID);
1168     if (offset >= 0) {
1169         return REPLACEMENT_LANGUAGES[offset];
1170     }
1171     return oldID;
1172 }
/*
 * The internal functions ulocimp_getLanguage(), ulocimp_getScript() and
 * ulocimp_getCountry() avoid duplicating the code that handles the earlier
 * locale ID pieces in the functions for the later ones: each sets the *pEnd
 * pointer to where it stopped parsing, so the next one can continue from there.
 *
 * TODO try to use this in Locale
 */
1181 U_CFUNC int32_t
ulocimp_getLanguage(const char * localeID,char * language,int32_t languageCapacity,const char ** pEnd)1182 ulocimp_getLanguage(const char *localeID,
1183                     char *language, int32_t languageCapacity,
1184                     const char **pEnd) {
1185     int32_t i=0;
1186     int32_t offset;
1187     char lang[4]={ 0, 0, 0, 0 }; /* temporary buffer to hold language code for searching */
1188 
1189     if (uprv_stricmp(localeID, "root") == 0) {
1190         localeID += 4;
1191     } else if (uprv_strnicmp(localeID, "und", 3) == 0 &&
1192                (localeID[3] == '\0' ||
1193                 localeID[3] == '-' ||
1194                 localeID[3] == '_' ||
1195                 localeID[3] == '@')) {
1196         localeID += 3;
1197     }
1198 
1199     /* if it starts with i- or x- then copy that prefix */
1200     if(_isIDPrefix(localeID)) {
1201         if(i<languageCapacity) {
1202             language[i]=(char)uprv_tolower(*localeID);
1203         }
1204         if(i<languageCapacity) {
1205             language[i+1]='-';
1206         }
1207         i+=2;
1208         localeID+=2;
1209     }
1210 
1211     /* copy the language as far as possible and count its length */
1212     while(!_isTerminator(*localeID) && !_isIDSeparator(*localeID)) {
1213         if(i<languageCapacity) {
1214             language[i]=(char)uprv_tolower(*localeID);
1215         }
1216         if(i<3) {
1217             U_ASSERT(i>=0);
1218             lang[i]=(char)uprv_tolower(*localeID);
1219         }
1220         i++;
1221         localeID++;
1222     }
1223 
1224     if(i==3) {
1225         /* convert 3 character code to 2 character code if possible *CWB*/
1226         offset=_findIndex(LANGUAGES_3, lang);
1227         if(offset>=0) {
1228             i=_copyCount(language, languageCapacity, LANGUAGES[offset]);
1229         }
1230     }
1231 
1232     if(pEnd!=NULL) {
1233         *pEnd=localeID;
1234     }
1235     return i;
1236 }
1237 
1238 U_CFUNC int32_t
ulocimp_getScript(const char * localeID,char * script,int32_t scriptCapacity,const char ** pEnd)1239 ulocimp_getScript(const char *localeID,
1240                   char *script, int32_t scriptCapacity,
1241                   const char **pEnd)
1242 {
1243     int32_t idLen = 0;
1244 
1245     if (pEnd != NULL) {
1246         *pEnd = localeID;
1247     }
1248 
1249     /* copy the second item as far as possible and count its length */
1250     while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])
1251             && uprv_isASCIILetter(localeID[idLen])) {
1252         idLen++;
1253     }
1254 
1255     /* If it's exactly 4 characters long, then it's a script and not a country. */
1256     if (idLen == 4) {
1257         int32_t i;
1258         if (pEnd != NULL) {
1259             *pEnd = localeID+idLen;
1260         }
1261         if(idLen > scriptCapacity) {
1262             idLen = scriptCapacity;
1263         }
1264         if (idLen >= 1) {
1265             script[0]=(char)uprv_toupper(*(localeID++));
1266         }
1267         for (i = 1; i < idLen; i++) {
1268             script[i]=(char)uprv_tolower(*(localeID++));
1269         }
1270     }
1271     else {
1272         idLen = 0;
1273     }
1274     return idLen;
1275 }
1276 
1277 U_CFUNC int32_t
ulocimp_getCountry(const char * localeID,char * country,int32_t countryCapacity,const char ** pEnd)1278 ulocimp_getCountry(const char *localeID,
1279                    char *country, int32_t countryCapacity,
1280                    const char **pEnd)
1281 {
1282     int32_t idLen=0;
1283     char cnty[ULOC_COUNTRY_CAPACITY]={ 0, 0, 0, 0 };
1284     int32_t offset;
1285 
1286     /* copy the country as far as possible and count its length */
1287     while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])) {
1288         if(idLen<(ULOC_COUNTRY_CAPACITY-1)) {   /*CWB*/
1289             cnty[idLen]=(char)uprv_toupper(localeID[idLen]);
1290         }
1291         idLen++;
1292     }
1293 
1294     /* the country should be either length 2 or 3 */
1295     if (idLen == 2 || idLen == 3) {
1296         UBool gotCountry = FALSE;
1297         /* convert 3 character code to 2 character code if possible *CWB*/
1298         if(idLen==3) {
1299             offset=_findIndex(COUNTRIES_3, cnty);
1300             if(offset>=0) {
1301                 idLen=_copyCount(country, countryCapacity, COUNTRIES[offset]);
1302                 gotCountry = TRUE;
1303             }
1304         }
1305         if (!gotCountry) {
1306             int32_t i = 0;
1307             for (i = 0; i < idLen; i++) {
1308                 if (i < countryCapacity) {
1309                     country[i]=(char)uprv_toupper(localeID[i]);
1310                 }
1311             }
1312         }
1313         localeID+=idLen;
1314     } else {
1315         idLen = 0;
1316     }
1317 
1318     if(pEnd!=NULL) {
1319         *pEnd=localeID;
1320     }
1321 
1322     return idLen;
1323 }
1324 
1325 /**
1326  * @param needSeparator if true, then add leading '_' if any variants
1327  * are added to 'variant'
1328  */
1329 static int32_t
_getVariantEx(const char * localeID,char prev,char * variant,int32_t variantCapacity,UBool needSeparator)1330 _getVariantEx(const char *localeID,
1331               char prev,
1332               char *variant, int32_t variantCapacity,
1333               UBool needSeparator) {
1334     int32_t i=0;
1335 
1336     /* get one or more variant tags and separate them with '_' */
1337     if(_isIDSeparator(prev)) {
1338         /* get a variant string after a '-' or '_' */
1339         while(!_isTerminator(*localeID)) {
1340             if (needSeparator) {
1341                 if (i<variantCapacity) {
1342                     variant[i] = '_';
1343                 }
1344                 ++i;
1345                 needSeparator = FALSE;
1346             }
1347             if(i<variantCapacity) {
1348                 variant[i]=(char)uprv_toupper(*localeID);
1349                 if(variant[i]=='-') {
1350                     variant[i]='_';
1351                 }
1352             }
1353             i++;
1354             localeID++;
1355         }
1356     }
1357 
1358     /* if there is no variant tag after a '-' or '_' then look for '@' */
1359     if(i==0) {
1360         if(prev=='@') {
1361             /* keep localeID */
1362         } else if((localeID=locale_getKeywordsStart(localeID))!=NULL) {
1363             ++localeID; /* point after the '@' */
1364         } else {
1365             return 0;
1366         }
1367         while(!_isTerminator(*localeID)) {
1368             if (needSeparator) {
1369                 if (i<variantCapacity) {
1370                     variant[i] = '_';
1371                 }
1372                 ++i;
1373                 needSeparator = FALSE;
1374             }
1375             if(i<variantCapacity) {
1376                 variant[i]=(char)uprv_toupper(*localeID);
1377                 if(variant[i]=='-' || variant[i]==',') {
1378                     variant[i]='_';
1379                 }
1380             }
1381             i++;
1382             localeID++;
1383         }
1384     }
1385 
1386     return i;
1387 }
1388 
1389 static int32_t
_getVariant(const char * localeID,char prev,char * variant,int32_t variantCapacity)1390 _getVariant(const char *localeID,
1391             char prev,
1392             char *variant, int32_t variantCapacity) {
1393     return _getVariantEx(localeID, prev, variant, variantCapacity, FALSE);
1394 }
1395 
1396 /* Keyword enumeration */
1397 
/* State stored in the 'context' of a keyword UEnumeration created by
 * uloc_openKeywordList(). */
typedef struct UKeywordsContext {
    /* heap copy of the keyword list: NUL-terminated names laid end to end,
     * closed by an empty string; freed in uloc_kw_closeKeywords() */
    char* keywords;
    /* the name the next call of uloc_kw_nextKeyword() returns; points into 'keywords' */
    char* current;
} UKeywordsContext;
1402 
1403 U_CDECL_BEGIN
1404 
1405 static void U_CALLCONV
uloc_kw_closeKeywords(UEnumeration * enumerator)1406 uloc_kw_closeKeywords(UEnumeration *enumerator) {
1407     uprv_free(((UKeywordsContext *)enumerator->context)->keywords);
1408     uprv_free(enumerator->context);
1409     uprv_free(enumerator);
1410 }
1411 
1412 static int32_t U_CALLCONV
uloc_kw_countKeywords(UEnumeration * en,UErrorCode *)1413 uloc_kw_countKeywords(UEnumeration *en, UErrorCode * /*status*/) {
1414     char *kw = ((UKeywordsContext *)en->context)->keywords;
1415     int32_t result = 0;
1416     while(*kw) {
1417         result++;
1418         kw += uprv_strlen(kw)+1;
1419     }
1420     return result;
1421 }
1422 
1423 static const char * U_CALLCONV
uloc_kw_nextKeyword(UEnumeration * en,int32_t * resultLength,UErrorCode *)1424 uloc_kw_nextKeyword(UEnumeration* en,
1425                     int32_t* resultLength,
1426                     UErrorCode* /*status*/) {
1427     const char* result = ((UKeywordsContext *)en->context)->current;
1428     int32_t len = 0;
1429     if(*result) {
1430         len = (int32_t)uprv_strlen(((UKeywordsContext *)en->context)->current);
1431         ((UKeywordsContext *)en->context)->current += len+1;
1432     } else {
1433         result = NULL;
1434     }
1435     if (resultLength) {
1436         *resultLength = len;
1437     }
1438     return result;
1439 }
1440 
1441 static void U_CALLCONV
uloc_kw_resetKeywords(UEnumeration * en,UErrorCode *)1442 uloc_kw_resetKeywords(UEnumeration* en,
1443                       UErrorCode* /*status*/) {
1444     ((UKeywordsContext *)en->context)->current = ((UKeywordsContext *)en->context)->keywords;
1445 }
1446 
1447 U_CDECL_END
1448 
1449 
/* Template enumeration: uloc_openKeywordList() copies this structure into
   each enumeration it creates and then stores the per-instance
   UKeywordsContext in the copy's 'context' member. */
static const UEnumeration gKeywordsEnum = {
    NULL,                   /* presumably baseContext (unused) - confirm against uenumimp.h */
    NULL,                   /* presumably context - filled in per instance by uloc_openKeywordList() */
    uloc_kw_closeKeywords,
    uloc_kw_countKeywords,
    uenum_unextDefault,
    uloc_kw_nextKeyword,
    uloc_kw_resetKeywords
};
1459 
1460 U_CAPI UEnumeration* U_EXPORT2
uloc_openKeywordList(const char * keywordList,int32_t keywordListSize,UErrorCode * status)1461 uloc_openKeywordList(const char *keywordList, int32_t keywordListSize, UErrorCode* status)
1462 {
1463     LocalMemory<UKeywordsContext> myContext;
1464     LocalMemory<UEnumeration> result;
1465 
1466     if (U_FAILURE(*status)) {
1467         return nullptr;
1468     }
1469     myContext.adoptInstead(static_cast<UKeywordsContext *>(uprv_malloc(sizeof(UKeywordsContext))));
1470     result.adoptInstead(static_cast<UEnumeration *>(uprv_malloc(sizeof(UEnumeration))));
1471     if (myContext.isNull() || result.isNull()) {
1472         *status = U_MEMORY_ALLOCATION_ERROR;
1473         return nullptr;
1474     }
1475     uprv_memcpy(result.getAlias(), &gKeywordsEnum, sizeof(UEnumeration));
1476     myContext->keywords = static_cast<char *>(uprv_malloc(keywordListSize+1));
1477     if (myContext->keywords == nullptr) {
1478         *status = U_MEMORY_ALLOCATION_ERROR;
1479         return nullptr;
1480     }
1481     uprv_memcpy(myContext->keywords, keywordList, keywordListSize);
1482     myContext->keywords[keywordListSize] = 0;
1483     myContext->current = myContext->keywords;
1484     result->context = myContext.orphan();
1485     return result.orphan();
1486 }
1487 
1488 U_CAPI UEnumeration* U_EXPORT2
uloc_openKeywords(const char * localeID,UErrorCode * status)1489 uloc_openKeywords(const char* localeID,
1490                         UErrorCode* status)
1491 {
1492     int32_t i=0;
1493     char keywords[256];
1494     int32_t keywordsCapacity = 256;
1495     char tempBuffer[ULOC_FULLNAME_CAPACITY];
1496     const char* tmpLocaleID;
1497 
1498     if(status==NULL || U_FAILURE(*status)) {
1499         return 0;
1500     }
1501 
1502     if (_hasBCP47Extension(localeID)) {
1503         _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
1504     } else {
1505         if (localeID==NULL) {
1506            localeID=uloc_getDefault();
1507         }
1508         tmpLocaleID=localeID;
1509     }
1510 
1511     /* Skip the language */
1512     ulocimp_getLanguage(tmpLocaleID, NULL, 0, &tmpLocaleID);
1513     if(_isIDSeparator(*tmpLocaleID)) {
1514         const char *scriptID;
1515         /* Skip the script if available */
1516         ulocimp_getScript(tmpLocaleID+1, NULL, 0, &scriptID);
1517         if(scriptID != tmpLocaleID+1) {
1518             /* Found optional script */
1519             tmpLocaleID = scriptID;
1520         }
1521         /* Skip the Country */
1522         if (_isIDSeparator(*tmpLocaleID)) {
1523             ulocimp_getCountry(tmpLocaleID+1, NULL, 0, &tmpLocaleID);
1524             if(_isIDSeparator(*tmpLocaleID)) {
1525                 _getVariant(tmpLocaleID+1, *tmpLocaleID, NULL, 0);
1526             }
1527         }
1528     }
1529 
1530     /* keywords are located after '@' */
1531     if((tmpLocaleID = locale_getKeywordsStart(tmpLocaleID)) != NULL) {
1532         i=locale_getKeywords(tmpLocaleID+1, '@', keywords, keywordsCapacity, NULL, 0, NULL, FALSE, status);
1533     }
1534 
1535     if(i) {
1536         return uloc_openKeywordList(keywords, i, status);
1537     } else {
1538         return NULL;
1539     }
1540 }
1541 
1542 
/* bit-flags for 'options' parameter of _canonicalize */
#define _ULOC_STRIP_KEYWORDS 0x2
#define _ULOC_CANONICALIZE   0x1

/* True when at least one bit of 'mask' is set in 'options'.  Both arguments
   are parenthesized so that compound expressions such as (A|B) expand with
   the intended precedence. */
#define OPTION_SET(options, mask) (((options) & (mask)) != 0)

/* The tag "i-default" as raw characters.  There is deliberately no
   terminating NUL, so I_DEFAULT_LENGTH is exactly the nine characters. */
static const char i_default[] = {'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'};
#define I_DEFAULT_LENGTH UPRV_LENGTHOF(i_default)
1551 
/**
 * Canonicalize the given localeID, to level 1 or to level 2,
 * depending on the options.  To specify level 1, pass in options=0.
 * To specify level 2, pass in options=_ULOC_CANONICALIZE.
 *
 * This is the code underlying uloc_getName, uloc_getBaseName and
 * uloc_canonicalize.
 *
 * @param localeID        the ID to process; a NULL ID that reaches the
 *                        non-BCP47 branch is replaced by uloc_getDefault()
 * @param result          output buffer, may be NULL
 * @param resultCapacity  capacity of result
 * @param options         bit set of _ULOC_CANONICALIZE and _ULOC_STRIP_KEYWORDS
 * @param err             in/out error code; if it already indicates a failure
 *                        the function returns 0 without doing anything
 * @return the value of u_terminateChars(result, resultCapacity, len, err),
 *         where len is the length accumulated for the output ID
 */
static int32_t
_canonicalize(const char* localeID,
              char* result,
              int32_t resultCapacity,
              uint32_t options,
              UErrorCode* err) {
    int32_t j, len, fieldCount=0, scriptSize=0, variantSize=0, nameCapacity;
    char localeBuffer[ULOC_FULLNAME_CAPACITY];
    char tempBuffer[ULOC_FULLNAME_CAPACITY];
    const char* origLocaleID;
    const char* tmpLocaleID;
    const char* keywordAssign = NULL;
    const char* separatorIndicator = NULL;
    char* name;
    char* variant = NULL; /* pointer into name, or NULL */

    if (U_FAILURE(*err)) {
        return 0;
    }

    if (_hasBCP47Extension(localeID)) {
        _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
    } else {
        if (localeID==NULL) {
           localeID=uloc_getDefault();
        }
        tmpLocaleID=localeID;
    }

    /* remember the start of the ID for the "i-default" comparison below */
    origLocaleID=tmpLocaleID;

    /* if we are doing a full canonicalization, then put results in
       localeBuffer, if necessary; otherwise send them to result. */
    /* Note: with the options test commented out, localeBuffer is used
       whenever result is NULL or smaller than localeBuffer, for every
       option value; the text is copied back to result at the end. */
    if (/*OPTION_SET(options, _ULOC_CANONICALIZE) &&*/
        (result == NULL || resultCapacity < (int32_t)sizeof(localeBuffer))) {
        name = localeBuffer;
        nameCapacity = (int32_t)sizeof(localeBuffer);
    } else {
        name = result;
        nameCapacity = resultCapacity;
    }

    /* Throughout, characters are stored only while len<nameCapacity, but len
       keeps counting, so it ends up as the full length that was needed. */

    /* get all pieces, one after another, and separate with '_' */
    len=ulocimp_getLanguage(tmpLocaleID, name, nameCapacity, &tmpLocaleID);

    if(len == I_DEFAULT_LENGTH && uprv_strncmp(origLocaleID, i_default, len) == 0) {
        /* "i-default" is replaced by the default locale ID */
        const char *d = uloc_getDefault();

        len = (int32_t)uprv_strlen(d);

        if (name != NULL) {
            uprv_memcpy(name, d, len);
        }
    } else if(_isIDSeparator(*tmpLocaleID)) {
        const char *scriptID;

        ++fieldCount;
        if(len<nameCapacity) {
            name[len]='_';
        }
        ++len;

        scriptSize=ulocimp_getScript(tmpLocaleID+1,
            (len<nameCapacity ? name+len : NULL), nameCapacity-len, &scriptID);
        if(scriptSize > 0) {
            /* Found optional script */
            tmpLocaleID = scriptID;
            ++fieldCount;
            len+=scriptSize;
            if (_isIDSeparator(*tmpLocaleID)) {
                /* If there is something else, then we add the _ */
                if(len<nameCapacity) {
                    name[len]='_';
                }
                ++len;
            }
        }

        if (_isIDSeparator(*tmpLocaleID)) {
            const char *cntryID;
            int32_t cntrySize = ulocimp_getCountry(tmpLocaleID+1,
                (len<nameCapacity ? name+len : NULL), nameCapacity-len, &cntryID);
            if (cntrySize > 0) {
                /* Found optional country */
                tmpLocaleID = cntryID;
                len+=cntrySize;
            }
            if(_isIDSeparator(*tmpLocaleID)) {
                /* If there is something else, then we add the _  if we found country before. */
                if (cntrySize >= 0 && ! _isIDSeparator(*(tmpLocaleID+1)) ) {
                    ++fieldCount;
                    if(len<nameCapacity) {
                        name[len]='_';
                    }
                    ++len;
                }

                variantSize = _getVariant(tmpLocaleID+1, *tmpLocaleID,
                    (len<nameCapacity ? name+len : NULL), nameCapacity-len);
                if (variantSize > 0) {
                    variant = len<nameCapacity ? name+len : NULL;
                    len += variantSize;
                    tmpLocaleID += variantSize + 1; /* skip '_' and variant */
                }
            }
        }
    }

    /* Copy POSIX-style charset specifier, if any [mr.utf8] */
    /* (level 1 only; copies everything from '.' up to '@' or end of string) */
    if (!OPTION_SET(options, _ULOC_CANONICALIZE) && *tmpLocaleID == '.') {
        UBool done = FALSE;
        do {
            char c = *tmpLocaleID;
            switch (c) {
            case 0:
            case '@':
                done = TRUE;
                break;
            default:
                if (len<nameCapacity) {
                    name[len] = c;
                }
                ++len;
                ++tmpLocaleID;
                break;
            }
        } while (!done);
    }

    /* Scan ahead to next '@' and determine if it is followed by '=' and/or ';'
       After this, tmpLocaleID either points to '@' or is NULL */
    if ((tmpLocaleID=locale_getKeywordsStart(tmpLocaleID))!=NULL) {
        keywordAssign = uprv_strchr(tmpLocaleID, '=');
        separatorIndicator = uprv_strchr(tmpLocaleID, ';');
    }

    /* Copy POSIX-style variant, if any [mr@FOO] */
    /* (level 1 only; an '@' section without '=' is copied verbatim,
       including the '@' itself) */
    if (!OPTION_SET(options, _ULOC_CANONICALIZE) &&
        tmpLocaleID != NULL && keywordAssign == NULL) {
        for (;;) {
            char c = *tmpLocaleID;
            if (c == 0) {
                break;
            }
            if (len<nameCapacity) {
                name[len] = c;
            }
            ++len;
            ++tmpLocaleID;
        }
    }

    if (OPTION_SET(options, _ULOC_CANONICALIZE)) {
        /* Handle @FOO variant if @ is present and not followed by = */
        if (tmpLocaleID!=NULL && keywordAssign==NULL) {
            int32_t posixVariantSize;
            /* Add missing '_' if needed */
            if (fieldCount < 2 || (fieldCount < 3 && scriptSize > 0)) {
                do {
                    if(len<nameCapacity) {
                        name[len]='_';
                    }
                    ++len;
                    ++fieldCount;
                } while(fieldCount<2);
            }
            /* NOTE(review): unlike the other field calls above, name+len and
               nameCapacity-len are passed here without the len<nameCapacity
               guard - confirm _getVariantEx tolerates a non-positive capacity. */
            posixVariantSize = _getVariantEx(tmpLocaleID+1, '@', name+len, nameCapacity-len,
                                             (UBool)(variantSize > 0));
            if (posixVariantSize > 0) {
                if (variant == NULL) {
                    variant = name+len;
                }
                len += posixVariantSize;
                variantSize += posixVariantSize;
            }
        }

        /* Look up the ID in the canonicalization map */
        /* (exact match on the len characters built so far; on a hit the name
           is replaced by the mapped canonical ID) */
        for (j=0; j<UPRV_LENGTHOF(CANONICALIZE_MAP); j++) {
            const char* id = CANONICALIZE_MAP[j].id;
            int32_t n = (int32_t)uprv_strlen(id);
            if (len == n && uprv_strncmp(name, id, n) == 0) {
                if (n == 0 && tmpLocaleID != NULL) {
                    break; /* Don't remap "" if keywords present */
                }
                len = _copyCount(name, nameCapacity, CANONICALIZE_MAP[j].canonicalID);
                break;
            }
        }
    }

    /* Append "@keywords" unless stripping was requested.  Keywords are only
       emitted when an '=' exists and no ';' precedes the first '='. */
    if (!OPTION_SET(options, _ULOC_STRIP_KEYWORDS)) {
        if (tmpLocaleID!=NULL && keywordAssign!=NULL &&
            (!separatorIndicator || separatorIndicator > keywordAssign)) {
            if(len<nameCapacity) {
                name[len]='@';
            }
            ++len;
            ++fieldCount;
            len += _getKeywords(tmpLocaleID+1, '@', (len<nameCapacity ? name+len : NULL), nameCapacity-len,
                                NULL, 0, NULL, TRUE, err);
        }
    }

    /* If the work was done in the scratch buffer, copy as much as fits into
       the caller's buffer; termination is handled by u_terminateChars below. */
    if (U_SUCCESS(*err) && result != NULL && name == localeBuffer) {
        uprv_strncpy(result, localeBuffer, (len > resultCapacity) ? resultCapacity : len);
    }

    return u_terminateChars(result, resultCapacity, len, err);
}
1769 
1770 /* ### ID parsing API **************************************************/
1771 
1772 U_CAPI int32_t  U_EXPORT2
uloc_getParent(const char * localeID,char * parent,int32_t parentCapacity,UErrorCode * err)1773 uloc_getParent(const char*    localeID,
1774                char* parent,
1775                int32_t parentCapacity,
1776                UErrorCode* err)
1777 {
1778     const char *lastUnderscore;
1779     int32_t i;
1780 
1781     if (U_FAILURE(*err))
1782         return 0;
1783 
1784     if (localeID == NULL)
1785         localeID = uloc_getDefault();
1786 
1787     lastUnderscore=uprv_strrchr(localeID, '_');
1788     if(lastUnderscore!=NULL) {
1789         i=(int32_t)(lastUnderscore-localeID);
1790     } else {
1791         i=0;
1792     }
1793 
1794     if (i > 0) {
1795         if (uprv_strnicmp(localeID, "und_", 4) == 0) {
1796             localeID += 3;
1797             i -= 3;
1798             uprv_memmove(parent, localeID, uprv_min(i, parentCapacity));
1799         } else if (parent != localeID) {
1800             uprv_memcpy(parent, localeID, uprv_min(i, parentCapacity));
1801         }
1802     }
1803 
1804     return u_terminateChars(parent, parentCapacity, i, err);
1805 }
1806 
1807 U_CAPI int32_t U_EXPORT2
uloc_getLanguage(const char * localeID,char * language,int32_t languageCapacity,UErrorCode * err)1808 uloc_getLanguage(const char*    localeID,
1809          char* language,
1810          int32_t languageCapacity,
1811          UErrorCode* err)
1812 {
1813     /* uloc_getLanguage will return a 2 character iso-639 code if one exists. *CWB*/
1814     int32_t i=0;
1815 
1816     if (err==NULL || U_FAILURE(*err)) {
1817         return 0;
1818     }
1819 
1820     if(localeID==NULL) {
1821         localeID=uloc_getDefault();
1822     }
1823 
1824     i=ulocimp_getLanguage(localeID, language, languageCapacity, NULL);
1825     return u_terminateChars(language, languageCapacity, i, err);
1826 }
1827 
1828 U_CAPI int32_t U_EXPORT2
uloc_getScript(const char * localeID,char * script,int32_t scriptCapacity,UErrorCode * err)1829 uloc_getScript(const char*    localeID,
1830          char* script,
1831          int32_t scriptCapacity,
1832          UErrorCode* err)
1833 {
1834     int32_t i=0;
1835 
1836     if(err==NULL || U_FAILURE(*err)) {
1837         return 0;
1838     }
1839 
1840     if(localeID==NULL) {
1841         localeID=uloc_getDefault();
1842     }
1843 
1844     /* skip the language */
1845     ulocimp_getLanguage(localeID, NULL, 0, &localeID);
1846     if(_isIDSeparator(*localeID)) {
1847         i=ulocimp_getScript(localeID+1, script, scriptCapacity, NULL);
1848     }
1849     return u_terminateChars(script, scriptCapacity, i, err);
1850 }
1851 
1852 U_CAPI int32_t  U_EXPORT2
uloc_getCountry(const char * localeID,char * country,int32_t countryCapacity,UErrorCode * err)1853 uloc_getCountry(const char* localeID,
1854             char* country,
1855             int32_t countryCapacity,
1856             UErrorCode* err)
1857 {
1858     int32_t i=0;
1859 
1860     if(err==NULL || U_FAILURE(*err)) {
1861         return 0;
1862     }
1863 
1864     if(localeID==NULL) {
1865         localeID=uloc_getDefault();
1866     }
1867 
1868     /* Skip the language */
1869     ulocimp_getLanguage(localeID, NULL, 0, &localeID);
1870     if(_isIDSeparator(*localeID)) {
1871         const char *scriptID;
1872         /* Skip the script if available */
1873         ulocimp_getScript(localeID+1, NULL, 0, &scriptID);
1874         if(scriptID != localeID+1) {
1875             /* Found optional script */
1876             localeID = scriptID;
1877         }
1878         if(_isIDSeparator(*localeID)) {
1879             i=ulocimp_getCountry(localeID+1, country, countryCapacity, NULL);
1880         }
1881     }
1882     return u_terminateChars(country, countryCapacity, i, err);
1883 }
1884 
1885 U_CAPI int32_t  U_EXPORT2
uloc_getVariant(const char * localeID,char * variant,int32_t variantCapacity,UErrorCode * err)1886 uloc_getVariant(const char* localeID,
1887                 char* variant,
1888                 int32_t variantCapacity,
1889                 UErrorCode* err)
1890 {
1891     char tempBuffer[ULOC_FULLNAME_CAPACITY];
1892     const char* tmpLocaleID;
1893     int32_t i=0;
1894 
1895     if(err==NULL || U_FAILURE(*err)) {
1896         return 0;
1897     }
1898 
1899     if (_hasBCP47Extension(localeID)) {
1900         _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
1901     } else {
1902         if (localeID==NULL) {
1903            localeID=uloc_getDefault();
1904         }
1905         tmpLocaleID=localeID;
1906     }
1907 
1908     /* Skip the language */
1909     ulocimp_getLanguage(tmpLocaleID, NULL, 0, &tmpLocaleID);
1910     if(_isIDSeparator(*tmpLocaleID)) {
1911         const char *scriptID;
1912         /* Skip the script if available */
1913         ulocimp_getScript(tmpLocaleID+1, NULL, 0, &scriptID);
1914         if(scriptID != tmpLocaleID+1) {
1915             /* Found optional script */
1916             tmpLocaleID = scriptID;
1917         }
1918         /* Skip the Country */
1919         if (_isIDSeparator(*tmpLocaleID)) {
1920             const char *cntryID;
1921             ulocimp_getCountry(tmpLocaleID+1, NULL, 0, &cntryID);
1922             if (cntryID != tmpLocaleID+1) {
1923                 /* Found optional country */
1924                 tmpLocaleID = cntryID;
1925             }
1926             if(_isIDSeparator(*tmpLocaleID)) {
1927                 /* If there was no country ID, skip a possible extra IDSeparator */
1928                 if (tmpLocaleID != cntryID && _isIDSeparator(tmpLocaleID[1])) {
1929                     tmpLocaleID++;
1930                 }
1931                 i=_getVariant(tmpLocaleID+1, *tmpLocaleID, variant, variantCapacity);
1932             }
1933         }
1934     }
1935 
1936     /* removed by weiv. We don't want to handle POSIX variants anymore. Use canonicalization function */
1937     /* if we do not have a variant tag yet then try a POSIX variant after '@' */
1938 /*
1939     if(!haveVariant && (localeID=uprv_strrchr(localeID, '@'))!=NULL) {
1940         i=_getVariant(localeID+1, '@', variant, variantCapacity);
1941     }
1942 */
1943     return u_terminateChars(variant, variantCapacity, i, err);
1944 }
1945 
/* Level 1 canonicalization with keywords kept: forwards to _canonicalize()
   with options=0 and returns its result. */
U_CAPI int32_t  U_EXPORT2
uloc_getName(const char* localeID,
             char* name,
             int32_t nameCapacity,
             UErrorCode* err)
{
    return _canonicalize(localeID, name, nameCapacity, 0, err);
}
1954 
/* Like uloc_getName() but without keywords: forwards to _canonicalize()
   with options=_ULOC_STRIP_KEYWORDS and returns its result. */
U_CAPI int32_t  U_EXPORT2
uloc_getBaseName(const char* localeID,
                 char* name,
                 int32_t nameCapacity,
                 UErrorCode* err)
{
    return _canonicalize(localeID, name, nameCapacity, _ULOC_STRIP_KEYWORDS, err);
}
1963 
/* Level 2 canonicalization: forwards to _canonicalize() with
   options=_ULOC_CANONICALIZE and returns its result. */
U_CAPI int32_t  U_EXPORT2
uloc_canonicalize(const char* localeID,
                  char* name,
                  int32_t nameCapacity,
                  UErrorCode* err)
{
    return _canonicalize(localeID, name, nameCapacity, _ULOC_CANONICALIZE, err);
}
1972 
1973 U_CAPI const char*  U_EXPORT2
uloc_getISO3Language(const char * localeID)1974 uloc_getISO3Language(const char* localeID)
1975 {
1976     int16_t offset;
1977     char lang[ULOC_LANG_CAPACITY];
1978     UErrorCode err = U_ZERO_ERROR;
1979 
1980     if (localeID == NULL)
1981     {
1982         localeID = uloc_getDefault();
1983     }
1984     uloc_getLanguage(localeID, lang, ULOC_LANG_CAPACITY, &err);
1985     if (U_FAILURE(err))
1986         return "";
1987     offset = _findIndex(LANGUAGES, lang);
1988     if (offset < 0)
1989         return "";
1990     return LANGUAGES_3[offset];
1991 }
1992 
1993 U_CAPI const char*  U_EXPORT2
uloc_getISO3Country(const char * localeID)1994 uloc_getISO3Country(const char* localeID)
1995 {
1996     int16_t offset;
1997     char cntry[ULOC_LANG_CAPACITY];
1998     UErrorCode err = U_ZERO_ERROR;
1999 
2000     if (localeID == NULL)
2001     {
2002         localeID = uloc_getDefault();
2003     }
2004     uloc_getCountry(localeID, cntry, ULOC_LANG_CAPACITY, &err);
2005     if (U_FAILURE(err))
2006         return "";
2007     offset = _findIndex(COUNTRIES, cntry);
2008     if (offset < 0)
2009         return "";
2010 
2011     return COUNTRIES_3[offset];
2012 }
2013 
2014 U_CAPI uint32_t  U_EXPORT2
uloc_getLCID(const char * localeID)2015 uloc_getLCID(const char* localeID)
2016 {
2017     UErrorCode status = U_ZERO_ERROR;
2018     char       langID[ULOC_FULLNAME_CAPACITY];
2019     uint32_t   lcid = 0;
2020 
2021     /* Check for incomplete id. */
2022     if (!localeID || uprv_strlen(localeID) < 2) {
2023         return 0;
2024     }
2025 
2026     // First, attempt Windows platform lookup if available, but fall
2027     // through to catch any special cases (ICU vs Windows name differences).
2028     lcid = uprv_convertToLCIDPlatform(localeID, &status);
2029     if (U_FAILURE(status)) {
2030         return 0;
2031     }
2032     if (lcid > 0) {
2033         // Windows found an LCID, return that
2034         return lcid;
2035     }
2036 
2037     uloc_getLanguage(localeID, langID, sizeof(langID), &status);
2038     if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING) {
2039         return 0;
2040     }
2041 
2042     if (uprv_strchr(localeID, '@')) {
2043         // uprv_convertToLCID does not support keywords other than collation.
2044         // Remove all keywords except collation.
2045         int32_t len;
2046         char collVal[ULOC_KEYWORDS_CAPACITY];
2047         char tmpLocaleID[ULOC_FULLNAME_CAPACITY];
2048 
2049         len = uloc_getKeywordValue(localeID, "collation", collVal,
2050             UPRV_LENGTHOF(collVal) - 1, &status);
2051 
2052         if (U_SUCCESS(status) && len > 0) {
2053             collVal[len] = 0;
2054 
2055             len = uloc_getBaseName(localeID, tmpLocaleID,
2056                 UPRV_LENGTHOF(tmpLocaleID) - 1, &status);
2057 
2058             if (U_SUCCESS(status) && len > 0) {
2059                 tmpLocaleID[len] = 0;
2060 
2061                 len = uloc_setKeywordValue("collation", collVal, tmpLocaleID,
2062                     UPRV_LENGTHOF(tmpLocaleID) - len - 1, &status);
2063 
2064                 if (U_SUCCESS(status) && len > 0) {
2065                     tmpLocaleID[len] = 0;
2066                     return uprv_convertToLCID(langID, tmpLocaleID, &status);
2067                 }
2068             }
2069         }
2070 
2071         // fall through - all keywords are simply ignored
2072         status = U_ZERO_ERROR;
2073     }
2074 
2075     return uprv_convertToLCID(langID, localeID, &status);
2076 }
2077 
/* Thin wrapper: forwards all arguments to uprv_convertToPosix() and returns
   its result unchanged. */
U_CAPI int32_t U_EXPORT2
uloc_getLocaleForLCID(uint32_t hostid, char *locale, int32_t localeCapacity,
                UErrorCode *status)
{
    return uprv_convertToPosix(hostid, locale, localeCapacity, status);
}
2084 
2085 /* ### Default locale **************************************************/
2086 
/* Returns the default locale ID as reported by locale_get_default()
   (declared above as coming from locid.cpp). */
U_CAPI const char*  U_EXPORT2
uloc_getDefault()
{
    return locale_get_default();
}
2092 
2093 U_CAPI void  U_EXPORT2
uloc_setDefault(const char * newDefaultLocale,UErrorCode * err)2094 uloc_setDefault(const char*   newDefaultLocale,
2095              UErrorCode* err)
2096 {
2097     if (U_FAILURE(*err))
2098         return;
2099     /* the error code isn't currently used for anything by this function*/
2100 
2101     /* propagate change to C++ */
2102     locale_set_default(newDefaultLocale);
2103 }
2104 
/**
 * Returns a list of all 2-letter language codes defined in ISO 639.  This is a pointer
 * to an array of pointers to arrays of char.  All of these pointers are owned
 * by ICU-- do not delete them, and do not write through them.  The array is
 * terminated with a null pointer.
 *
 * The returned pointer is the file-level LANGUAGES table itself.
 */
U_CAPI const char* const*  U_EXPORT2
uloc_getISOLanguages()
{
    return LANGUAGES;
}
2116 
/**
 * Returns a list of all 2-letter country codes defined in ISO 3166.  This is a
 * pointer to an array of pointers to arrays of char.  All of these pointers are
 * owned by ICU-- do not delete them, and do not write through them.  The array is
 * terminated with a null pointer.
 *
 * The returned pointer is the file-level COUNTRIES table itself.
 */
U_CAPI const char* const*  U_EXPORT2
uloc_getISOCountries()
{
    return COUNTRIES;
}
2128 
2129 
2130 /* this function to be moved into cstring.c later */
2131 static char gDecimal = 0;
2132 
2133 static /* U_CAPI */
2134 double
2135 /* U_EXPORT2 */
_uloc_strtod(const char * start,char ** end)2136 _uloc_strtod(const char *start, char **end) {
2137     char *decimal;
2138     char *myEnd;
2139     char buf[30];
2140     double rv;
2141     if (!gDecimal) {
2142         char rep[5];
2143         /* For machines that decide to change the decimal on you,
2144         and try to be too smart with localization.
2145         This normally should be just a '.'. */
2146         sprintf(rep, "%+1.1f", 1.0);
2147         gDecimal = rep[2];
2148     }
2149 
2150     if(gDecimal == '.') {
2151         return uprv_strtod(start, end); /* fall through to OS */
2152     } else {
2153         uprv_strncpy(buf, start, 29);
2154         buf[29]=0;
2155         decimal = uprv_strchr(buf, '.');
2156         if(decimal) {
2157             *decimal = gDecimal;
2158         } else {
2159             return uprv_strtod(start, end); /* no decimal point */
2160         }
2161         rv = uprv_strtod(buf, &myEnd);
2162         if(end) {
2163             *end = (char*)(start+(myEnd-buf)); /* cast away const (to follow uprv_strtod API.) */
2164         }
2165         return rv;
2166     }
2167 }
2168 
/* One parsed entry of an HTTP Accept-Language list, as collected and sorted
   by uloc_acceptLanguageFromHTTP(). */
typedef struct {
    float q;        /* quality value; entries without a parameter get 1.0 */
    int32_t dummy;  /* to avoid uninitialized memory copy from qsort */
    char locale[ULOC_FULLNAME_CAPACITY+1]; /* NUL-terminated locale ID of the entry */
} _acceptLangItem;
2174 
2175 static int32_t U_CALLCONV
uloc_acceptLanguageCompare(const void *,const void * a,const void * b)2176 uloc_acceptLanguageCompare(const void * /*context*/, const void *a, const void *b)
2177 {
2178     const _acceptLangItem *aa = (const _acceptLangItem*)a;
2179     const _acceptLangItem *bb = (const _acceptLangItem*)b;
2180 
2181     int32_t rc = 0;
2182     if(bb->q < aa->q) {
2183         rc = -1;  /* A > B */
2184     } else if(bb->q > aa->q) {
2185         rc = 1;   /* A < B */
2186     } else {
2187         rc = 0;   /* A = B */
2188     }
2189 
2190     if(rc==0) {
2191         rc = uprv_stricmp(aa->locale, bb->locale);
2192     }
2193 
2194 #if defined(ULOC_DEBUG)
2195     /*  fprintf(stderr, "a:[%s:%g], b:[%s:%g] -> %d\n",
2196     aa->locale, aa->q,
2197     bb->locale, bb->q,
2198     rc);*/
2199 #endif
2200 
2201     return rc;
2202 }
2203 
2204 /*
2205 mt-mt, ja;q=0.76, en-us;q=0.95, en;q=0.92, en-gb;q=0.89, fr;q=0.87, iu-ca;q=0.84, iu;q=0.82, ja-jp;q=0.79, mt;q=0.97, de-de;q=0.74, de;q=0.71, es;q=0.68, it-it;q=0.66, it;q=0.63, vi-vn;q=0.61, vi;q=0.58, nl-nl;q=0.55, nl;q=0.53
2206 */
2207 
2208 U_CAPI int32_t U_EXPORT2
uloc_acceptLanguageFromHTTP(char * result,int32_t resultAvailable,UAcceptResult * outResult,const char * httpAcceptLanguage,UEnumeration * availableLocales,UErrorCode * status)2209 uloc_acceptLanguageFromHTTP(char *result, int32_t resultAvailable, UAcceptResult *outResult,
2210                             const char *httpAcceptLanguage,
2211                             UEnumeration* availableLocales,
2212                             UErrorCode *status)
2213 {
2214   MaybeStackArray<_acceptLangItem, 4> items; // Struct for collecting items.
2215     char tmp[ULOC_FULLNAME_CAPACITY +1];
2216     int32_t n = 0;
2217     const char *itemEnd;
2218     const char *paramEnd;
2219     const char *s;
2220     const char *t;
2221     int32_t res;
2222     int32_t i;
2223     int32_t l = (int32_t)uprv_strlen(httpAcceptLanguage);
2224 
2225     if(U_FAILURE(*status)) {
2226         return -1;
2227     }
2228 
2229     for(s=httpAcceptLanguage;s&&*s;) {
2230         while(isspace(*s)) /* eat space at the beginning */
2231             s++;
2232         itemEnd=uprv_strchr(s,',');
2233         paramEnd=uprv_strchr(s,';');
2234         if(!itemEnd) {
2235             itemEnd = httpAcceptLanguage+l; /* end of string */
2236         }
2237         if(paramEnd && paramEnd<itemEnd) {
2238             /* semicolon (;) is closer than end (,) */
2239             t = paramEnd+1;
2240             if(*t=='q') {
2241                 t++;
2242             }
2243             while(isspace(*t)) {
2244                 t++;
2245             }
2246             if(*t=='=') {
2247                 t++;
2248             }
2249             while(isspace(*t)) {
2250                 t++;
2251             }
2252             items[n].q = (float)_uloc_strtod(t,NULL);
2253         } else {
2254             /* no semicolon - it's 1.0 */
2255             items[n].q = 1.0f;
2256             paramEnd = itemEnd;
2257         }
2258         items[n].dummy=0;
2259         /* eat spaces prior to semi */
2260         for(t=(paramEnd-1);(paramEnd>s)&&isspace(*t);t--)
2261             ;
2262         int32_t slen = static_cast<int32_t>(((t+1)-s));
2263         if(slen > ULOC_FULLNAME_CAPACITY) {
2264           *status = U_BUFFER_OVERFLOW_ERROR;
2265           return -1; // too big
2266         }
2267         uprv_strncpy(items[n].locale, s, slen);
2268         items[n].locale[slen]=0; // terminate
2269         int32_t clen = uloc_canonicalize(items[n].locale, tmp, UPRV_LENGTHOF(tmp)-1, status);
2270         if(U_FAILURE(*status)) return -1;
2271         if((clen!=slen) || (uprv_strncmp(items[n].locale, tmp, slen))) {
2272             // canonicalization had an effect- copy back
2273             uprv_strncpy(items[n].locale, tmp, clen);
2274             items[n].locale[clen] = 0; // terminate
2275         }
2276 #if defined(ULOC_DEBUG)
2277         /*fprintf(stderr,"%d: s <%s> q <%g>\n", n, j[n].locale, j[n].q);*/
2278 #endif
2279         n++;
2280         s = itemEnd;
2281         while(*s==',') { /* eat duplicate commas */
2282             s++;
2283         }
2284         if(n>=items.getCapacity()) { // If we need more items
2285           if(NULL == items.resize(items.getCapacity()*2, items.getCapacity())) {
2286               *status = U_MEMORY_ALLOCATION_ERROR;
2287               return -1;
2288           }
2289 #if defined(ULOC_DEBUG)
2290           fprintf(stderr,"malloced at size %d\n", items.getCapacity());
2291 #endif
2292         }
2293     }
2294     uprv_sortArray(items.getAlias(), n, sizeof(items[0]), uloc_acceptLanguageCompare, NULL, TRUE, status);
2295     if (U_FAILURE(*status)) {
2296         return -1;
2297     }
2298     LocalMemory<const char*> strs(NULL);
2299     if (strs.allocateInsteadAndReset(n) == NULL) {
2300         *status = U_MEMORY_ALLOCATION_ERROR;
2301         return -1;
2302     }
2303     for(i=0;i<n;i++) {
2304 #if defined(ULOC_DEBUG)
2305         /*fprintf(stderr,"%d: s <%s> q <%g>\n", i, j[i].locale, j[i].q);*/
2306 #endif
2307         strs[i]=items[i].locale;
2308     }
2309     res =  uloc_acceptLanguage(result, resultAvailable, outResult,
2310                                strs.getAlias(), n, availableLocales, status);
2311     return res;
2312 }
2313 
2314 
2315 U_CAPI int32_t U_EXPORT2
uloc_acceptLanguage(char * result,int32_t resultAvailable,UAcceptResult * outResult,const char ** acceptList,int32_t acceptListCount,UEnumeration * availableLocales,UErrorCode * status)2316 uloc_acceptLanguage(char *result, int32_t resultAvailable,
2317                     UAcceptResult *outResult, const char **acceptList,
2318                     int32_t acceptListCount,
2319                     UEnumeration* availableLocales,
2320                     UErrorCode *status)
2321 {
2322     int32_t i,j;
2323     int32_t len;
2324     int32_t maxLen=0;
2325     char tmp[ULOC_FULLNAME_CAPACITY+1];
2326     const char *l;
2327     char **fallbackList;
2328     if(U_FAILURE(*status)) {
2329         return -1;
2330     }
2331     fallbackList = static_cast<char **>(uprv_malloc((size_t)(sizeof(fallbackList[0])*acceptListCount)));
2332     if(fallbackList==NULL) {
2333         *status = U_MEMORY_ALLOCATION_ERROR;
2334         return -1;
2335     }
2336     for(i=0;i<acceptListCount;i++) {
2337 #if defined(ULOC_DEBUG)
2338         fprintf(stderr,"%02d: %s\n", i, acceptList[i]);
2339 #endif
2340         while((l=uenum_next(availableLocales, NULL, status)) != NULL) {
2341 #if defined(ULOC_DEBUG)
2342             fprintf(stderr,"  %s\n", l);
2343 #endif
2344             len = (int32_t)uprv_strlen(l);
2345             if(!uprv_strcmp(acceptList[i], l)) {
2346                 if(outResult) {
2347                     *outResult = ULOC_ACCEPT_VALID;
2348                 }
2349 #if defined(ULOC_DEBUG)
2350                 fprintf(stderr, "MATCH! %s\n", l);
2351 #endif
2352                 if(len>0) {
2353                     uprv_strncpy(result, l, uprv_min(len, resultAvailable));
2354                 }
2355                 for(j=0;j<i;j++) {
2356                     uprv_free(fallbackList[j]);
2357                 }
2358                 uprv_free(fallbackList);
2359                 return u_terminateChars(result, resultAvailable, len, status);
2360             }
2361             if(len>maxLen) {
2362                 maxLen = len;
2363             }
2364         }
2365         uenum_reset(availableLocales, status);
2366         /* save off parent info */
2367         if(uloc_getParent(acceptList[i], tmp, UPRV_LENGTHOF(tmp), status)!=0) {
2368             fallbackList[i] = uprv_strdup(tmp);
2369         } else {
2370             fallbackList[i]=0;
2371         }
2372     }
2373 
2374     for(maxLen--;maxLen>0;maxLen--) {
2375         for(i=0;i<acceptListCount;i++) {
2376             if(fallbackList[i] && ((int32_t)uprv_strlen(fallbackList[i])==maxLen)) {
2377 #if defined(ULOC_DEBUG)
2378                 fprintf(stderr,"Try: [%s]", fallbackList[i]);
2379 #endif
2380                 while((l=uenum_next(availableLocales, NULL, status)) != NULL) {
2381 #if defined(ULOC_DEBUG)
2382                     fprintf(stderr,"  %s\n", l);
2383 #endif
2384                     len = (int32_t)uprv_strlen(l);
2385                     if(!uprv_strcmp(fallbackList[i], l)) {
2386                         if(outResult) {
2387                             *outResult = ULOC_ACCEPT_FALLBACK;
2388                         }
2389 #if defined(ULOC_DEBUG)
2390                         fprintf(stderr, "fallback MATCH! %s\n", l);
2391 #endif
2392                         if(len>0) {
2393                             uprv_strncpy(result, l, uprv_min(len, resultAvailable));
2394                         }
2395                         for(j=0;j<acceptListCount;j++) {
2396                             uprv_free(fallbackList[j]);
2397                         }
2398                         uprv_free(fallbackList);
2399                         return u_terminateChars(result, resultAvailable, len, status);
2400                     }
2401                 }
2402                 uenum_reset(availableLocales, status);
2403 
2404                 if(uloc_getParent(fallbackList[i], tmp, UPRV_LENGTHOF(tmp), status)!=0) {
2405                     uprv_free(fallbackList[i]);
2406                     fallbackList[i] = uprv_strdup(tmp);
2407                 } else {
2408                     uprv_free(fallbackList[i]);
2409                     fallbackList[i]=0;
2410                 }
2411             }
2412         }
2413         if(outResult) {
2414             *outResult = ULOC_ACCEPT_FAILED;
2415         }
2416     }
2417     for(i=0;i<acceptListCount;i++) {
2418         uprv_free(fallbackList[i]);
2419     }
2420     uprv_free(fallbackList);
2421     return -1;
2422 }
2423 
2424 U_CAPI const char* U_EXPORT2
uloc_toUnicodeLocaleKey(const char * keyword)2425 uloc_toUnicodeLocaleKey(const char* keyword)
2426 {
2427     const char* bcpKey = ulocimp_toBcpKey(keyword);
2428     if (bcpKey == NULL && ultag_isUnicodeLocaleKey(keyword, -1)) {
2429         // unknown keyword, but syntax is fine..
2430         return keyword;
2431     }
2432     return bcpKey;
2433 }
2434 
2435 U_CAPI const char* U_EXPORT2
uloc_toUnicodeLocaleType(const char * keyword,const char * value)2436 uloc_toUnicodeLocaleType(const char* keyword, const char* value)
2437 {
2438     const char* bcpType = ulocimp_toBcpType(keyword, value, NULL, NULL);
2439     if (bcpType == NULL && ultag_isUnicodeLocaleType(value, -1)) {
2440         // unknown keyword, but syntax is fine..
2441         return value;
2442     }
2443     return bcpType;
2444 }
2445 
2446 static UBool
isWellFormedLegacyKey(const char * legacyKey)2447 isWellFormedLegacyKey(const char* legacyKey)
2448 {
2449     const char* p = legacyKey;
2450     while (*p) {
2451         if (!UPRV_ISALPHANUM(*p)) {
2452             return FALSE;
2453         }
2454         p++;
2455     }
2456     return TRUE;
2457 }
2458 
2459 static UBool
isWellFormedLegacyType(const char * legacyType)2460 isWellFormedLegacyType(const char* legacyType)
2461 {
2462     const char* p = legacyType;
2463     int32_t alphaNumLen = 0;
2464     while (*p) {
2465         if (*p == '_' || *p == '/' || *p == '-') {
2466             if (alphaNumLen == 0) {
2467                 return FALSE;
2468             }
2469             alphaNumLen = 0;
2470         } else if (UPRV_ISALPHANUM(*p)) {
2471             alphaNumLen++;
2472         } else {
2473             return FALSE;
2474         }
2475         p++;
2476     }
2477     return (alphaNumLen != 0);
2478 }
2479 
2480 U_CAPI const char* U_EXPORT2
uloc_toLegacyKey(const char * keyword)2481 uloc_toLegacyKey(const char* keyword)
2482 {
2483     const char* legacyKey = ulocimp_toLegacyKey(keyword);
2484     if (legacyKey == NULL) {
2485         // Checks if the specified locale key is well-formed with the legacy locale syntax.
2486         //
2487         // Note:
2488         //  LDML/CLDR provides some definition of keyword syntax in
2489         //  * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
2490         //  * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
2491         //  Keys can only consist of [0-9a-zA-Z].
2492         if (isWellFormedLegacyKey(keyword)) {
2493             return keyword;
2494         }
2495     }
2496     return legacyKey;
2497 }
2498 
2499 U_CAPI const char* U_EXPORT2
uloc_toLegacyType(const char * keyword,const char * value)2500 uloc_toLegacyType(const char* keyword, const char* value)
2501 {
2502     const char* legacyType = ulocimp_toLegacyType(keyword, value, NULL, NULL);
2503     if (legacyType == NULL) {
2504         // Checks if the specified locale type is well-formed with the legacy locale syntax.
2505         //
2506         // Note:
2507         //  LDML/CLDR provides some definition of keyword syntax in
2508         //  * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
2509         //  * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
2510         //  Values (types) can only consist of [0-9a-zA-Z], plus for legacy values
2511         //  we allow [/_-+] in the middle (e.g. "Etc/GMT+1", "Asia/Tel_Aviv")
2512         if (isWellFormedLegacyType(value)) {
2513             return value;
2514         }
2515     }
2516     return legacyType;
2517 }
2518 
2519 /*eof*/
2520