1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (C) 2009-2015, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 */
9 
10 #include "unicode/bytestream.h"
11 #include "unicode/utypes.h"
12 #include "unicode/ures.h"
13 #include "unicode/localpointer.h"
14 #include "unicode/putil.h"
15 #include "unicode/uenum.h"
16 #include "unicode/uloc.h"
17 #include "ustr_imp.h"
18 #include "bytesinkutil.h"
19 #include "charstr.h"
20 #include "cmemory.h"
21 #include "cstring.h"
22 #include "putilimp.h"
23 #include "uinvchar.h"
24 #include "ulocimp.h"
25 #include "uassert.h"
26 
27 
/*
 * One node of a singly linked list of variant subtags.
 * `variant` points at the subtag text; `next` is the following node,
 * or NULL at the end of the list.
 */
struct VariantListEntry {
    const char       *variant;
    VariantListEntry *next;
};
33 
34 /* struct holding a single attribute value */
35 struct AttributeListEntry : public icu::UMemory {
36     const char              *attribute;
37     struct AttributeListEntry *next;
38 };
39 
40 /* struct holding a single extension */
41 struct ExtensionListEntry : public icu::UMemory {
42     const char                  *key;
43     const char                  *value;
44     struct ExtensionListEntry   *next;
45 };
46 
47 #define MAXEXTLANG 3
48 typedef struct ULanguageTag {
49     char                *buf;   /* holding parsed subtags */
50     const char          *language;
51     const char          *extlang[MAXEXTLANG];
52     const char          *script;
53     const char          *region;
54     VariantListEntry    *variants;
55     ExtensionListEntry  *extensions;
56     const char          *privateuse;
57     const char          *legacy;
58 } ULanguageTag;
59 
/* --- Language tag (BCP 47 side) syntax characters --- */
#define MINLEN 2
#define SEP '-'             /* subtag separator */
#define PRIVATEUSE 'x'      /* private use singleton */
#define LDMLEXT 'u'         /* singleton used for LDML extensions */

/* --- Locale ID (ICU side) syntax characters --- */
#define LOCALE_SEP '_'
#define LOCALE_EXT_SEP '@'
#define LOCALE_KEYWORD_SEP ';'
#define LOCALE_KEY_TYPE_SEP '='

/* --- Character classification; note that ISNUMERIC evaluates its argument twice --- */
#define ISALPHA(c) uprv_isASCIILetter(c)
#define ISNUMERIC(c) ((c)>='0' && (c)<='9')

/* --- Fixed strings used throughout this file --- */
static const char EMPTY[] = "";
static const char LANG_UND[] = "und";
static const char PRIVATEUSE_KEY[] = "x";
static const char _POSIX[] = "_POSIX";
static const char POSIX_KEY[] = "va";
static const char POSIX_VALUE[] = "posix";
static const char LOCALE_ATTRIBUTE_KEY[] = "attribute";
static const char PRIVUSE_VARIANT_PREFIX[] = "lvariant";
static const char LOCALE_TYPE_YES[] = "yes";

/* Length of LANG_UND, not counting the terminating NUL. */
#define LANG_UND_LEN 3
84 
85 /*
86  Updated on 2018-09-12 from
87  https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry .
88 
89  This table has 2 parts. The part for
90  legacy language tags (marked as “Type: grandfathered” in BCP 47)
91  is generated by the following scripts from the IANA language tag registry.
92 
93  curl  https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry |\
94  egrep -A 7 'Type: grandfathered' | \
95  egrep 'Tag|Prefe' | grep -B1 'Preferred' | grep -v '^--' | \
96  awk -n '/Tag/ {printf("    \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}' |\
97  tr 'A-Z' 'a-z'
98 
99 
 The 2nd part is made of four ICU-specific entries. They're kept for
 backward compatibility for now, even though there are no preferred
 values. They may have to be removed for strict BCP 47 compliance.
103 
104 */
/*
 * Flat list of string pairs: even index = legacy tag (lower case),
 * odd index = its replacement.
 */
static const char* const LEGACY[] = {
    /* legacy tag, preferred value */
    "art-lojban", "jbo",
    "en-gb-oed", "en-gb-oxendict",
    "i-ami", "ami",
    "i-bnn", "bnn",
    "i-hak", "hak",
    "i-klingon", "tlh",
    "i-lux", "lb",
    "i-navajo", "nv",
    "i-pwn", "pwn",
    "i-tao", "tao",
    "i-tay", "tay",
    "i-tsu", "tsu",
    "no-bok", "nb",
    "no-nyn", "nn",
    "sgn-be-fr", "sfb",
    "sgn-be-nl", "vgt",
    "sgn-ch-de", "sgg",
    "zh-guoyu", "cmn",
    "zh-hakka", "hak",
    "zh-min-nan", "nan",
    "zh-xiang", "hsn",

    /*
     * Legacy tags with no preferred value in the IANA registry.
     * Kept for now for backward compatibility because ICU has
     * mapped them this way.
     */
    "i-default", "en-x-i-default",
    "i-enochian", "und-x-i-enochian",
    "i-mingo", "see-x-i-mingo",
    "zh-min", "nan-x-zh-min",
};
137 
138 /*
139  Updated on 2018-09-12 from
140  https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry .
141 
 The table lists redundant tags with a preferred value in the IANA language tag registry.
143  It's generated with the following command:
144 
145  curl  https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry |\
146  grep 'Type: redundant' -A 5 | egrep '^(Tag:|Prefer)' | grep -B1 'Preferred' | \
147  awk -n '/Tag/ {printf("    \"%s\",       ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}' | \
148  tr 'A-Z' 'a-z'
149 
 In addition, ja-latn-hepburn-heploc is mapped to ja-latn-alalc97 because
 the variant 'hepburn-heploc' has the preferred value 'alalc97'.
152 */
153 
/*
 * Flat list of string pairs: even index = redundant tag (lower case),
 * odd index = its preferred replacement.
 */
static const char* const REDUNDANT[] = {
    /* redundant tag, preferred value */
    "sgn-br", "bzs",
    "sgn-co", "csn",
    "sgn-de", "gsg",
    "sgn-dk", "dsl",
    "sgn-es", "ssp",
    "sgn-fr", "fsl",
    "sgn-gb", "bfi",
    "sgn-gr", "gss",
    "sgn-ie", "isg",
    "sgn-it", "ise",
    "sgn-jp", "jsl",
    "sgn-mx", "mfs",
    "sgn-ni", "ncs",
    "sgn-nl", "dse",
    "sgn-no", "nsl",
    "sgn-pt", "psr",
    "sgn-se", "swl",
    "sgn-us", "ase",
    "sgn-za", "sfs",
    "zh-cmn", "cmn",
    "zh-cmn-hans", "cmn-hans",
    "zh-cmn-hant", "cmn-hant",
    "zh-gan", "gan",
    "zh-wuu", "wuu",
    "zh-yue", "yue",

    /* variant tag with preferred value */
    "ja-latn-hepburn-heploc", "ja-latn-alalc97",
};
185 
186 /*
187   Updated on 2018-09-12 from
188   https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry .
189 
190   grep 'Type: language' -A 7 language-subtag-registry  | egrep 'Subtag|Prefe' | \
191   grep -B1 'Preferred' | grep -v '^--' | \
192   awk -n '/Subtag/ {printf("    \"%s\",       ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}'
193 
194   Make sure that 2-letter language subtags come before 3-letter subtags.
195 */
/*
 * Flat list of pairs: even row = deprecated language subtag,
 * odd row = its replacement. Each row holds at most 3 characters plus NUL.
 * The 2-letter subtags are listed before the 3-letter ones.
 */
static const char DEPRECATEDLANGS[][4] = {
    /* deprecated, new */
    "in", "id",
    "iw", "he",
    "ji", "yi",
    "jw", "jv",
    "mo", "ro",
    "aam", "aas",
    "adp", "dz",
    "aue", "ktz",
    "ayx", "nun",
    "bgm", "bcg",
    "bjd", "drl",
    "ccq", "rki",
    "cjr", "mom",
    "cka", "cmr",
    "cmk", "xch",
    "coy", "pij",
    "cqu", "quh",
    "drh", "khk",
    "drw", "prs",
    "gav", "dev",
    "gfx", "vaj",
    "ggn", "gvr",
    "gti", "nyc",
    "guv", "duz",
    "hrr", "jal",
    "ibi", "opa",
    "ilw", "gal",
    "jeg", "oyb",
    "kgc", "tdf",
    "kgh", "kml",
    "koj", "kwv",
    "krm", "bmf",
    "ktr", "dtp",
    "kvs", "gdj",
    "kwq", "yam",
    "kxe", "tvd",
    "kzj", "dtp",
    "kzt", "dtp",
    "lii", "raq",
    "lmm", "rmx",
    "meg", "cir",
    "mst", "mry",
    "mwj", "vaj",
    "myt", "mry",
    "nad", "xny",
    "ncp", "kdz",
    "nnx", "ngv",
    "nts", "pij",
    "oun", "vaj",
    "pcr", "adx",
    "pmc", "huw",
    "pmu", "phr",
    "ppa", "bfy",
    "ppr", "lcq",
    "pry", "prt",
    "puz", "pub",
    "sca", "hle",
    "skk", "oyb",
    "tdu", "dtp",
    "thc", "tpo",
    "thx", "oyb",
    "tie", "ras",
    "tkk", "twm",
    "tlw", "weo",
    "tmp", "tyj",
    "tne", "kak",
    "tnf", "prs",
    "tsf", "taj",
    "uok", "ema",
    "xba", "cax",
    "xia", "acn",
    "xkh", "waw",
    "xsj", "suj",
    "ybd", "rki",
    "yma", "lrr",
    "ymt", "mtm",
    "yos", "zom",
    "yuu", "yug",
};
277 
278 /*
279   Updated on 2018-04-24 from
280 
281   curl  https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry | \
282   grep 'Type: region' -A 7 | egrep 'Subtag|Prefe' | \
283   grep -B1 'Preferred' | \
284   awk -n '/Subtag/ {printf("    \"%s\",       ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}'
285 */
/*
 * Flat list of pairs: even row = deprecated region subtag,
 * odd row = its replacement. Each row holds 2 characters plus NUL.
 */
static const char DEPRECATEDREGIONS[][3] = {
    /* deprecated, new */
    "BU", "MM",
    "DD", "DE",
    "FX", "FR",
    "TP", "TL",
    "YD", "YE",
    "ZR", "CD",
};
295 
296 /*
297 * -------------------------------------------------
298 *
299 * These ultag_ functions may be exposed as APIs later
300 *
301 * -------------------------------------------------
302 */
303 
304 static ULanguageTag*
305 ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* status);
306 
307 static void
308 ultag_close(ULanguageTag* langtag);
309 
310 static const char*
311 ultag_getLanguage(const ULanguageTag* langtag);
312 
313 #if 0
314 static const char*
315 ultag_getJDKLanguage(const ULanguageTag* langtag);
316 #endif
317 
318 static const char*
319 ultag_getExtlang(const ULanguageTag* langtag, int32_t idx);
320 
321 static int32_t
322 ultag_getExtlangSize(const ULanguageTag* langtag);
323 
324 static const char*
325 ultag_getScript(const ULanguageTag* langtag);
326 
327 static const char*
328 ultag_getRegion(const ULanguageTag* langtag);
329 
330 static const char*
331 ultag_getVariant(const ULanguageTag* langtag, int32_t idx);
332 
333 static int32_t
334 ultag_getVariantsSize(const ULanguageTag* langtag);
335 
336 static const char*
337 ultag_getExtensionKey(const ULanguageTag* langtag, int32_t idx);
338 
339 static const char*
340 ultag_getExtensionValue(const ULanguageTag* langtag, int32_t idx);
341 
342 static int32_t
343 ultag_getExtensionsSize(const ULanguageTag* langtag);
344 
345 static const char*
346 ultag_getPrivateUse(const ULanguageTag* langtag);
347 
348 #if 0
349 static const char*
350 ultag_getLegacy(const ULanguageTag* langtag);
351 #endif
352 
353 U_NAMESPACE_BEGIN
354 
355 /**
356  * \class LocalULanguageTagPointer
357  * "Smart pointer" class, closes a ULanguageTag via ultag_close().
358  * For most methods see the LocalPointerBase base class.
359  *
360  * @see LocalPointerBase
361  * @see LocalPointer
362  * @internal
363  */
364 U_DEFINE_LOCAL_OPEN_POINTER(LocalULanguageTagPointer, ULanguageTag, ultag_close);
365 
366 U_NAMESPACE_END
367 
368 /*
369 * -------------------------------------------------
370 *
371 * Language subtag syntax validation functions
372 *
373 * -------------------------------------------------
374 */
375 
376 static UBool
_isAlphaString(const char * s,int32_t len)377 _isAlphaString(const char* s, int32_t len) {
378     int32_t i;
379     for (i = 0; i < len; i++) {
380         if (!ISALPHA(*(s + i))) {
381             return FALSE;
382         }
383     }
384     return TRUE;
385 }
386 
387 static UBool
_isNumericString(const char * s,int32_t len)388 _isNumericString(const char* s, int32_t len) {
389     int32_t i;
390     for (i = 0; i < len; i++) {
391         if (!ISNUMERIC(*(s + i))) {
392             return FALSE;
393         }
394     }
395     return TRUE;
396 }
397 
398 static UBool
_isAlphaNumericString(const char * s,int32_t len)399 _isAlphaNumericString(const char* s, int32_t len) {
400     int32_t i;
401     for (i = 0; i < len; i++) {
402         if (!ISALPHA(*(s + i)) && !ISNUMERIC(*(s + i))) {
403             return FALSE;
404         }
405     }
406     return TRUE;
407 }
408 
409 static UBool
_isAlphaNumericStringLimitedLength(const char * s,int32_t len,int32_t min,int32_t max)410 _isAlphaNumericStringLimitedLength(const char* s, int32_t len, int32_t min, int32_t max) {
411     if (len < 0) {
412         len = (int32_t)uprv_strlen(s);
413     }
414     if (len >= min && len <= max && _isAlphaNumericString(s, len)) {
415         return TRUE;
416     }
417     return FALSE;
418 }
419 
420 U_CFUNC UBool
ultag_isLanguageSubtag(const char * s,int32_t len)421 ultag_isLanguageSubtag(const char* s, int32_t len) {
422     /*
423      * unicode_language_subtag = alpha{2,3} | alpha{5,8};
424      * NOTE: Per ICUTC 2019/01/23- accepting alpha 4
425      * See ICU-20372
426      */
427     if (len < 0) {
428         len = (int32_t)uprv_strlen(s);
429     }
430     if (len >= 2 && len <= 8 && _isAlphaString(s, len)) {
431         return TRUE;
432     }
433     return FALSE;
434 }
435 
436 static UBool
_isExtlangSubtag(const char * s,int32_t len)437 _isExtlangSubtag(const char* s, int32_t len) {
438     /*
439      * extlang       = 3ALPHA              ; selected ISO 639 codes
440      *                 *2("-" 3ALPHA)      ; permanently reserved
441      */
442     if (len < 0) {
443         len = (int32_t)uprv_strlen(s);
444     }
445     if (len == 3 && _isAlphaString(s, len)) {
446         return TRUE;
447     }
448     return FALSE;
449 }
450 
451 U_CFUNC UBool
ultag_isScriptSubtag(const char * s,int32_t len)452 ultag_isScriptSubtag(const char* s, int32_t len) {
453     /*
454      * script        = 4ALPHA              ; ISO 15924 code
455      */
456     if (len < 0) {
457         len = (int32_t)uprv_strlen(s);
458     }
459     if (len == 4 && _isAlphaString(s, len)) {
460         return TRUE;
461     }
462     return FALSE;
463 }
464 
465 U_CFUNC UBool
ultag_isRegionSubtag(const char * s,int32_t len)466 ultag_isRegionSubtag(const char* s, int32_t len) {
467     /*
468      * region        = 2ALPHA              ; ISO 3166-1 code
469      *               / 3DIGIT              ; UN M.49 code
470      */
471     if (len < 0) {
472         len = (int32_t)uprv_strlen(s);
473     }
474     if (len == 2 && _isAlphaString(s, len)) {
475         return TRUE;
476     }
477     if (len == 3 && _isNumericString(s, len)) {
478         return TRUE;
479     }
480     return FALSE;
481 }
482 
483 static UBool
_isVariantSubtag(const char * s,int32_t len)484 _isVariantSubtag(const char* s, int32_t len) {
485     /*
486      * variant       = 5*8alphanum         ; registered variants
487      *               / (DIGIT 3alphanum)
488      */
489     if (len < 0) {
490         len = (int32_t)uprv_strlen(s);
491     }
492     if (_isAlphaNumericStringLimitedLength(s, len, 5, 8)) {
493         return TRUE;
494     }
495     if (len == 4 && ISNUMERIC(*s) && _isAlphaNumericString(s + 1, 3)) {
496         return TRUE;
497     }
498     return FALSE;
499 }
500 
501 static UBool
_isSepListOf(UBool (* test)(const char *,int32_t),const char * s,int32_t len)502 _isSepListOf(UBool (*test)(const char*, int32_t), const char* s, int32_t len) {
503     const char *p = s;
504     const char *pSubtag = NULL;
505 
506     if (len < 0) {
507         len = (int32_t)uprv_strlen(s);
508     }
509 
510     while ((p - s) < len) {
511         if (*p == SEP) {
512             if (pSubtag == NULL) {
513                 return FALSE;
514             }
515             if (!test(pSubtag, (int32_t)(p - pSubtag))) {
516                 return FALSE;
517             }
518             pSubtag = NULL;
519         } else if (pSubtag == NULL) {
520             pSubtag = p;
521         }
522         p++;
523     }
524     if (pSubtag == NULL) {
525         return FALSE;
526     }
527     return test(pSubtag, (int32_t)(p - pSubtag));
528 }
529 
530 U_CFUNC UBool
ultag_isVariantSubtags(const char * s,int32_t len)531 ultag_isVariantSubtags(const char* s, int32_t len) {
532     return _isSepListOf(&_isVariantSubtag, s, len);
533 }
534 
535 // This is for the ICU-specific "lvariant" handling.
536 static UBool
_isPrivateuseVariantSubtag(const char * s,int32_t len)537 _isPrivateuseVariantSubtag(const char* s, int32_t len) {
538     /*
539      * variant       = 1*8alphanum         ; registered variants
540      *               / (DIGIT 3alphanum)
541      */
542     return _isAlphaNumericStringLimitedLength(s, len , 1, 8);
543 }
544 
545 static UBool
_isExtensionSingleton(const char * s,int32_t len)546 _isExtensionSingleton(const char* s, int32_t len) {
547     /*
548      * extension     = singleton 1*("-" (2*8alphanum))
549      *
550      * singleton     = DIGIT               ; 0 - 9
551      *               / %x41-57             ; A - W
552      *               / %x59-5A             ; Y - Z
553      *               / %x61-77             ; a - w
554      *               / %x79-7A             ; y - z
555      */
556     if (len < 0) {
557         len = (int32_t)uprv_strlen(s);
558     }
559     if (len == 1 && (ISALPHA(*s) || ISNUMERIC(*s)) && (uprv_tolower(*s) != PRIVATEUSE)) {
560         return TRUE;
561     }
562     return FALSE;
563 }
564 
565 static UBool
_isExtensionSubtag(const char * s,int32_t len)566 _isExtensionSubtag(const char* s, int32_t len) {
567     /*
568      * extension     = singleton 1*("-" (2*8alphanum))
569      */
570     return _isAlphaNumericStringLimitedLength(s, len, 2, 8);
571 }
572 
573 U_CFUNC UBool
ultag_isExtensionSubtags(const char * s,int32_t len)574 ultag_isExtensionSubtags(const char* s, int32_t len) {
575     return _isSepListOf(&_isExtensionSubtag, s, len);
576 }
577 
578 static UBool
_isPrivateuseValueSubtag(const char * s,int32_t len)579 _isPrivateuseValueSubtag(const char* s, int32_t len) {
580     /*
581      * privateuse    = "x" 1*("-" (1*8alphanum))
582      */
583     return _isAlphaNumericStringLimitedLength(s, len, 1, 8);
584 }
585 
586 U_CFUNC UBool
ultag_isPrivateuseValueSubtags(const char * s,int32_t len)587 ultag_isPrivateuseValueSubtags(const char* s, int32_t len) {
588     return _isSepListOf(&_isPrivateuseValueSubtag, s, len);
589 }
590 
591 U_CFUNC UBool
ultag_isUnicodeLocaleAttribute(const char * s,int32_t len)592 ultag_isUnicodeLocaleAttribute(const char* s, int32_t len) {
593     /*
594      * attribute = alphanum{3,8} ;
595      */
596     return _isAlphaNumericStringLimitedLength(s, len , 3, 8);
597 }
598 
599 U_CFUNC UBool
ultag_isUnicodeLocaleAttributes(const char * s,int32_t len)600 ultag_isUnicodeLocaleAttributes(const char* s, int32_t len) {
601     return _isSepListOf(&ultag_isUnicodeLocaleAttribute, s, len);
602 }
603 
604 U_CFUNC UBool
ultag_isUnicodeLocaleKey(const char * s,int32_t len)605 ultag_isUnicodeLocaleKey(const char* s, int32_t len) {
606     /*
607      * key = alphanum alpha ;
608      */
609     if (len < 0) {
610         len = (int32_t)uprv_strlen(s);
611     }
612     if (len == 2 && (ISALPHA(*s) || ISNUMERIC(*s)) && ISALPHA(s[1])) {
613         return TRUE;
614     }
615     return FALSE;
616 }
617 
618 U_CFUNC UBool
_isUnicodeLocaleTypeSubtag(const char * s,int32_t len)619 _isUnicodeLocaleTypeSubtag(const char*s, int32_t len) {
620     /*
621      * alphanum{3,8}
622      */
623     return _isAlphaNumericStringLimitedLength(s, len , 3, 8);
624 }
625 
626 U_CFUNC UBool
ultag_isUnicodeLocaleType(const char * s,int32_t len)627 ultag_isUnicodeLocaleType(const char*s, int32_t len) {
628     /*
629      * type = alphanum{3,8} (sep alphanum{3,8})* ;
630      */
631     return _isSepListOf(&_isUnicodeLocaleTypeSubtag, s, len);
632 }
633 
634 static UBool
_isTKey(const char * s,int32_t len)635 _isTKey(const char* s, int32_t len)
636 {
637     /*
638      * tkey = alpha digit ;
639      */
640     if (len < 0) {
641         len = (int32_t)uprv_strlen(s);
642     }
643     if (len == 2 && ISALPHA(*s) && ISNUMERIC(*(s + 1))) {
644         return TRUE;
645     }
646     return FALSE;
647 }
648 
649 U_CAPI const char * U_EXPORT2
ultag_getTKeyStart(const char * localeID)650 ultag_getTKeyStart(const char *localeID) {
651     const char *result = localeID;
652     const char *sep;
653     while((sep = uprv_strchr(result, SEP)) != nullptr) {
654         if (_isTKey(result, static_cast<int32_t>(sep - result))) {
655             return result;
656         }
657         result = ++sep;
658     }
659     if (_isTKey(result, -1)) {
660         return result;
661     }
662     return nullptr;
663 }
664 
665 static UBool
_isTValue(const char * s,int32_t len)666 _isTValue(const char* s, int32_t len)
667 {
668     /*
669      * tvalue = (sep alphanum{3,8})+ ;
670      */
671     return _isAlphaNumericStringLimitedLength(s, len , 3, 8);
672 }
673 
674 static UBool
_isTransformedExtensionSubtag(int32_t & state,const char * s,int32_t len)675 _isTransformedExtensionSubtag(int32_t& state, const char* s, int32_t len)
676 {
677     const int32_t kStart = 0;       // Start, wait for unicode_language_subtag, tkey or end
678     const int32_t kGotLanguage = 1; // Got unicode_language_subtag, wait for unicode_script_subtag,
679                                     // unicode_region_subtag, unicode_variant_subtag, tkey or end
680     const int32_t kGotScript = 2;   // Got unicode_script_subtag, wait for unicode_region_subtag,
681                                     // unicode_variant_subtag, tkey, or end
682     const int32_t kGotRegion = 3;   // Got unicode_region_subtag, wait for unicode_variant_subtag,
683                                     // tkey, or end.
684     const int32_t kGotVariant = 4;  // Got unicode_variant_subtag, wait for unicode_variant_subtag
685                                     // tkey or end.
686     const int32_t kGotTKey = -1;    // Got tkey, wait for tvalue. ERROR if stop here.
687     const int32_t kGotTValue = 6;   // Got tvalue, wait for tkey, tvalue or end
688 
689 
690     if (len < 0) {
691         len = (int32_t)uprv_strlen(s);
692     }
693     switch (state) {
694         case kStart:
695             if (ultag_isLanguageSubtag(s, len) && len != 4) {
696                 state = kGotLanguage;
697                 return TRUE;
698             }
699             if (_isTKey(s, len)) {
700                 state = kGotTKey;
701                 return TRUE;
702             }
703             return FALSE;
704         case kGotLanguage:
705             if (ultag_isScriptSubtag(s, len)) {
706                 state = kGotScript;
707                 return TRUE;
708             }
709             U_FALLTHROUGH;
710         case kGotScript:
711             if (ultag_isRegionSubtag(s, len)) {
712                 state = kGotRegion;
713                 return TRUE;
714             }
715             U_FALLTHROUGH;
716         case kGotRegion:
717             U_FALLTHROUGH;
718         case kGotVariant:
719             if (_isVariantSubtag(s, len)) {
720                 state = kGotVariant;
721                 return TRUE;
722             }
723             if (_isTKey(s, len)) {
724                 state = kGotTKey;
725                 return TRUE;
726             }
727             return FALSE;
728         case kGotTKey:
729             if (_isTValue(s, len)) {
730                 state = kGotTValue;
731                 return TRUE;
732             }
733             return FALSE;
734         case kGotTValue:
735             if (_isTKey(s, len)) {
736                 state = kGotTKey;
737                 return TRUE;
738             }
739             if (_isTValue(s, len)) {
740                 return TRUE;
741             }
742             return FALSE;
743     }
744     return FALSE;
745 }
746 
747 static UBool
_isUnicodeExtensionSubtag(int32_t & state,const char * s,int32_t len)748 _isUnicodeExtensionSubtag(int32_t& state, const char* s, int32_t len)
749 {
750     const int32_t kStart = 0;         // Start, wait for a key or attribute or end
751     const int32_t kGotKey = 1;        // Got a key, wait for type or key or end
752     const int32_t kGotType = 2;       // Got a type, wait for key or end
753 
754     switch (state) {
755         case kStart:
756             if (ultag_isUnicodeLocaleKey(s, len)) {
757                 state = kGotKey;
758                 return TRUE;
759             }
760             if (ultag_isUnicodeLocaleAttribute(s, len)) {
761                 return TRUE;
762             }
763             return FALSE;
764         case kGotKey:
765             if (ultag_isUnicodeLocaleKey(s, len)) {
766                 return TRUE;
767             }
768             if (_isUnicodeLocaleTypeSubtag(s, len)) {
769                 state = kGotType;
770                 return TRUE;
771             }
772             return FALSE;
773         case kGotType:
774             if (ultag_isUnicodeLocaleKey(s, len)) {
775                 state = kGotKey;
776                 return TRUE;
777             }
778             if (_isUnicodeLocaleTypeSubtag(s, len)) {
779                 return TRUE;
780             }
781             return FALSE;
782     }
783     return FALSE;
784 }
785 
786 static UBool
_isStatefulSepListOf(UBool (* test)(int32_t &,const char *,int32_t),const char * s,int32_t len)787 _isStatefulSepListOf(UBool (*test)(int32_t&, const char*, int32_t), const char* s, int32_t len)
788 {
789     int32_t state = 0;
790     const char* p;
791     const char* start = s;
792     int32_t subtagLen = 0;
793 
794     if (len < 0) {
795         len = (int32_t)uprv_strlen(s);
796     }
797 
798     for (p = s; len > 0; p++, len--) {
799         if (*p == SEP) {
800             if (!test(state, start, subtagLen)) {
801                 return FALSE;
802             }
803             subtagLen = 0;
804             start = p + 1;
805         } else {
806             subtagLen++;
807         }
808     }
809 
810     if (test(state, start, subtagLen) && state >= 0) {
811         return TRUE;
812     }
813     return FALSE;
814 }
815 
816 U_CFUNC UBool
ultag_isTransformedExtensionSubtags(const char * s,int32_t len)817 ultag_isTransformedExtensionSubtags(const char* s, int32_t len)
818 {
819     return _isStatefulSepListOf(&_isTransformedExtensionSubtag, s, len);
820 }
821 
822 U_CFUNC UBool
ultag_isUnicodeExtensionSubtags(const char * s,int32_t len)823 ultag_isUnicodeExtensionSubtags(const char* s, int32_t len) {
824     return _isStatefulSepListOf(&_isUnicodeExtensionSubtag, s, len);
825 }
826 
827 
828 /*
829 * -------------------------------------------------
830 *
831 * Helper functions
832 *
833 * -------------------------------------------------
834 */
835 
836 static UBool
_addVariantToList(VariantListEntry ** first,VariantListEntry * var)837 _addVariantToList(VariantListEntry **first, VariantListEntry *var) {
838     UBool bAdded = TRUE;
839 
840     if (*first == NULL) {
841         var->next = NULL;
842         *first = var;
843     } else {
844         VariantListEntry *prev, *cur;
845         int32_t cmp;
846 
847         /* variants order should be preserved */
848         prev = NULL;
849         cur = *first;
850         while (TRUE) {
851             if (cur == NULL) {
852                 prev->next = var;
853                 var->next = NULL;
854                 break;
855             }
856 
857             /* Checking for duplicate variant */
858             cmp = uprv_compareInvCharsAsAscii(var->variant, cur->variant);
859             if (cmp == 0) {
860                 /* duplicated variant */
861                 bAdded = FALSE;
862                 break;
863             }
864             prev = cur;
865             cur = cur->next;
866         }
867     }
868 
869     return bAdded;
870 }
871 
872 static UBool
_addAttributeToList(AttributeListEntry ** first,AttributeListEntry * attr)873 _addAttributeToList(AttributeListEntry **first, AttributeListEntry *attr) {
874     UBool bAdded = TRUE;
875 
876     if (*first == NULL) {
877         attr->next = NULL;
878         *first = attr;
879     } else {
880         AttributeListEntry *prev, *cur;
881         int32_t cmp;
882 
883         /* reorder variants in alphabetical order */
884         prev = NULL;
885         cur = *first;
886         while (TRUE) {
887             if (cur == NULL) {
888                 prev->next = attr;
889                 attr->next = NULL;
890                 break;
891             }
892             cmp = uprv_compareInvCharsAsAscii(attr->attribute, cur->attribute);
893             if (cmp < 0) {
894                 if (prev == NULL) {
895                     *first = attr;
896                 } else {
897                     prev->next = attr;
898                 }
899                 attr->next = cur;
900                 break;
901             }
902             if (cmp == 0) {
903                 /* duplicated variant */
904                 bAdded = FALSE;
905                 break;
906             }
907             prev = cur;
908             cur = cur->next;
909         }
910     }
911 
912     return bAdded;
913 }
914 
915 
916 static UBool
_addExtensionToList(ExtensionListEntry ** first,ExtensionListEntry * ext,UBool localeToBCP)917 _addExtensionToList(ExtensionListEntry **first, ExtensionListEntry *ext, UBool localeToBCP) {
918     UBool bAdded = TRUE;
919 
920     if (*first == NULL) {
921         ext->next = NULL;
922         *first = ext;
923     } else {
924         ExtensionListEntry *prev, *cur;
925         int32_t cmp;
926 
927         /* reorder variants in alphabetical order */
928         prev = NULL;
929         cur = *first;
930         while (TRUE) {
931             if (cur == NULL) {
932                 prev->next = ext;
933                 ext->next = NULL;
934                 break;
935             }
936             if (localeToBCP) {
937                 /* special handling for locale to bcp conversion */
938                 int32_t len, curlen;
939 
940                 len = (int32_t)uprv_strlen(ext->key);
941                 curlen = (int32_t)uprv_strlen(cur->key);
942 
943                 if (len == 1 && curlen == 1) {
944                     if (*(ext->key) == *(cur->key)) {
945                         cmp = 0;
946                     } else if (*(ext->key) == PRIVATEUSE) {
947                         cmp = 1;
948                     } else if (*(cur->key) == PRIVATEUSE) {
949                         cmp = -1;
950                     } else {
951                         cmp = *(ext->key) - *(cur->key);
952                     }
953                 } else if (len == 1) {
954                     cmp = *(ext->key) - LDMLEXT;
955                 } else if (curlen == 1) {
956                     cmp = LDMLEXT - *(cur->key);
957                 } else {
958                     cmp = uprv_compareInvCharsAsAscii(ext->key, cur->key);
959                     /* Both are u extension keys - we need special handling for 'attribute' */
960                     if (cmp != 0) {
961                         if (uprv_strcmp(cur->key, LOCALE_ATTRIBUTE_KEY) == 0) {
962                             cmp = 1;
963                         } else if (uprv_strcmp(ext->key, LOCALE_ATTRIBUTE_KEY) == 0) {
964                             cmp = -1;
965                         }
966                     }
967                 }
968             } else {
969                 cmp = uprv_compareInvCharsAsAscii(ext->key, cur->key);
970             }
971             if (cmp < 0) {
972                 if (prev == NULL) {
973                     *first = ext;
974                 } else {
975                     prev->next = ext;
976                 }
977                 ext->next = cur;
978                 break;
979             }
980             if (cmp == 0) {
981                 /* duplicated extension key */
982                 bAdded = FALSE;
983                 break;
984             }
985             prev = cur;
986             cur = cur->next;
987         }
988     }
989 
990     return bAdded;
991 }
992 
993 static void
_initializeULanguageTag(ULanguageTag * langtag)994 _initializeULanguageTag(ULanguageTag* langtag) {
995     int32_t i;
996 
997     langtag->buf = NULL;
998 
999     langtag->language = EMPTY;
1000     for (i = 0; i < MAXEXTLANG; i++) {
1001         langtag->extlang[i] = NULL;
1002     }
1003 
1004     langtag->script = EMPTY;
1005     langtag->region = EMPTY;
1006 
1007     langtag->variants = NULL;
1008     langtag->extensions = NULL;
1009 
1010     langtag->legacy = EMPTY;
1011     langtag->privateuse = EMPTY;
1012 }
1013 
1014 static void
_appendLanguageToLanguageTag(const char * localeID,icu::ByteSink & sink,UBool strict,UErrorCode * status)1015 _appendLanguageToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UErrorCode* status) {
1016     char buf[ULOC_LANG_CAPACITY];
1017     UErrorCode tmpStatus = U_ZERO_ERROR;
1018     int32_t len, i;
1019 
1020     if (U_FAILURE(*status)) {
1021         return;
1022     }
1023 
1024     len = uloc_getLanguage(localeID, buf, sizeof(buf), &tmpStatus);
1025     if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
1026         if (strict) {
1027             *status = U_ILLEGAL_ARGUMENT_ERROR;
1028             return;
1029         }
1030         len = 0;
1031     }
1032 
1033     /* Note: returned language code is in lower case letters */
1034 
1035     if (len == 0) {
1036         sink.Append(LANG_UND, LANG_UND_LEN);
1037     } else if (!ultag_isLanguageSubtag(buf, len)) {
1038             /* invalid language code */
1039         if (strict) {
1040             *status = U_ILLEGAL_ARGUMENT_ERROR;
1041             return;
1042         }
1043         sink.Append(LANG_UND, LANG_UND_LEN);
1044     } else {
1045         /* resolve deprecated */
1046         for (i = 0; i < UPRV_LENGTHOF(DEPRECATEDLANGS); i += 2) {
1047             // 2-letter deprecated subtags are listede before 3-letter
1048             // ones in DEPRECATEDLANGS[]. Get out of loop on coming
1049             // across the 1st 3-letter subtag, if the input is a 2-letter code.
1050             // to avoid continuing to try when there's no match.
1051             if (uprv_strlen(buf) < uprv_strlen(DEPRECATEDLANGS[i])) break;
1052             if (uprv_compareInvCharsAsAscii(buf, DEPRECATEDLANGS[i]) == 0) {
1053                 uprv_strcpy(buf, DEPRECATEDLANGS[i + 1]);
1054                 len = (int32_t)uprv_strlen(buf);
1055                 break;
1056             }
1057         }
1058         sink.Append(buf, len);
1059     }
1060 }
1061 
1062 static void
_appendScriptToLanguageTag(const char * localeID,icu::ByteSink & sink,UBool strict,UErrorCode * status)1063 _appendScriptToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UErrorCode* status) {
1064     char buf[ULOC_SCRIPT_CAPACITY];
1065     UErrorCode tmpStatus = U_ZERO_ERROR;
1066     int32_t len;
1067 
1068     if (U_FAILURE(*status)) {
1069         return;
1070     }
1071 
1072     len = uloc_getScript(localeID, buf, sizeof(buf), &tmpStatus);
1073     if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
1074         if (strict) {
1075             *status = U_ILLEGAL_ARGUMENT_ERROR;
1076         }
1077         return;
1078     }
1079 
1080     if (len > 0) {
1081         if (!ultag_isScriptSubtag(buf, len)) {
1082             /* invalid script code */
1083             if (strict) {
1084                 *status = U_ILLEGAL_ARGUMENT_ERROR;
1085             }
1086             return;
1087         } else {
1088             sink.Append("-", 1);
1089             sink.Append(buf, len);
1090         }
1091     }
1092 }
1093 
1094 static void
_appendRegionToLanguageTag(const char * localeID,icu::ByteSink & sink,UBool strict,UErrorCode * status)1095 _appendRegionToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UErrorCode* status) {
1096     char buf[ULOC_COUNTRY_CAPACITY];
1097     UErrorCode tmpStatus = U_ZERO_ERROR;
1098     int32_t len;
1099 
1100     if (U_FAILURE(*status)) {
1101         return;
1102     }
1103 
1104     len = uloc_getCountry(localeID, buf, sizeof(buf), &tmpStatus);
1105     if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
1106         if (strict) {
1107             *status = U_ILLEGAL_ARGUMENT_ERROR;
1108         }
1109         return;
1110     }
1111 
1112     if (len > 0) {
1113         if (!ultag_isRegionSubtag(buf, len)) {
1114             /* invalid region code */
1115             if (strict) {
1116                 *status = U_ILLEGAL_ARGUMENT_ERROR;
1117             }
1118             return;
1119         } else {
1120             sink.Append("-", 1);
1121             /* resolve deprecated */
1122             for (int i = 0; i < UPRV_LENGTHOF(DEPRECATEDREGIONS); i += 2) {
1123                 if (uprv_compareInvCharsAsAscii(buf, DEPRECATEDREGIONS[i]) == 0) {
1124                     uprv_strcpy(buf, DEPRECATEDREGIONS[i + 1]);
1125                     len = (int32_t)uprv_strlen(buf);
1126                     break;
1127                 }
1128             }
1129             sink.Append(buf, len);
1130         }
1131     }
1132 }
1133 
_sortVariants(VariantListEntry * first)1134 static void _sortVariants(VariantListEntry* first) {
1135     for (VariantListEntry* var1 = first; var1 != NULL; var1 = var1->next) {
1136         for (VariantListEntry* var2 = var1->next; var2 != NULL; var2 = var2->next) {
1137             // Swap var1->variant and var2->variant.
1138             if (uprv_compareInvCharsAsAscii(var1->variant, var2->variant) > 0) {
1139                 const char* temp = var1->variant;
1140                 var1->variant = var2->variant;
1141                 var2->variant = temp;
1142             }
1143         }
1144     }
1145 }
1146 
1147 static void
_appendVariantsToLanguageTag(const char * localeID,icu::ByteSink & sink,UBool strict,UBool * hadPosix,UErrorCode * status)1148 _appendVariantsToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UBool *hadPosix, UErrorCode* status) {
1149     char buf[ULOC_FULLNAME_CAPACITY];
1150     UErrorCode tmpStatus = U_ZERO_ERROR;
1151     int32_t len, i;
1152 
1153     if (U_FAILURE(*status)) {
1154         return;
1155     }
1156 
1157     len = uloc_getVariant(localeID, buf, sizeof(buf), &tmpStatus);
1158     if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
1159         if (strict) {
1160             *status = U_ILLEGAL_ARGUMENT_ERROR;
1161         }
1162         return;
1163     }
1164 
1165     if (len > 0) {
1166         char *p, *pVar;
1167         UBool bNext = TRUE;
1168         VariantListEntry *var;
1169         VariantListEntry *varFirst = NULL;
1170 
1171         pVar = NULL;
1172         p = buf;
1173         while (bNext) {
1174             if (*p == SEP || *p == LOCALE_SEP || *p == 0) {
1175                 if (*p == 0) {
1176                     bNext = FALSE;
1177                 } else {
1178                     *p = 0; /* terminate */
1179                 }
1180                 if (pVar == NULL) {
1181                     if (strict) {
1182                         *status = U_ILLEGAL_ARGUMENT_ERROR;
1183                         break;
1184                     }
1185                     /* ignore empty variant */
1186                 } else {
1187                     /* ICU uses upper case letters for variants, but
1188                        the canonical format is lowercase in BCP47 */
1189                     for (i = 0; *(pVar + i) != 0; i++) {
1190                         *(pVar + i) = uprv_tolower(*(pVar + i));
1191                     }
1192 
1193                     /* validate */
1194                     if (_isVariantSubtag(pVar, -1)) {
1195                         if (uprv_strcmp(pVar,POSIX_VALUE) || len != (int32_t)uprv_strlen(POSIX_VALUE)) {
1196                             /* emit the variant to the list */
1197                             var = (VariantListEntry*)uprv_malloc(sizeof(VariantListEntry));
1198                             if (var == NULL) {
1199                                 *status = U_MEMORY_ALLOCATION_ERROR;
1200                                 break;
1201                             }
1202                             var->variant = pVar;
1203                             if (!_addVariantToList(&varFirst, var)) {
1204                                 /* duplicated variant */
1205                                 uprv_free(var);
1206                                 if (strict) {
1207                                     *status = U_ILLEGAL_ARGUMENT_ERROR;
1208                                     break;
1209                                 }
1210                             }
1211                         } else {
1212                             /* Special handling for POSIX variant, need to remember that we had it and then */
1213                             /* treat it like an extension later. */
1214                             *hadPosix = TRUE;
1215                         }
1216                     } else if (strict) {
1217                         *status = U_ILLEGAL_ARGUMENT_ERROR;
1218                         break;
1219                     } else if (_isPrivateuseValueSubtag(pVar, -1)) {
1220                         /* Handle private use subtags separately */
1221                         break;
1222                     }
1223                 }
1224                 /* reset variant starting position */
1225                 pVar = NULL;
1226             } else if (pVar == NULL) {
1227                 pVar = p;
1228             }
1229             p++;
1230         }
1231 
1232         if (U_SUCCESS(*status)) {
1233             if (varFirst != NULL) {
1234                 int32_t varLen;
1235 
1236                 /* per UTS35, we should sort the variants */
1237                 _sortVariants(varFirst);
1238 
1239                 /* write out validated/normalized variants to the target */
1240                 var = varFirst;
1241                 while (var != NULL) {
1242                     sink.Append("-", 1);
1243                     varLen = (int32_t)uprv_strlen(var->variant);
1244                     sink.Append(var->variant, varLen);
1245                     var = var->next;
1246                 }
1247             }
1248         }
1249 
1250         /* clean up */
1251         var = varFirst;
1252         while (var != NULL) {
1253             VariantListEntry *tmpVar = var->next;
1254             uprv_free(var);
1255             var = tmpVar;
1256         }
1257 
1258         if (U_FAILURE(*status)) {
1259             return;
1260         }
1261     }
1262 }
1263 
1264 static void
_appendKeywordsToLanguageTag(const char * localeID,icu::ByteSink & sink,UBool strict,UBool hadPosix,UErrorCode * status)1265 _appendKeywordsToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UBool hadPosix, UErrorCode* status) {
1266     char attrBuf[ULOC_KEYWORD_AND_VALUES_CAPACITY] = { 0 };
1267     int32_t attrBufLength = 0;
1268 
1269     icu::MemoryPool<AttributeListEntry> attrPool;
1270     icu::MemoryPool<ExtensionListEntry> extPool;
1271     icu::MemoryPool<icu::CharString> strPool;
1272 
1273     icu::LocalUEnumerationPointer keywordEnum(uloc_openKeywords(localeID, status));
1274     if (U_FAILURE(*status) && !hadPosix) {
1275         return;
1276     }
1277     if (keywordEnum.isValid() || hadPosix) {
1278         /* reorder extensions */
1279         int32_t len;
1280         const char *key;
1281         ExtensionListEntry *firstExt = NULL;
1282         ExtensionListEntry *ext;
1283         AttributeListEntry *firstAttr = NULL;
1284         AttributeListEntry *attr;
1285         icu::MemoryPool<icu::CharString> extBufPool;
1286         const char *bcpKey=nullptr, *bcpValue=nullptr;
1287         UErrorCode tmpStatus = U_ZERO_ERROR;
1288         int32_t keylen;
1289         UBool isBcpUExt;
1290 
1291         while (TRUE) {
1292             key = uenum_next(keywordEnum.getAlias(), NULL, status);
1293             if (key == NULL) {
1294                 break;
1295             }
1296 
1297             icu::CharString buf;
1298             {
1299                 icu::CharStringByteSink sink(&buf);
1300                 ulocimp_getKeywordValue(localeID, key, sink, &tmpStatus);
1301             }
1302             len = buf.length();
1303 
1304             if (U_FAILURE(tmpStatus)) {
1305                 if (tmpStatus == U_MEMORY_ALLOCATION_ERROR) {
1306                     *status = U_MEMORY_ALLOCATION_ERROR;
1307                     break;
1308                 }
1309                 if (strict) {
1310                     *status = U_ILLEGAL_ARGUMENT_ERROR;
1311                     break;
1312                 }
1313                 /* ignore this keyword */
1314                 tmpStatus = U_ZERO_ERROR;
1315                 continue;
1316             }
1317 
1318             keylen = (int32_t)uprv_strlen(key);
1319             isBcpUExt = (keylen > 1);
1320 
1321             /* special keyword used for representing Unicode locale attributes */
1322             if (uprv_strcmp(key, LOCALE_ATTRIBUTE_KEY) == 0) {
1323                 if (len > 0) {
1324                     int32_t i = 0;
1325                     while (TRUE) {
1326                         attrBufLength = 0;
1327                         for (; i < len; i++) {
1328                             if (buf[i] != '-') {
1329                                 attrBuf[attrBufLength++] = buf[i];
1330                             } else {
1331                                 i++;
1332                                 break;
1333                             }
1334                         }
1335                         if (attrBufLength > 0) {
1336                             attrBuf[attrBufLength] = 0;
1337 
1338                         } else if (i >= len){
1339                             break;
1340                         }
1341 
1342                         /* create AttributeListEntry */
1343                         attr = attrPool.create();
1344                         if (attr == NULL) {
1345                             *status = U_MEMORY_ALLOCATION_ERROR;
1346                             break;
1347                         }
1348                         icu::CharString* attrValue =
1349                                 strPool.create(attrBuf, attrBufLength, *status);
1350                         if (attrValue == NULL) {
1351                             *status = U_MEMORY_ALLOCATION_ERROR;
1352                             break;
1353                         }
1354                         if (U_FAILURE(*status)) {
1355                             break;
1356                         }
1357                         attr->attribute = attrValue->data();
1358 
1359                         if (!_addAttributeToList(&firstAttr, attr)) {
1360                             if (strict) {
1361                                 *status = U_ILLEGAL_ARGUMENT_ERROR;
1362                                 break;
1363                             }
1364                         }
1365                     }
1366                     /* for a place holder ExtensionListEntry */
1367                     bcpKey = LOCALE_ATTRIBUTE_KEY;
1368                     bcpValue = NULL;
1369                 }
1370             } else if (isBcpUExt) {
1371                 bcpKey = uloc_toUnicodeLocaleKey(key);
1372                 if (bcpKey == NULL) {
1373                     if (strict) {
1374                         *status = U_ILLEGAL_ARGUMENT_ERROR;
1375                         break;
1376                     }
1377                     continue;
1378                 }
1379 
1380                 /* we've checked buf is null-terminated above */
1381                 bcpValue = uloc_toUnicodeLocaleType(key, buf.data());
1382                 if (bcpValue == NULL) {
1383                     if (strict) {
1384                         *status = U_ILLEGAL_ARGUMENT_ERROR;
1385                         break;
1386                     }
1387                     continue;
1388                 }
1389                 if (bcpValue == buf.data()) {
1390                     /*
1391                     When uloc_toUnicodeLocaleType(key, buf) returns the
1392                     input value as is, the value is well-formed, but has
1393                     no known mapping. This implementation normalizes the
1394                     value to lower case
1395                     */
1396                     icu::CharString* extBuf = extBufPool.create(buf, tmpStatus);
1397 
1398                     if (extBuf == nullptr) {
1399                         *status = U_MEMORY_ALLOCATION_ERROR;
1400                         break;
1401                     }
1402                     if (U_FAILURE(tmpStatus)) {
1403                         *status = tmpStatus;
1404                         break;
1405                     }
1406 
1407                     T_CString_toLowerCase(extBuf->data());
1408                     bcpValue = extBuf->data();
1409                 }
1410             } else {
1411                 if (*key == PRIVATEUSE) {
1412                     if (!ultag_isPrivateuseValueSubtags(buf.data(), len)) {
1413                         if (strict) {
1414                             *status = U_ILLEGAL_ARGUMENT_ERROR;
1415                             break;
1416                         }
1417                         continue;
1418                     }
1419                 } else {
1420                     if (!_isExtensionSingleton(key, keylen) || !ultag_isExtensionSubtags(buf.data(), len)) {
1421                         if (strict) {
1422                             *status = U_ILLEGAL_ARGUMENT_ERROR;
1423                             break;
1424                         }
1425                         continue;
1426                     }
1427                 }
1428                 bcpKey = key;
1429                 icu::CharString* extBuf =
1430                     extBufPool.create(buf.data(), len, tmpStatus);
1431                 if (extBuf == nullptr) {
1432                     *status = U_MEMORY_ALLOCATION_ERROR;
1433                     break;
1434                 }
1435                 if (U_FAILURE(tmpStatus)) {
1436                     *status = tmpStatus;
1437                     break;
1438                 }
1439                 bcpValue = extBuf->data();
1440             }
1441 
1442             /* create ExtensionListEntry */
1443             ext = extPool.create();
1444             if (ext == NULL) {
1445                 *status = U_MEMORY_ALLOCATION_ERROR;
1446                 break;
1447             }
1448             ext->key = bcpKey;
1449             ext->value = bcpValue;
1450 
1451             if (!_addExtensionToList(&firstExt, ext, TRUE)) {
1452                 if (strict) {
1453                     *status = U_ILLEGAL_ARGUMENT_ERROR;
1454                     break;
1455                 }
1456             }
1457         }
1458 
1459         /* Special handling for POSIX variant - add the keywords for POSIX */
1460         if (hadPosix) {
1461             /* create ExtensionListEntry for POSIX */
1462             ext = extPool.create();
1463             if (ext == NULL) {
1464                 *status = U_MEMORY_ALLOCATION_ERROR;
1465                 return;
1466             }
1467             ext->key = POSIX_KEY;
1468             ext->value = POSIX_VALUE;
1469 
1470             if (!_addExtensionToList(&firstExt, ext, TRUE)) {
1471                 // Silently ignore errors.
1472             }
1473         }
1474 
1475         if (U_SUCCESS(*status) && (firstExt != NULL || firstAttr != NULL)) {
1476             UBool startLDMLExtension = FALSE;
1477             for (ext = firstExt; ext; ext = ext->next) {
1478                 if (!startLDMLExtension && uprv_strlen(ext->key) > 1) {
1479                     /* first LDML u singlton extension */
1480                    sink.Append("-u", 2);
1481                    startLDMLExtension = TRUE;
1482                 }
1483 
1484                 /* write out the sorted BCP47 attributes, extensions and private use */
1485                 if (uprv_strcmp(ext->key, LOCALE_ATTRIBUTE_KEY) == 0) {
1486                     /* write the value for the attributes */
1487                     for (attr = firstAttr; attr; attr = attr->next) {
1488                         sink.Append("-", 1);
1489                         sink.Append(
1490                                 attr->attribute, static_cast<int32_t>(uprv_strlen(attr->attribute)));
1491                     }
1492                 } else {
1493                     sink.Append("-", 1);
1494                     sink.Append(ext->key, static_cast<int32_t>(uprv_strlen(ext->key)));
1495                     if (uprv_strcmp(ext->value, "true") != 0 &&
1496                         uprv_strcmp(ext->value, "yes") != 0) {
1497                       sink.Append("-", 1);
1498                       sink.Append(ext->value, static_cast<int32_t>(uprv_strlen(ext->value)));
1499                     }
1500                 }
1501             }
1502         }
1503     }
1504 }
1505 
1506 /**
1507  * Append keywords parsed from LDML extension value
1508  * e.g. "u-ca-gregory-co-trad" -> {calendar = gregorian} {collation = traditional}
1509  * Note: char* buf is used for storing keywords
1510  */
1511 static void
_appendLDMLExtensionAsKeywords(const char * ldmlext,ExtensionListEntry ** appendTo,icu::MemoryPool<ExtensionListEntry> & extPool,icu::MemoryPool<icu::CharString> & kwdBuf,UBool * posixVariant,UErrorCode * status)1512 _appendLDMLExtensionAsKeywords(const char* ldmlext, ExtensionListEntry** appendTo, icu::MemoryPool<ExtensionListEntry>& extPool, icu::MemoryPool<icu::CharString>& kwdBuf, UBool *posixVariant, UErrorCode *status) {
1513     const char *pTag;   /* beginning of current subtag */
1514     const char *pKwds;  /* beginning of key-type pairs */
1515     UBool variantExists = *posixVariant;
1516 
1517     ExtensionListEntry *kwdFirst = NULL;    /* first LDML keyword */
1518     ExtensionListEntry *kwd, *nextKwd;
1519 
1520     int32_t len;
1521 
1522     /* Reset the posixVariant value */
1523     *posixVariant = FALSE;
1524 
1525     pTag = ldmlext;
1526     pKwds = NULL;
1527 
1528     {
1529         AttributeListEntry *attrFirst = NULL;   /* first attribute */
1530         AttributeListEntry *attr, *nextAttr;
1531 
1532         char attrBuf[ULOC_KEYWORD_AND_VALUES_CAPACITY];
1533         int32_t attrBufIdx = 0;
1534 
1535         icu::MemoryPool<AttributeListEntry> attrPool;
1536 
1537         /* Iterate through u extension attributes */
1538         while (*pTag) {
1539             /* locate next separator char */
1540             for (len = 0; *(pTag + len) && *(pTag + len) != SEP; len++);
1541 
1542             if (ultag_isUnicodeLocaleKey(pTag, len)) {
1543                 pKwds = pTag;
1544                 break;
1545             }
1546 
1547             /* add this attribute to the list */
1548             attr = attrPool.create();
1549             if (attr == NULL) {
1550                 *status = U_MEMORY_ALLOCATION_ERROR;
1551                 return;
1552             }
1553 
1554             if (len < (int32_t)sizeof(attrBuf) - attrBufIdx) {
1555                 uprv_memcpy(&attrBuf[attrBufIdx], pTag, len);
1556                 attrBuf[attrBufIdx + len] = 0;
1557                 attr->attribute = &attrBuf[attrBufIdx];
1558                 attrBufIdx += (len + 1);
1559             } else {
1560                 *status = U_ILLEGAL_ARGUMENT_ERROR;
1561                 return;
1562             }
1563 
1564             // duplicate attribute is ignored, causes no error.
1565             _addAttributeToList(&attrFirst, attr);
1566 
1567             /* next tag */
1568             pTag += len;
1569             if (*pTag) {
1570                 /* next to the separator */
1571                 pTag++;
1572             }
1573         }
1574 
1575         if (attrFirst) {
1576             /* emit attributes as an LDML keyword, e.g. attribute=attr1-attr2 */
1577 
1578             kwd = extPool.create();
1579             if (kwd == NULL) {
1580                 *status = U_MEMORY_ALLOCATION_ERROR;
1581                 return;
1582             }
1583 
1584             icu::CharString* value = kwdBuf.create();
1585             if (value == NULL) {
1586                 *status = U_MEMORY_ALLOCATION_ERROR;
1587                 return;
1588             }
1589 
1590             /* attribute subtags sorted in alphabetical order as type */
1591             attr = attrFirst;
1592             while (attr != NULL) {
1593                 nextAttr = attr->next;
1594                 if (attr != attrFirst) {
1595                     value->append('-', *status);
1596                 }
1597                 value->append(attr->attribute, *status);
1598                 attr = nextAttr;
1599             }
1600             if (U_FAILURE(*status)) {
1601                 return;
1602             }
1603 
1604             kwd->key = LOCALE_ATTRIBUTE_KEY;
1605             kwd->value = value->data();
1606 
1607             if (!_addExtensionToList(&kwdFirst, kwd, FALSE)) {
1608                 *status = U_ILLEGAL_ARGUMENT_ERROR;
1609                 return;
1610             }
1611         }
1612     }
1613 
1614     if (pKwds) {
1615         const char *pBcpKey = NULL;     /* u extenstion key subtag */
1616         const char *pBcpType = NULL;    /* beginning of u extension type subtag(s) */
1617         int32_t bcpKeyLen = 0;
1618         int32_t bcpTypeLen = 0;
1619         UBool isDone = FALSE;
1620 
1621         pTag = pKwds;
1622         /* BCP47 representation of LDML key/type pairs */
1623         while (!isDone) {
1624             const char *pNextBcpKey = NULL;
1625             int32_t nextBcpKeyLen = 0;
1626             UBool emitKeyword = FALSE;
1627 
1628             if (*pTag) {
1629                 /* locate next separator char */
1630                 for (len = 0; *(pTag + len) && *(pTag + len) != SEP; len++);
1631 
1632                 if (ultag_isUnicodeLocaleKey(pTag, len)) {
1633                     if (pBcpKey) {
1634                         emitKeyword = TRUE;
1635                         pNextBcpKey = pTag;
1636                         nextBcpKeyLen = len;
1637                     } else {
1638                         pBcpKey = pTag;
1639                         bcpKeyLen = len;
1640                     }
1641                 } else {
1642                     U_ASSERT(pBcpKey != NULL);
1643                     /* within LDML type subtags */
1644                     if (pBcpType) {
1645                         bcpTypeLen += (len + 1);
1646                     } else {
1647                         pBcpType = pTag;
1648                         bcpTypeLen = len;
1649                     }
1650                 }
1651 
1652                 /* next tag */
1653                 pTag += len;
1654                 if (*pTag) {
1655                     /* next to the separator */
1656                     pTag++;
1657                 }
1658             } else {
1659                 /* processing last one */
1660                 emitKeyword = TRUE;
1661                 isDone = TRUE;
1662             }
1663 
1664             if (emitKeyword) {
1665                 const char *pKey = NULL;    /* LDML key */
1666                 const char *pType = NULL;   /* LDML type */
1667 
1668                 char bcpKeyBuf[3];          /* BCP key length is always 2 for now */
1669 
1670                 U_ASSERT(pBcpKey != NULL);
1671 
1672                 if (bcpKeyLen >= (int32_t)sizeof(bcpKeyBuf)) {
1673                     /* the BCP key is invalid */
1674                     *status = U_ILLEGAL_ARGUMENT_ERROR;
1675                     return;
1676                 }
1677                 U_ASSERT(bcpKeyLen <= 2);
1678 
1679                 uprv_strncpy(bcpKeyBuf, pBcpKey, bcpKeyLen);
1680                 bcpKeyBuf[bcpKeyLen] = 0;
1681 
1682                 /* u extension key to LDML key */
1683                 pKey = uloc_toLegacyKey(bcpKeyBuf);
1684                 if (pKey == NULL) {
1685                     *status = U_ILLEGAL_ARGUMENT_ERROR;
1686                     return;
1687                 }
1688                 if (pKey == bcpKeyBuf) {
1689                     /*
1690                     The key returned by toLegacyKey points to the input buffer.
1691                     We normalize the result key to lower case.
1692                     */
1693                     T_CString_toLowerCase(bcpKeyBuf);
1694                     icu::CharString* key = kwdBuf.create(bcpKeyBuf, bcpKeyLen, *status);
1695                     if (key == NULL) {
1696                         *status = U_MEMORY_ALLOCATION_ERROR;
1697                         return;
1698                     }
1699                     if (U_FAILURE(*status)) {
1700                         return;
1701                     }
1702                     pKey = key->data();
1703                 }
1704 
1705                 if (pBcpType) {
1706                     char bcpTypeBuf[128];       /* practically long enough even considering multiple subtag type */
1707                     if (bcpTypeLen >= (int32_t)sizeof(bcpTypeBuf)) {
1708                         /* the BCP type is too long */
1709                         *status = U_ILLEGAL_ARGUMENT_ERROR;
1710                         return;
1711                     }
1712 
1713                     uprv_strncpy(bcpTypeBuf, pBcpType, bcpTypeLen);
1714                     bcpTypeBuf[bcpTypeLen] = 0;
1715 
1716                     /* BCP type to locale type */
1717                     pType = uloc_toLegacyType(pKey, bcpTypeBuf);
1718                     if (pType == NULL) {
1719                         *status = U_ILLEGAL_ARGUMENT_ERROR;
1720                         return;
1721                     }
1722                     if (pType == bcpTypeBuf) {
1723                         /*
1724                         The type returned by toLegacyType points to the input buffer.
1725                         We normalize the result type to lower case.
1726                         */
1727                         /* normalize to lower case */
1728                         T_CString_toLowerCase(bcpTypeBuf);
1729                         icu::CharString* type = kwdBuf.create(bcpTypeBuf, bcpTypeLen, *status);
1730                         if (type == NULL) {
1731                             *status = U_MEMORY_ALLOCATION_ERROR;
1732                             return;
1733                         }
1734                         if (U_FAILURE(*status)) {
1735                             return;
1736                         }
1737                         pType = type->data();
1738                     }
1739                 } else {
1740                     /* typeless - default type value is "yes" */
1741                     pType = LOCALE_TYPE_YES;
1742                 }
1743 
1744                 /* Special handling for u-va-posix, since we want to treat this as a variant,
1745                    not as a keyword */
1746                 if (!variantExists && !uprv_strcmp(pKey, POSIX_KEY) && !uprv_strcmp(pType, POSIX_VALUE) ) {
1747                     *posixVariant = TRUE;
1748                 } else {
1749                     /* create an ExtensionListEntry for this keyword */
1750                     kwd = extPool.create();
1751                     if (kwd == NULL) {
1752                         *status = U_MEMORY_ALLOCATION_ERROR;
1753                         return;
1754                     }
1755 
1756                     kwd->key = pKey;
1757                     kwd->value = pType;
1758 
1759                     if (!_addExtensionToList(&kwdFirst, kwd, FALSE)) {
1760                         // duplicate keyword is allowed, Only the first
1761                         // is honored.
1762                     }
1763                 }
1764 
1765                 pBcpKey = pNextBcpKey;
1766                 bcpKeyLen = pNextBcpKey != NULL ? nextBcpKeyLen : 0;
1767                 pBcpType = NULL;
1768                 bcpTypeLen = 0;
1769             }
1770         }
1771     }
1772 
1773     kwd = kwdFirst;
1774     while (kwd != NULL) {
1775         nextKwd = kwd->next;
1776         _addExtensionToList(appendTo, kwd, FALSE);
1777         kwd = nextKwd;
1778     }
1779 }
1780 
1781 
1782 static void
_appendKeywords(ULanguageTag * langtag,icu::ByteSink & sink,UErrorCode * status)1783 _appendKeywords(ULanguageTag* langtag, icu::ByteSink& sink, UErrorCode* status) {
1784     int32_t i, n;
1785     int32_t len;
1786     ExtensionListEntry *kwdFirst = NULL;
1787     ExtensionListEntry *kwd;
1788     const char *key, *type;
1789     icu::MemoryPool<ExtensionListEntry> extPool;
1790     icu::MemoryPool<icu::CharString> kwdBuf;
1791     UBool posixVariant = FALSE;
1792 
1793     if (U_FAILURE(*status)) {
1794         return;
1795     }
1796 
1797     n = ultag_getExtensionsSize(langtag);
1798 
1799     /* resolve locale keywords and reordering keys */
1800     for (i = 0; i < n; i++) {
1801         key = ultag_getExtensionKey(langtag, i);
1802         type = ultag_getExtensionValue(langtag, i);
1803         if (*key == LDMLEXT) {
1804             /* Determine if variants already exists */
1805             if (ultag_getVariantsSize(langtag)) {
1806                 posixVariant = TRUE;
1807             }
1808 
1809             _appendLDMLExtensionAsKeywords(type, &kwdFirst, extPool, kwdBuf, &posixVariant, status);
1810             if (U_FAILURE(*status)) {
1811                 break;
1812             }
1813         } else {
1814             kwd = extPool.create();
1815             if (kwd == NULL) {
1816                 *status = U_MEMORY_ALLOCATION_ERROR;
1817                 break;
1818             }
1819             kwd->key = key;
1820             kwd->value = type;
1821             if (!_addExtensionToList(&kwdFirst, kwd, FALSE)) {
1822                 *status = U_ILLEGAL_ARGUMENT_ERROR;
1823                 break;
1824             }
1825         }
1826     }
1827 
1828     if (U_SUCCESS(*status)) {
1829         type = ultag_getPrivateUse(langtag);
1830         if ((int32_t)uprv_strlen(type) > 0) {
1831             /* add private use as a keyword */
1832             kwd = extPool.create();
1833             if (kwd == NULL) {
1834                 *status = U_MEMORY_ALLOCATION_ERROR;
1835             } else {
1836                 kwd->key = PRIVATEUSE_KEY;
1837                 kwd->value = type;
1838                 if (!_addExtensionToList(&kwdFirst, kwd, FALSE)) {
1839                     *status = U_ILLEGAL_ARGUMENT_ERROR;
1840                 }
1841             }
1842         }
1843     }
1844 
1845     /* If a POSIX variant was in the extensions, write it out before writing the keywords. */
1846 
1847     if (U_SUCCESS(*status) && posixVariant) {
1848         len = (int32_t) uprv_strlen(_POSIX);
1849         sink.Append(_POSIX, len);
1850     }
1851 
1852     if (U_SUCCESS(*status) && kwdFirst != NULL) {
1853         /* write out the sorted keywords */
1854         UBool firstValue = TRUE;
1855         kwd = kwdFirst;
1856         do {
1857             if (firstValue) {
1858                 sink.Append("@", 1);
1859                 firstValue = FALSE;
1860             } else {
1861                 sink.Append(";", 1);
1862             }
1863 
1864             /* key */
1865             len = (int32_t)uprv_strlen(kwd->key);
1866             sink.Append(kwd->key, len);
1867             sink.Append("=", 1);
1868 
1869             /* type */
1870             len = (int32_t)uprv_strlen(kwd->value);
1871             sink.Append(kwd->value, len);
1872 
1873             kwd = kwd->next;
1874         } while (kwd);
1875     }
1876 }
1877 
1878 static void
_appendPrivateuseToLanguageTag(const char * localeID,icu::ByteSink & sink,UBool strict,UBool hadPosix,UErrorCode * status)1879 _appendPrivateuseToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UBool hadPosix, UErrorCode* status) {
1880     (void)hadPosix;
1881     char buf[ULOC_FULLNAME_CAPACITY];
1882     char tmpAppend[ULOC_FULLNAME_CAPACITY];
1883     UErrorCode tmpStatus = U_ZERO_ERROR;
1884     int32_t len, i;
1885     int32_t reslen = 0;
1886     int32_t capacity = sizeof tmpAppend;
1887 
1888     if (U_FAILURE(*status)) {
1889         return;
1890     }
1891 
1892     len = uloc_getVariant(localeID, buf, sizeof(buf), &tmpStatus);
1893     if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
1894         if (strict) {
1895             *status = U_ILLEGAL_ARGUMENT_ERROR;
1896         }
1897         return;
1898     }
1899 
1900     if (len > 0) {
1901         char *p, *pPriv;
1902         UBool bNext = TRUE;
1903         UBool firstValue = TRUE;
1904         UBool writeValue;
1905 
1906         pPriv = NULL;
1907         p = buf;
1908         while (bNext) {
1909             writeValue = FALSE;
1910             if (*p == SEP || *p == LOCALE_SEP || *p == 0) {
1911                 if (*p == 0) {
1912                     bNext = FALSE;
1913                 } else {
1914                     *p = 0; /* terminate */
1915                 }
1916                 if (pPriv != NULL) {
1917                     /* Private use in the canonical format is lowercase in BCP47 */
1918                     for (i = 0; *(pPriv + i) != 0; i++) {
1919                         *(pPriv + i) = uprv_tolower(*(pPriv + i));
1920                     }
1921 
1922                     /* validate */
1923                     if (_isPrivateuseValueSubtag(pPriv, -1)) {
1924                         if (firstValue) {
1925                             if (!_isVariantSubtag(pPriv, -1)) {
1926                                 writeValue = TRUE;
1927                             }
1928                         } else {
1929                             writeValue = TRUE;
1930                         }
1931                     } else if (strict) {
1932                         *status = U_ILLEGAL_ARGUMENT_ERROR;
1933                         break;
1934                     } else {
1935                         break;
1936                     }
1937 
1938                     if (writeValue) {
1939                         if (reslen < capacity) {
1940                             tmpAppend[reslen++] = SEP;
1941                         }
1942 
1943                         if (firstValue) {
1944                             if (reslen < capacity) {
1945                                 tmpAppend[reslen++] = *PRIVATEUSE_KEY;
1946                             }
1947 
1948                             if (reslen < capacity) {
1949                                 tmpAppend[reslen++] = SEP;
1950                             }
1951 
1952                             len = (int32_t)uprv_strlen(PRIVUSE_VARIANT_PREFIX);
1953                             if (reslen < capacity) {
1954                                 uprv_memcpy(tmpAppend + reslen, PRIVUSE_VARIANT_PREFIX, uprv_min(len, capacity - reslen));
1955                             }
1956                             reslen += len;
1957 
1958                             if (reslen < capacity) {
1959                                 tmpAppend[reslen++] = SEP;
1960                             }
1961 
1962                             firstValue = FALSE;
1963                         }
1964 
1965                         len = (int32_t)uprv_strlen(pPriv);
1966                         if (reslen < capacity) {
1967                             uprv_memcpy(tmpAppend + reslen, pPriv, uprv_min(len, capacity - reslen));
1968                         }
1969                         reslen += len;
1970                     }
1971                 }
1972                 /* reset private use starting position */
1973                 pPriv = NULL;
1974             } else if (pPriv == NULL) {
1975                 pPriv = p;
1976             }
1977             p++;
1978         }
1979 
1980         if (U_FAILURE(*status)) {
1981             return;
1982         }
1983     }
1984 
1985     if (U_SUCCESS(*status)) {
1986         len = reslen;
1987         sink.Append(tmpAppend, len);
1988     }
1989 }
1990 
1991 /*
1992 * -------------------------------------------------
1993 *
1994 * ultag_ functions
1995 *
1996 * -------------------------------------------------
1997 */
1998 
/*
 * Bit flags used by the parser.
 *
 * ultag_parse() keeps a mask of these flags (its local variable "next").
 * A set bit means that a subtag of that kind is accepted at the current
 * position; the mask is replaced after each subtag that was accepted.
 */
#define LANG 0x0001     /* language subtag */
#define EXTL 0x0002     /* extlang subtag */
#define SCRT 0x0004     /* script subtag */
#define REGN 0x0008     /* region subtag */
#define VART 0x0010     /* variant subtag */
#define EXTS 0x0020     /* extension singleton (starts an extension) */
#define EXTV 0x0040     /* extension value subtag */
#define PRIV 0x0080     /* private use singleton (PRIVATEUSE) */
2009 /**
2010  * Ticket #12705 - The optimizer in Visual Studio 2015 Update 3 has problems optimizing this function.
2011  * As a work-around, optimization is disabled for this function on VS2015 and VS2017.
2012  * This work-around should be removed once the following versions of Visual Studio are no
2013  * longer supported: All versions of VS2015/VS2017, and versions of VS2019 below 16.4.
2014  */
2015 #if defined(_MSC_VER) && (_MSC_VER >= 1900) && (_MSC_VER < 1924)
2016 #pragma optimize( "", off )
2017 #endif
2018 
2019 static ULanguageTag*
ultag_parse(const char * tag,int32_t tagLen,int32_t * parsedLen,UErrorCode * status)2020 ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* status) {
2021     char *tagBuf;
2022     int16_t next;
2023     char *pSubtag, *pNext, *pLastGoodPosition;
2024     int32_t subtagLen;
2025     int32_t extlangIdx;
2026     ExtensionListEntry *pExtension;
2027     char *pExtValueSubtag, *pExtValueSubtagEnd;
2028     int32_t i;
2029     UBool privateuseVar = FALSE;
2030     int32_t legacyLen = 0;
2031 
2032     if (parsedLen != NULL) {
2033         *parsedLen = 0;
2034     }
2035 
2036     if (U_FAILURE(*status)) {
2037         return NULL;
2038     }
2039 
2040     if (tagLen < 0) {
2041         tagLen = (int32_t)uprv_strlen(tag);
2042     }
2043 
2044     /* copy the entire string */
2045     tagBuf = (char*)uprv_malloc(tagLen + 1);
2046     if (tagBuf == NULL) {
2047         *status = U_MEMORY_ALLOCATION_ERROR;
2048         return NULL;
2049     }
2050 
2051     if (tagLen > 0) {
2052         uprv_memcpy(tagBuf, tag, tagLen);
2053     }
2054     *(tagBuf + tagLen) = 0;
2055 
2056     /* create a ULanguageTag */
2057     icu::LocalULanguageTagPointer t(
2058             (ULanguageTag*)uprv_malloc(sizeof(ULanguageTag)));
2059     if (t.isNull()) {
2060         uprv_free(tagBuf);
2061         *status = U_MEMORY_ALLOCATION_ERROR;
2062         return NULL;
2063     }
2064     _initializeULanguageTag(t.getAlias());
2065     t->buf = tagBuf;
2066 
2067     if (tagLen < MINLEN) {
2068         /* the input tag is too short - return empty ULanguageTag */
2069         return t.orphan();
2070     }
2071 
2072     size_t parsedLenDelta = 0;
2073     // Legacy tag will be consider together. Legacy tag with intervening
2074     // script and region such as art-DE-lojban or art-Latn-lojban won't be
2075     // matched.
2076     /* check if the tag is legacy */
2077     for (i = 0; i < UPRV_LENGTHOF(LEGACY); i += 2) {
2078         int32_t checkLegacyLen = static_cast<int32_t>(uprv_strlen(LEGACY[i]));
2079         if (tagLen < checkLegacyLen) {
2080             continue;
2081         }
2082         if (tagLen > checkLegacyLen && tagBuf[checkLegacyLen] != '-') {
2083             // make sure next char is '-'.
2084             continue;
2085         }
2086         if (uprv_strnicmp(LEGACY[i], tagBuf, checkLegacyLen) == 0) {
2087             int32_t newTagLength;
2088 
2089             legacyLen = checkLegacyLen;  /* back up for output parsedLen */
2090             int32_t replacementLen = static_cast<int32_t>(uprv_strlen(LEGACY[i+1]));
2091             newTagLength = replacementLen + tagLen - checkLegacyLen;
2092             if (tagLen < newTagLength) {
2093                 uprv_free(tagBuf);
2094                 tagBuf = (char*)uprv_malloc(newTagLength + 1);
2095                 if (tagBuf == NULL) {
2096                     *status = U_MEMORY_ALLOCATION_ERROR;
2097                     return NULL;
2098                 }
2099                 t->buf = tagBuf;
2100                 tagLen = newTagLength;
2101             }
2102             parsedLenDelta = checkLegacyLen - replacementLen;
2103             uprv_strcpy(t->buf, LEGACY[i + 1]);
2104             if (checkLegacyLen != tagLen) {
2105                 uprv_strcpy(t->buf + replacementLen, tag + checkLegacyLen);
2106             }
2107             break;
2108         }
2109     }
2110 
2111     if (legacyLen == 0) {
2112         for (i = 0; i < UPRV_LENGTHOF(REDUNDANT); i += 2) {
2113             const char* redundantTag = REDUNDANT[i];
2114             size_t redundantTagLen = uprv_strlen(redundantTag);
2115             // The preferred tag for a redundant tag is always shorter than redundant
2116             // tag. A redundant tag may or may not be followed by other subtags.
2117             // (i.e. "zh-yue" or "zh-yue-u-co-pinyin").
2118             if (uprv_strnicmp(redundantTag, tagBuf, static_cast<uint32_t>(redundantTagLen)) == 0) {
2119                 const char* redundantTagEnd = tagBuf + redundantTagLen;
2120                 if (*redundantTagEnd  == '\0' || *redundantTagEnd == SEP) {
2121                     const char* preferredTag = REDUNDANT[i + 1];
2122                     size_t preferredTagLen = uprv_strlen(preferredTag);
2123                     uprv_strncpy(t->buf, preferredTag, preferredTagLen);
2124                     if (*redundantTagEnd == SEP) {
2125                         uprv_memmove(tagBuf + preferredTagLen,
2126                                      redundantTagEnd,
2127                                      tagLen - redundantTagLen + 1);
2128                     } else {
2129                         tagBuf[preferredTagLen] = '\0';
2130                     }
2131                     // parsedLen should be the length of the input
2132                     // before redundantTag is replaced by preferredTag.
2133                     // Save the delta to add it back later.
2134                     parsedLenDelta = redundantTagLen - preferredTagLen;
2135                     break;
2136                 }
2137             }
2138         }
2139     }
2140 
2141     /*
2142      * langtag      =   language
2143      *                  ["-" script]
2144      *                  ["-" region]
2145      *                  *("-" variant)
2146      *                  *("-" extension)
2147      *                  ["-" privateuse]
2148      */
2149 
2150     next = LANG | PRIV;
2151     pNext = pLastGoodPosition = tagBuf;
2152     extlangIdx = 0;
2153     pExtension = NULL;
2154     pExtValueSubtag = NULL;
2155     pExtValueSubtagEnd = NULL;
2156 
2157     while (pNext) {
2158         char *pSep;
2159 
2160         pSubtag = pNext;
2161 
2162         /* locate next separator char */
2163         pSep = pSubtag;
2164         while (*pSep) {
2165             if (*pSep == SEP) {
2166                 break;
2167             }
2168             pSep++;
2169         }
2170         if (*pSep == 0) {
2171             /* last subtag */
2172             pNext = NULL;
2173         } else {
2174             pNext = pSep + 1;
2175         }
2176         subtagLen = (int32_t)(pSep - pSubtag);
2177 
2178         if (next & LANG) {
2179             if (ultag_isLanguageSubtag(pSubtag, subtagLen)) {
2180                 *pSep = 0;  /* terminate */
2181                 // TODO: move deprecated language code handling here.
2182                 t->language = T_CString_toLowerCase(pSubtag);
2183 
2184                 pLastGoodPosition = pSep;
2185                 next = SCRT | REGN | VART | EXTS | PRIV;
2186                 if (subtagLen <= 3)
2187                   next |= EXTL;
2188                 continue;
2189             }
2190         }
2191         if (next & EXTL) {
2192             if (_isExtlangSubtag(pSubtag, subtagLen)) {
2193                 *pSep = 0;
2194                 t->extlang[extlangIdx++] = T_CString_toLowerCase(pSubtag);
2195 
2196                 pLastGoodPosition = pSep;
2197                 if (extlangIdx < 3) {
2198                     next = EXTL | SCRT | REGN | VART | EXTS | PRIV;
2199                 } else {
2200                     next = SCRT | REGN | VART | EXTS | PRIV;
2201                 }
2202                 continue;
2203             }
2204         }
2205         if (next & SCRT) {
2206             if (ultag_isScriptSubtag(pSubtag, subtagLen)) {
2207                 char *p = pSubtag;
2208 
2209                 *pSep = 0;
2210 
2211                 /* to title case */
2212                 *p = uprv_toupper(*p);
2213                 p++;
2214                 for (; *p; p++) {
2215                     *p = uprv_tolower(*p);
2216                 }
2217 
2218                 t->script = pSubtag;
2219 
2220                 pLastGoodPosition = pSep;
2221                 next = REGN | VART | EXTS | PRIV;
2222                 continue;
2223             }
2224         }
2225         if (next & REGN) {
2226             if (ultag_isRegionSubtag(pSubtag, subtagLen)) {
2227                 *pSep = 0;
2228                 // TODO: move deprecated region code handling here.
2229                 t->region = T_CString_toUpperCase(pSubtag);
2230 
2231                 pLastGoodPosition = pSep;
2232                 next = VART | EXTS | PRIV;
2233                 continue;
2234             }
2235         }
2236         if (next & VART) {
2237             if (_isVariantSubtag(pSubtag, subtagLen) ||
2238                (privateuseVar && _isPrivateuseVariantSubtag(pSubtag, subtagLen))) {
2239                 VariantListEntry *var;
2240                 UBool isAdded;
2241 
2242                 var = (VariantListEntry*)uprv_malloc(sizeof(VariantListEntry));
2243                 if (var == NULL) {
2244                     *status = U_MEMORY_ALLOCATION_ERROR;
2245                     return NULL;
2246                 }
2247                 *pSep = 0;
2248                 var->variant = T_CString_toUpperCase(pSubtag);
2249                 isAdded = _addVariantToList(&(t->variants), var);
2250                 if (!isAdded) {
2251                     /* duplicated variant entry */
2252                     uprv_free(var);
2253                     break;
2254                 }
2255                 pLastGoodPosition = pSep;
2256                 next = VART | EXTS | PRIV;
2257                 continue;
2258             }
2259         }
2260         if (next & EXTS) {
2261             if (_isExtensionSingleton(pSubtag, subtagLen)) {
2262                 if (pExtension != NULL) {
2263                     if (pExtValueSubtag == NULL || pExtValueSubtagEnd == NULL) {
2264                         /* the previous extension is incomplete */
2265                         uprv_free(pExtension);
2266                         pExtension = NULL;
2267                         break;
2268                     }
2269 
2270                     /* terminate the previous extension value */
2271                     *pExtValueSubtagEnd = 0;
2272                     pExtension->value = T_CString_toLowerCase(pExtValueSubtag);
2273 
2274                     /* insert the extension to the list */
2275                     if (_addExtensionToList(&(t->extensions), pExtension, FALSE)) {
2276                         pLastGoodPosition = pExtValueSubtagEnd;
2277                     } else {
2278                         /* stop parsing here */
2279                         uprv_free(pExtension);
2280                         pExtension = NULL;
2281                         break;
2282                     }
2283                 }
2284 
2285                 /* create a new extension */
2286                 pExtension = (ExtensionListEntry*)uprv_malloc(sizeof(ExtensionListEntry));
2287                 if (pExtension == NULL) {
2288                     *status = U_MEMORY_ALLOCATION_ERROR;
2289                     return NULL;
2290                 }
2291                 *pSep = 0;
2292                 pExtension->key = T_CString_toLowerCase(pSubtag);
2293                 pExtension->value = NULL;   /* will be set later */
2294 
2295                 /*
2296                  * reset the start and the end location of extension value
2297                  * subtags for this extension
2298                  */
2299                 pExtValueSubtag = NULL;
2300                 pExtValueSubtagEnd = NULL;
2301 
2302                 next = EXTV;
2303                 continue;
2304             }
2305         }
2306         if (next & EXTV) {
2307             if (_isExtensionSubtag(pSubtag, subtagLen)) {
2308                 if (pExtValueSubtag == NULL) {
2309                     /* if the start postion of this extension's value is not yet,
2310                         this one is the first value subtag */
2311                     pExtValueSubtag = pSubtag;
2312                 }
2313 
2314                 /* Mark the end of this subtag */
2315                 pExtValueSubtagEnd = pSep;
2316                 next = EXTS | EXTV | PRIV;
2317 
2318                 continue;
2319             }
2320         }
2321         if (next & PRIV) {
2322             if (uprv_tolower(*pSubtag) == PRIVATEUSE && subtagLen == 1) {
2323                 char *pPrivuseVal;
2324 
2325                 if (pExtension != NULL) {
2326                     /* Process the last extension */
2327                     if (pExtValueSubtag == NULL || pExtValueSubtagEnd == NULL) {
2328                         /* the previous extension is incomplete */
2329                         uprv_free(pExtension);
2330                         pExtension = NULL;
2331                         break;
2332                     } else {
2333                         /* terminate the previous extension value */
2334                         *pExtValueSubtagEnd = 0;
2335                         pExtension->value = T_CString_toLowerCase(pExtValueSubtag);
2336 
2337                         /* insert the extension to the list */
2338                         if (_addExtensionToList(&(t->extensions), pExtension, FALSE)) {
2339                             pLastGoodPosition = pExtValueSubtagEnd;
2340                             pExtension = NULL;
2341                         } else {
2342                         /* stop parsing here */
2343                             uprv_free(pExtension);
2344                             pExtension = NULL;
2345                             break;
2346                         }
2347                     }
2348                 }
2349 
2350                 /* The rest of part will be private use value subtags */
2351                 if (pNext == NULL) {
2352                     /* empty private use subtag */
2353                     break;
2354                 }
2355                 /* back up the private use value start position */
2356                 pPrivuseVal = pNext;
2357 
2358                 /* validate private use value subtags */
2359                 while (pNext) {
2360                     pSubtag = pNext;
2361                     pSep = pSubtag;
2362                     while (*pSep) {
2363                         if (*pSep == SEP) {
2364                             break;
2365                         }
2366                         pSep++;
2367                     }
2368                     if (*pSep == 0) {
2369                         /* last subtag */
2370                         pNext = NULL;
2371                     } else {
2372                         pNext = pSep + 1;
2373                     }
2374                     subtagLen = (int32_t)(pSep - pSubtag);
2375 
2376                     if (uprv_strncmp(pSubtag, PRIVUSE_VARIANT_PREFIX, uprv_strlen(PRIVUSE_VARIANT_PREFIX)) == 0) {
2377                         *pSep = 0;
2378                         next = VART;
2379                         privateuseVar = TRUE;
2380                         break;
2381                     } else if (_isPrivateuseValueSubtag(pSubtag, subtagLen)) {
2382                         pLastGoodPosition = pSep;
2383                     } else {
2384                         break;
2385                     }
2386                 }
2387 
2388                 if (next == VART) {
2389                     continue;
2390                 }
2391 
2392                 if (pLastGoodPosition - pPrivuseVal > 0) {
2393                     *pLastGoodPosition = 0;
2394                     t->privateuse = T_CString_toLowerCase(pPrivuseVal);
2395                 }
2396                 /* No more subtags, exiting the parse loop */
2397                 break;
2398             }
2399             break;
2400         }
2401 
2402         /* If we fell through here, it means this subtag is illegal - quit parsing */
2403         break;
2404     }
2405 
2406     if (pExtension != NULL) {
2407         /* Process the last extension */
2408         if (pExtValueSubtag == NULL || pExtValueSubtagEnd == NULL) {
2409             /* the previous extension is incomplete */
2410             uprv_free(pExtension);
2411         } else {
2412             /* terminate the previous extension value */
2413             *pExtValueSubtagEnd = 0;
2414             pExtension->value = T_CString_toLowerCase(pExtValueSubtag);
2415             /* insert the extension to the list */
2416             if (_addExtensionToList(&(t->extensions), pExtension, FALSE)) {
2417                 pLastGoodPosition = pExtValueSubtagEnd;
2418             } else {
2419                 uprv_free(pExtension);
2420             }
2421         }
2422     }
2423 
2424     if (parsedLen != NULL) {
2425         *parsedLen = (int32_t)(pLastGoodPosition - t->buf + parsedLenDelta);
2426     }
2427 
2428     return t.orphan();
2429 }
2430 
2431 // Ticket #12705 - Turn optimization back on.
2432 #if defined(_MSC_VER) && (_MSC_VER >= 1900) && (_MSC_VER < 1924)
2433 #pragma optimize( "", on )
2434 #endif
2435 
2436 static void
ultag_close(ULanguageTag * langtag)2437 ultag_close(ULanguageTag* langtag) {
2438 
2439     if (langtag == NULL) {
2440         return;
2441     }
2442 
2443     uprv_free(langtag->buf);
2444 
2445     if (langtag->variants) {
2446         VariantListEntry *curVar = langtag->variants;
2447         while (curVar) {
2448             VariantListEntry *nextVar = curVar->next;
2449             uprv_free(curVar);
2450             curVar = nextVar;
2451         }
2452     }
2453 
2454     if (langtag->extensions) {
2455         ExtensionListEntry *curExt = langtag->extensions;
2456         while (curExt) {
2457             ExtensionListEntry *nextExt = curExt->next;
2458             uprv_free(curExt);
2459             curExt = nextExt;
2460         }
2461     }
2462 
2463     uprv_free(langtag);
2464 }
2465 
/*
 * Returns the language field of a parsed tag.  The pointer is owned by
 * langtag; when ultag_parse() recognized a language subtag it is the lower
 * cased subtag stored in the tag's buffer.
 */
static const char*
ultag_getLanguage(const ULanguageTag* langtag) {
    return langtag->language;
}
2470 
/*
 * Compiled out (#if 0).
 * Returns the language of the tag, replaced by the second element of the
 * matching DEPRECATEDLANGS pair when the language equals the first element
 * of that pair; otherwise returns the language unchanged.
 */
#if 0
static const char*
ultag_getJDKLanguage(const ULanguageTag* langtag) {
    int32_t i;
    /* DEPRECATEDLANGS is scanned in steps of two entries until a NULL entry */
    for (i = 0; DEPRECATEDLANGS[i] != NULL; i += 2) {
        if (uprv_compareInvCharsAsAscii(DEPRECATEDLANGS[i], langtag->language) == 0) {
            return DEPRECATEDLANGS[i + 1];
        }
    }
    return langtag->language;
}
#endif
2483 
2484 static const char*
ultag_getExtlang(const ULanguageTag * langtag,int32_t idx)2485 ultag_getExtlang(const ULanguageTag* langtag, int32_t idx) {
2486     if (idx >= 0 && idx < MAXEXTLANG) {
2487         return langtag->extlang[idx];
2488     }
2489     return NULL;
2490 }
2491 
2492 static int32_t
ultag_getExtlangSize(const ULanguageTag * langtag)2493 ultag_getExtlangSize(const ULanguageTag* langtag) {
2494     int32_t size = 0;
2495     int32_t i;
2496     for (i = 0; i < MAXEXTLANG; i++) {
2497         if (langtag->extlang[i]) {
2498             size++;
2499         }
2500     }
2501     return size;
2502 }
2503 
/*
 * Returns the script field of a parsed tag.  The pointer is owned by
 * langtag; when ultag_parse() recognized a script subtag it is that subtag
 * in title case, stored in the tag's buffer.
 */
static const char*
ultag_getScript(const ULanguageTag* langtag) {
    return langtag->script;
}
2508 
/*
 * Returns the region field of a parsed tag.  The pointer is owned by
 * langtag; when ultag_parse() recognized a region subtag it is the upper
 * cased subtag stored in the tag's buffer.
 */
static const char*
ultag_getRegion(const ULanguageTag* langtag) {
    return langtag->region;
}
2513 
2514 static const char*
ultag_getVariant(const ULanguageTag * langtag,int32_t idx)2515 ultag_getVariant(const ULanguageTag* langtag, int32_t idx) {
2516     const char *var = NULL;
2517     VariantListEntry *cur = langtag->variants;
2518     int32_t i = 0;
2519     while (cur) {
2520         if (i == idx) {
2521             var = cur->variant;
2522             break;
2523         }
2524         cur = cur->next;
2525         i++;
2526     }
2527     return var;
2528 }
2529 
2530 static int32_t
ultag_getVariantsSize(const ULanguageTag * langtag)2531 ultag_getVariantsSize(const ULanguageTag* langtag) {
2532     int32_t size = 0;
2533     VariantListEntry *cur = langtag->variants;
2534     while (TRUE) {
2535         if (cur == NULL) {
2536             break;
2537         }
2538         size++;
2539         cur = cur->next;
2540     }
2541     return size;
2542 }
2543 
2544 static const char*
ultag_getExtensionKey(const ULanguageTag * langtag,int32_t idx)2545 ultag_getExtensionKey(const ULanguageTag* langtag, int32_t idx) {
2546     const char *key = NULL;
2547     ExtensionListEntry *cur = langtag->extensions;
2548     int32_t i = 0;
2549     while (cur) {
2550         if (i == idx) {
2551             key = cur->key;
2552             break;
2553         }
2554         cur = cur->next;
2555         i++;
2556     }
2557     return key;
2558 }
2559 
2560 static const char*
ultag_getExtensionValue(const ULanguageTag * langtag,int32_t idx)2561 ultag_getExtensionValue(const ULanguageTag* langtag, int32_t idx) {
2562     const char *val = NULL;
2563     ExtensionListEntry *cur = langtag->extensions;
2564     int32_t i = 0;
2565     while (cur) {
2566         if (i == idx) {
2567             val = cur->value;
2568             break;
2569         }
2570         cur = cur->next;
2571         i++;
2572     }
2573     return val;
2574 }
2575 
2576 static int32_t
ultag_getExtensionsSize(const ULanguageTag * langtag)2577 ultag_getExtensionsSize(const ULanguageTag* langtag) {
2578     int32_t size = 0;
2579     ExtensionListEntry *cur = langtag->extensions;
2580     while (TRUE) {
2581         if (cur == NULL) {
2582             break;
2583         }
2584         size++;
2585         cur = cur->next;
2586     }
2587     return size;
2588 }
2589 
/*
 * Returns the private use field of a parsed tag.  The pointer is owned by
 * langtag; when ultag_parse() accepted private use value subtags it is the
 * lower cased run of those subtags stored in the tag's buffer.
 */
static const char*
ultag_getPrivateUse(const ULanguageTag* langtag) {
    return langtag->privateuse;
}
2594 
/*
 * Compiled out (#if 0).
 * Returns the legacy field of the tag.
 */
#if 0
static const char*
ultag_getLegacy(const ULanguageTag* langtag) {
    return langtag->legacy;
}
#endif
2601 
2602 
2603 /*
2604 * -------------------------------------------------
2605 *
2606 * Locale/BCP47 conversion APIs, exposed as uloc_*
2607 *
2608 * -------------------------------------------------
2609 */
2610 U_CAPI int32_t U_EXPORT2
uloc_toLanguageTag(const char * localeID,char * langtag,int32_t langtagCapacity,UBool strict,UErrorCode * status)2611 uloc_toLanguageTag(const char* localeID,
2612                    char* langtag,
2613                    int32_t langtagCapacity,
2614                    UBool strict,
2615                    UErrorCode* status) {
2616     if (U_FAILURE(*status)) {
2617         return 0;
2618     }
2619 
2620     icu::CheckedArrayByteSink sink(langtag, langtagCapacity);
2621     ulocimp_toLanguageTag(localeID, sink, strict, status);
2622 
2623     int32_t reslen = sink.NumberOfBytesAppended();
2624 
2625     if (U_FAILURE(*status)) {
2626         return reslen;
2627     }
2628 
2629     if (sink.Overflowed()) {
2630         *status = U_BUFFER_OVERFLOW_ERROR;
2631     } else {
2632         u_terminateChars(langtag, langtagCapacity, reslen, status);
2633     }
2634 
2635     return reslen;
2636 }
2637 
2638 
2639 U_CAPI void U_EXPORT2
ulocimp_toLanguageTag(const char * localeID,icu::ByteSink & sink,UBool strict,UErrorCode * status)2640 ulocimp_toLanguageTag(const char* localeID,
2641                       icu::ByteSink& sink,
2642                       UBool strict,
2643                       UErrorCode* status) {
2644     icu::CharString canonical;
2645     int32_t reslen;
2646     UErrorCode tmpStatus = U_ZERO_ERROR;
2647     UBool hadPosix = FALSE;
2648     const char* pKeywordStart;
2649 
2650     /* Note: uloc_canonicalize returns "en_US_POSIX" for input locale ID "".  See #6835 */
2651     int32_t resultCapacity = static_cast<int32_t>(uprv_strlen(localeID));
2652     if (resultCapacity > 0) {
2653         char* buffer;
2654 
2655         for (;;) {
2656             buffer = canonical.getAppendBuffer(
2657                     /*minCapacity=*/resultCapacity,
2658                     /*desiredCapacityHint=*/resultCapacity,
2659                     resultCapacity,
2660                     tmpStatus);
2661 
2662             if (U_FAILURE(tmpStatus)) {
2663                 *status = tmpStatus;
2664                 return;
2665             }
2666 
2667             reslen =
2668                 uloc_canonicalize(localeID, buffer, resultCapacity, &tmpStatus);
2669 
2670             if (tmpStatus != U_BUFFER_OVERFLOW_ERROR) {
2671                 break;
2672             }
2673 
2674             resultCapacity = reslen;
2675             tmpStatus = U_ZERO_ERROR;
2676         }
2677 
2678         if (U_FAILURE(tmpStatus)) {
2679             *status = U_ILLEGAL_ARGUMENT_ERROR;
2680             return;
2681         }
2682 
2683         canonical.append(buffer, reslen, tmpStatus);
2684         if (tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
2685             tmpStatus = U_ZERO_ERROR;  // Terminators provided by CharString.
2686         }
2687 
2688         if (U_FAILURE(tmpStatus)) {
2689             *status = tmpStatus;
2690             return;
2691         }
2692     }
2693 
2694     /* For handling special case - private use only tag */
2695     pKeywordStart = locale_getKeywordsStart(canonical.data());
2696     if (pKeywordStart == canonical.data()) {
2697         int kwdCnt = 0;
2698         UBool done = FALSE;
2699 
2700         icu::LocalUEnumerationPointer kwdEnum(uloc_openKeywords(canonical.data(), &tmpStatus));
2701         if (U_SUCCESS(tmpStatus)) {
2702             kwdCnt = uenum_count(kwdEnum.getAlias(), &tmpStatus);
2703             if (kwdCnt == 1) {
2704                 const char *key;
2705                 int32_t len = 0;
2706 
2707                 key = uenum_next(kwdEnum.getAlias(), &len, &tmpStatus);
2708                 if (len == 1 && *key == PRIVATEUSE) {
2709                     icu::CharString buf;
2710                     {
2711                         icu::CharStringByteSink sink(&buf);
2712                         ulocimp_getKeywordValue(localeID, key, sink, &tmpStatus);
2713                     }
2714                     if (U_SUCCESS(tmpStatus)) {
2715                         if (ultag_isPrivateuseValueSubtags(buf.data(), buf.length())) {
2716                             /* return private use only tag */
2717                             sink.Append("und-x-", 6);
2718                             sink.Append(buf.data(), buf.length());
2719                             done = TRUE;
2720                         } else if (strict) {
2721                             *status = U_ILLEGAL_ARGUMENT_ERROR;
2722                             done = TRUE;
2723                         }
2724                         /* if not strict mode, then "und" will be returned */
2725                     } else {
2726                         *status = U_ILLEGAL_ARGUMENT_ERROR;
2727                         done = TRUE;
2728                     }
2729                 }
2730             }
2731             if (done) {
2732                 return;
2733             }
2734         }
2735     }
2736 
2737     _appendLanguageToLanguageTag(canonical.data(), sink, strict, status);
2738     _appendScriptToLanguageTag(canonical.data(), sink, strict, status);
2739     _appendRegionToLanguageTag(canonical.data(), sink, strict, status);
2740     _appendVariantsToLanguageTag(canonical.data(), sink, strict, &hadPosix, status);
2741     _appendKeywordsToLanguageTag(canonical.data(), sink, strict, hadPosix, status);
2742     _appendPrivateuseToLanguageTag(canonical.data(), sink, strict, hadPosix, status);
2743 }
2744 
2745 
2746 U_CAPI int32_t U_EXPORT2
uloc_forLanguageTag(const char * langtag,char * localeID,int32_t localeIDCapacity,int32_t * parsedLength,UErrorCode * status)2747 uloc_forLanguageTag(const char* langtag,
2748                     char* localeID,
2749                     int32_t localeIDCapacity,
2750                     int32_t* parsedLength,
2751                     UErrorCode* status) {
2752     if (U_FAILURE(*status)) {
2753         return 0;
2754     }
2755 
2756     icu::CheckedArrayByteSink sink(localeID, localeIDCapacity);
2757     ulocimp_forLanguageTag(langtag, -1, sink, parsedLength, status);
2758 
2759     int32_t reslen = sink.NumberOfBytesAppended();
2760 
2761     if (U_FAILURE(*status)) {
2762         return reslen;
2763     }
2764 
2765     if (sink.Overflowed()) {
2766         *status = U_BUFFER_OVERFLOW_ERROR;
2767     } else {
2768         u_terminateChars(localeID, localeIDCapacity, reslen, status);
2769     }
2770 
2771     return reslen;
2772 }
2773 
2774 
2775 U_CAPI void U_EXPORT2
ulocimp_forLanguageTag(const char * langtag,int32_t tagLen,icu::ByteSink & sink,int32_t * parsedLength,UErrorCode * status)2776 ulocimp_forLanguageTag(const char* langtag,
2777                        int32_t tagLen,
2778                        icu::ByteSink& sink,
2779                        int32_t* parsedLength,
2780                        UErrorCode* status) {
2781     UBool isEmpty = TRUE;
2782     const char *subtag, *p;
2783     int32_t len;
2784     int32_t i, n;
2785     UBool noRegion = TRUE;
2786 
2787     icu::LocalULanguageTagPointer lt(ultag_parse(langtag, tagLen, parsedLength, status));
2788     if (U_FAILURE(*status)) {
2789         return;
2790     }
2791 
2792     /* language */
2793     subtag = ultag_getExtlangSize(lt.getAlias()) > 0 ? ultag_getExtlang(lt.getAlias(), 0) : ultag_getLanguage(lt.getAlias());
2794     if (uprv_compareInvCharsAsAscii(subtag, LANG_UND) != 0) {
2795         len = (int32_t)uprv_strlen(subtag);
2796         if (len > 0) {
2797             sink.Append(subtag, len);
2798             isEmpty = FALSE;
2799         }
2800     }
2801 
2802     /* script */
2803     subtag = ultag_getScript(lt.getAlias());
2804     len = (int32_t)uprv_strlen(subtag);
2805     if (len > 0) {
2806         sink.Append("_", 1);
2807         isEmpty = FALSE;
2808 
2809         /* write out the script in title case */
2810         char c = uprv_toupper(*subtag);
2811         sink.Append(&c, 1);
2812         sink.Append(subtag + 1, len - 1);
2813     }
2814 
2815     /* region */
2816     subtag = ultag_getRegion(lt.getAlias());
2817     len = (int32_t)uprv_strlen(subtag);
2818     if (len > 0) {
2819         sink.Append("_", 1);
2820         isEmpty = FALSE;
2821 
2822         /* write out the region in upper case */
2823         p = subtag;
2824         while (*p) {
2825             char c = uprv_toupper(*p);
2826             sink.Append(&c, 1);
2827             p++;
2828         }
2829         noRegion = FALSE;
2830     }
2831 
2832     /* variants */
2833     _sortVariants(lt.getAlias()->variants);
2834     n = ultag_getVariantsSize(lt.getAlias());
2835     if (n > 0) {
2836         if (noRegion) {
2837             sink.Append("_", 1);
2838             isEmpty = FALSE;
2839         }
2840 
2841         for (i = 0; i < n; i++) {
2842             subtag = ultag_getVariant(lt.getAlias(), i);
2843             sink.Append("_", 1);
2844 
2845             /* write out the variant in upper case */
2846             p = subtag;
2847             while (*p) {
2848                 char c = uprv_toupper(*p);
2849                 sink.Append(&c, 1);
2850                 p++;
2851             }
2852         }
2853     }
2854 
2855     /* keywords */
2856     n = ultag_getExtensionsSize(lt.getAlias());
2857     subtag = ultag_getPrivateUse(lt.getAlias());
2858     if (n > 0 || uprv_strlen(subtag) > 0) {
2859         if (isEmpty && n > 0) {
2860             /* need a language */
2861             sink.Append(LANG_UND, LANG_UND_LEN);
2862         }
2863         _appendKeywords(lt.getAlias(), sink, status);
2864     }
2865 }
2866