1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (C) 2009-2015, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 */
9
10 #include "unicode/bytestream.h"
11 #include "unicode/utypes.h"
12 #include "unicode/ures.h"
13 #include "unicode/localpointer.h"
14 #include "unicode/putil.h"
15 #include "unicode/uenum.h"
16 #include "unicode/uloc.h"
17 #include "ustr_imp.h"
18 #include "bytesinkutil.h"
19 #include "charstr.h"
20 #include "cmemory.h"
21 #include "cstring.h"
22 #include "putilimp.h"
23 #include "uinvchar.h"
24 #include "ulocimp.h"
25 #include "uassert.h"
26
27
/*
 * Node of a singly linked list of variant subtags.
 * The list helpers in this file store only the `variant` pointer; they do
 * not copy the text it points to. `next` is NULL on the last node.
 */
typedef struct VariantListEntry {
    const char              *variant;   /* NUL-terminated variant subtag text */
    struct VariantListEntry *next;      /* following node, or NULL */
} VariantListEntry;
33
/*
 * Node of a singly linked list holding one attribute value.
 * _addAttributeToList() keeps such a list sorted and free of duplicates.
 * Derives from icu::UMemory (presumably so nodes use ICU's allocation
 * operators -- confirm against the allocation sites).
 */
struct AttributeListEntry : public icu::UMemory {
    const char              *attribute; /* NUL-terminated attribute text */
    struct AttributeListEntry *next;    /* following node, or NULL */
};
39
/*
 * Node of a singly linked list holding one extension as a key/value pair.
 * _addExtensionToList() orders the list by `key` and rejects duplicate keys.
 */
struct ExtensionListEntry : public icu::UMemory {
    const char                  *key;   /* extension key; a one-character key is a singleton */
    const char                  *value; /* value text associated with the key */
    struct ExtensionListEntry   *next;  /* following node, or NULL */
};
46
/* Maximum number of extlang subtags stored per tag (the extlang grammar allows up to three). */
#define MAXEXTLANG 3
/*
 * Parsed form of a language tag.
 * _initializeULanguageTag() sets every string field to EMPTY, every extlang
 * slot to NULL, and both list heads to NULL.
 */
typedef struct ULanguageTag {
    char                *buf;   /* holding parsed subtags */
    const char          *language;              /* language subtag, or EMPTY */
    const char          *extlang[MAXEXTLANG];   /* extlang subtags; unused slots are NULL */
    const char          *script;                /* script subtag, or EMPTY */
    const char          *region;                /* region subtag, or EMPTY */
    VariantListEntry    *variants;              /* head of the variant list, or NULL */
    ExtensionListEntry  *extensions;            /* head of the extension list, or NULL */
    const char          *privateuse;            /* private use part, or EMPTY */
    const char          *legacy;                /* legacy tag text, or EMPTY -- NOTE(review): exact content set by the parser, not shown here */
} ULanguageTag;
59
#define MINLEN 2
/* Separator between subtags of a BCP 47 language tag. */
#define SEP '-'
/* Singleton that introduces the private use part; excluded from extension singletons. */
#define PRIVATEUSE 'x'
/* Singleton of the Unicode locale ("u") extension. */
#define LDMLEXT 'u'

/* Separators used in ICU locale IDs (as opposed to BCP 47 tags). */
#define LOCALE_SEP '_'
#define LOCALE_EXT_SEP '@'
#define LOCALE_KEYWORD_SEP ';'
#define LOCALE_KEY_TYPE_SEP '='

/* ASCII-only character classification used by the subtag validators. */
#define ISALPHA(c) uprv_isASCIILetter(c)
#define ISNUMERIC(c) ((c)>='0' && (c)<='9')

/* Shared string constants. EMPTY is the default for unset ULanguageTag fields. */
static const char EMPTY[] = "";
/* Language subtag emitted when a locale has no usable language. */
static const char LANG_UND[] = "und";
static const char PRIVATEUSE_KEY[] = "x";
static const char _POSIX[] = "_POSIX";
static const char POSIX_KEY[] = "va";
/* Lower-cased variant that gets special POSIX handling instead of being emitted as a variant. */
static const char POSIX_VALUE[] = "posix";
/* Extension key that _addExtensionToList() sorts ahead of the other multi-character keys. */
static const char LOCALE_ATTRIBUTE_KEY[] = "attribute";
static const char PRIVUSE_VARIANT_PREFIX[] = "lvariant";
static const char LOCALE_TYPE_YES[] = "yes";

/* Length of LANG_UND without its terminating NUL. */
#define LANG_UND_LEN 3
84
85 /*
86 Updated on 2018-09-12 from
87 https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry .
88
89 This table has 2 parts. The part for
90 legacy language tags (marked as “Type: grandfathered” in BCP 47)
91 is generated by the following scripts from the IANA language tag registry.
92
93 curl https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry |\
94 egrep -A 7 'Type: grandfathered' | \
95 egrep 'Tag|Prefe' | grep -B1 'Preferred' | grep -v '^--' | \
96 awk -n '/Tag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}' |\
97 tr 'A-Z' 'a-z'
98
99
The 2nd part is made of four ICU-specific entries. They're kept for
101 the backward compatibility for now, even though there are no preferred
102 values. They may have to be removed for the strict BCP 47 compliance.
103
104 */
/*
 * Flat list of string pairs: every legacy tag is immediately followed by
 * the tag it is replaced with, so even indices hold the legacy tag and the
 * following odd index holds its replacement. All entries are lower case.
 */
static const char* const LEGACY[] = {
    /*  legacy          preferred */
    "art-lojban",   "jbo",
    "en-gb-oed",    "en-gb-oxendict",
    "i-ami",        "ami",
    "i-bnn",        "bnn",
    "i-hak",        "hak",
    "i-klingon",    "tlh",
    "i-lux",        "lb",
    "i-navajo",     "nv",
    "i-pwn",        "pwn",
    "i-tao",        "tao",
    "i-tay",        "tay",
    "i-tsu",        "tsu",
    "no-bok",       "nb",
    "no-nyn",       "nn",
    "sgn-be-fr",    "sfb",
    "sgn-be-nl",    "vgt",
    "sgn-ch-de",    "sgg",
    "zh-guoyu",     "cmn",
    "zh-hakka",     "hak",
    "zh-min-nan",   "nan",
    "zh-xiang",     "hsn",

    // Legacy tags with no preferred value in the IANA
    // registry. Kept for now for the backward compatibility
    // because ICU has mapped them this way.
    "i-default",    "en-x-i-default",
    "i-enochian",   "und-x-i-enochian",
    "i-mingo",      "see-x-i-mingo",
    "zh-min",       "nan-x-zh-min",
};
137
138 /*
139 Updated on 2018-09-12 from
140 https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry .
141
The table lists redundant tags with preferred value in the IANA language tag registry.
143 It's generated with the following command:
144
145 curl https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry |\
146 grep 'Type: redundant' -A 5 | egrep '^(Tag:|Prefer)' | grep -B1 'Preferred' | \
147 awk -n '/Tag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}' | \
148 tr 'A-Z' 'a-z'
149
150 In addition, ja-latn-hepburn-heploc is mapped to ja-latn-alalc97 because
a variant tag 'hepburn-heploc' has the preferred subtag, 'alalc97'.
152 */
153
/*
 * Flat list of string pairs: every redundant tag is immediately followed by
 * its preferred replacement (even index = redundant tag, next odd index =
 * preferred value). All entries are lower case.
 */
static const char* const REDUNDANT[] = {
    //  redundant       preferred
    "sgn-br",       "bzs",
    "sgn-co",       "csn",
    "sgn-de",       "gsg",
    "sgn-dk",       "dsl",
    "sgn-es",       "ssp",
    "sgn-fr",       "fsl",
    "sgn-gb",       "bfi",
    "sgn-gr",       "gss",
    "sgn-ie",       "isg",
    "sgn-it",       "ise",
    "sgn-jp",       "jsl",
    "sgn-mx",       "mfs",
    "sgn-ni",       "ncs",
    "sgn-nl",       "dse",
    "sgn-no",       "nsl",
    "sgn-pt",       "psr",
    "sgn-se",       "swl",
    "sgn-us",       "ase",
    "sgn-za",       "sfs",
    "zh-cmn",       "cmn",
    "zh-cmn-hans",  "cmn-hans",
    "zh-cmn-hant",  "cmn-hant",
    "zh-gan",       "gan",
    "zh-wuu",       "wuu",
    "zh-yue",       "yue",

    // variant tag with preferred value
    "ja-latn-hepburn-heploc", "ja-latn-alalc97",
};
185
186 /*
187 Updated on 2018-09-12 from
188 https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry .
189
190 grep 'Type: language' -A 7 language-subtag-registry | egrep 'Subtag|Prefe' | \
191 grep -B1 'Preferred' | grep -v '^--' | \
192 awk -n '/Subtag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}'
193
194 Make sure that 2-letter language subtags come before 3-letter subtags.
195 */
/*
 * Flat list of pairs: even index = deprecated language subtag, following odd
 * index = its replacement. Each element is a char[4], i.e. up to three
 * letters plus the terminating NUL.
 * _appendLanguageToLanguageTag() stops scanning as soon as it reaches an
 * entry longer than its input, which is why all 2-letter codes must stay
 * ahead of the 3-letter ones.
 */
static const char DEPRECATEDLANGS[][4] = {
    /*  deprecated  new */
    "in",       "id",
    "iw",       "he",
    "ji",       "yi",
    "jw",       "jv",
    "mo",       "ro",
    "aam",      "aas",
    "adp",      "dz",
    "aue",      "ktz",
    "ayx",      "nun",
    "bgm",      "bcg",
    "bjd",      "drl",
    "ccq",      "rki",
    "cjr",      "mom",
    "cka",      "cmr",
    "cmk",      "xch",
    "coy",      "pij",
    "cqu",      "quh",
    "drh",      "khk",
    "drw",      "prs",
    "gav",      "dev",
    "gfx",      "vaj",
    "ggn",      "gvr",
    "gti",      "nyc",
    "guv",      "duz",
    "hrr",      "jal",
    "ibi",      "opa",
    "ilw",      "gal",
    "jeg",      "oyb",
    "kgc",      "tdf",
    "kgh",      "kml",
    "koj",      "kwv",
    "krm",      "bmf",
    "ktr",      "dtp",
    "kvs",      "gdj",
    "kwq",      "yam",
    "kxe",      "tvd",
    "kzj",      "dtp",
    "kzt",      "dtp",
    "lii",      "raq",
    "lmm",      "rmx",
    "meg",      "cir",
    "mst",      "mry",
    "mwj",      "vaj",
    "myt",      "mry",
    "nad",      "xny",
    "ncp",      "kdz",
    "nnx",      "ngv",
    "nts",      "pij",
    "oun",      "vaj",
    "pcr",      "adx",
    "pmc",      "huw",
    "pmu",      "phr",
    "ppa",      "bfy",
    "ppr",      "lcq",
    "pry",      "prt",
    "puz",      "pub",
    "sca",      "hle",
    "skk",      "oyb",
    "tdu",      "dtp",
    "thc",      "tpo",
    "thx",      "oyb",
    "tie",      "ras",
    "tkk",      "twm",
    "tlw",      "weo",
    "tmp",      "tyj",
    "tne",      "kak",
    "tnf",      "prs",
    "tsf",      "taj",
    "uok",      "ema",
    "xba",      "cax",
    "xia",      "acn",
    "xkh",      "waw",
    "xsj",      "suj",
    "ybd",      "rki",
    "yma",      "lrr",
    "ymt",      "mtm",
    "yos",      "zom",
    "yuu",      "yug",
};
277
278 /*
279 Updated on 2018-04-24 from
280
281 curl https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry | \
282 grep 'Type: region' -A 7 | egrep 'Subtag|Prefe' | \
283 grep -B1 'Preferred' | \
284 awk -n '/Subtag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}'
285 */
/*
 * Flat list of pairs: even index = deprecated region subtag, following odd
 * index = its replacement. Each element is a char[3], i.e. two letters plus
 * the terminating NUL. Scanned linearly by _appendRegionToLanguageTag().
 */
static const char DEPRECATEDREGIONS[][3] = {
    /*  deprecated  new */
    "BU",       "MM",
    "DD",       "DE",
    "FX",       "FR",
    "TP",       "TL",
    "YD",       "YE",
    "ZR",       "CD",
};
295
296 /*
297 * -------------------------------------------------
298 *
299 * These ultag_ functions may be exposed as APIs later
300 *
301 * -------------------------------------------------
302 */
303
/*
 * Forward declarations of the file-local ultag_* functions, so that code
 * placed before their definitions (for example LocalULanguageTagPointer,
 * which names ultag_close as its closer) can refer to them.
 * NOTE(review): the definitions are outside the part of the file reviewed
 * here; the per-function remarks below are inferred from the signatures.
 */

/* Parses `tag` (tagLen chars); presumably reports the consumed length via parsedLen. */
static ULanguageTag*
ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* status);

/* Releases a ULanguageTag; used as the closer of LocalULanguageTagPointer. */
static void
ultag_close(ULanguageTag* langtag);

/* Field accessors for a parsed ULanguageTag. */
static const char*
ultag_getLanguage(const ULanguageTag* langtag);

/* Compiled out. */
#if 0
static const char*
ultag_getJDKLanguage(const ULanguageTag* langtag);
#endif

static const char*
ultag_getExtlang(const ULanguageTag* langtag, int32_t idx);

static int32_t
ultag_getExtlangSize(const ULanguageTag* langtag);

static const char*
ultag_getScript(const ULanguageTag* langtag);

static const char*
ultag_getRegion(const ULanguageTag* langtag);

static const char*
ultag_getVariant(const ULanguageTag* langtag, int32_t idx);

static int32_t
ultag_getVariantsSize(const ULanguageTag* langtag);

static const char*
ultag_getExtensionKey(const ULanguageTag* langtag, int32_t idx);

static const char*
ultag_getExtensionValue(const ULanguageTag* langtag, int32_t idx);

static int32_t
ultag_getExtensionsSize(const ULanguageTag* langtag);

static const char*
ultag_getPrivateUse(const ULanguageTag* langtag);

/* Compiled out. */
#if 0
static const char*
ultag_getLegacy(const ULanguageTag* langtag);
#endif
352
U_NAMESPACE_BEGIN

/**
 * \class LocalULanguageTagPointer
 * "Smart pointer" class, closes a ULanguageTag via ultag_close().
 * For most methods see the LocalPointerBase base class.
 *
 * The class is generated by U_DEFINE_LOCAL_OPEN_POINTER and is declared
 * between U_NAMESPACE_BEGIN and U_NAMESPACE_END.
 *
 * @see LocalPointerBase
 * @see LocalPointer
 * @internal
 */
U_DEFINE_LOCAL_OPEN_POINTER(LocalULanguageTagPointer, ULanguageTag, ultag_close);

U_NAMESPACE_END
367
368 /*
369 * -------------------------------------------------
370 *
371 * Language subtag syntax validation functions
372 *
373 * -------------------------------------------------
374 */
375
376 static UBool
_isAlphaString(const char * s,int32_t len)377 _isAlphaString(const char* s, int32_t len) {
378 int32_t i;
379 for (i = 0; i < len; i++) {
380 if (!ISALPHA(*(s + i))) {
381 return FALSE;
382 }
383 }
384 return TRUE;
385 }
386
387 static UBool
_isNumericString(const char * s,int32_t len)388 _isNumericString(const char* s, int32_t len) {
389 int32_t i;
390 for (i = 0; i < len; i++) {
391 if (!ISNUMERIC(*(s + i))) {
392 return FALSE;
393 }
394 }
395 return TRUE;
396 }
397
398 static UBool
_isAlphaNumericString(const char * s,int32_t len)399 _isAlphaNumericString(const char* s, int32_t len) {
400 int32_t i;
401 for (i = 0; i < len; i++) {
402 if (!ISALPHA(*(s + i)) && !ISNUMERIC(*(s + i))) {
403 return FALSE;
404 }
405 }
406 return TRUE;
407 }
408
409 static UBool
_isAlphaNumericStringLimitedLength(const char * s,int32_t len,int32_t min,int32_t max)410 _isAlphaNumericStringLimitedLength(const char* s, int32_t len, int32_t min, int32_t max) {
411 if (len < 0) {
412 len = (int32_t)uprv_strlen(s);
413 }
414 if (len >= min && len <= max && _isAlphaNumericString(s, len)) {
415 return TRUE;
416 }
417 return FALSE;
418 }
419
420 U_CFUNC UBool
ultag_isLanguageSubtag(const char * s,int32_t len)421 ultag_isLanguageSubtag(const char* s, int32_t len) {
422 /*
423 * unicode_language_subtag = alpha{2,3} | alpha{5,8};
424 * NOTE: Per ICUTC 2019/01/23- accepting alpha 4
425 * See ICU-20372
426 */
427 if (len < 0) {
428 len = (int32_t)uprv_strlen(s);
429 }
430 if (len >= 2 && len <= 8 && _isAlphaString(s, len)) {
431 return TRUE;
432 }
433 return FALSE;
434 }
435
436 static UBool
_isExtlangSubtag(const char * s,int32_t len)437 _isExtlangSubtag(const char* s, int32_t len) {
438 /*
439 * extlang = 3ALPHA ; selected ISO 639 codes
440 * *2("-" 3ALPHA) ; permanently reserved
441 */
442 if (len < 0) {
443 len = (int32_t)uprv_strlen(s);
444 }
445 if (len == 3 && _isAlphaString(s, len)) {
446 return TRUE;
447 }
448 return FALSE;
449 }
450
451 U_CFUNC UBool
ultag_isScriptSubtag(const char * s,int32_t len)452 ultag_isScriptSubtag(const char* s, int32_t len) {
453 /*
454 * script = 4ALPHA ; ISO 15924 code
455 */
456 if (len < 0) {
457 len = (int32_t)uprv_strlen(s);
458 }
459 if (len == 4 && _isAlphaString(s, len)) {
460 return TRUE;
461 }
462 return FALSE;
463 }
464
465 U_CFUNC UBool
ultag_isRegionSubtag(const char * s,int32_t len)466 ultag_isRegionSubtag(const char* s, int32_t len) {
467 /*
468 * region = 2ALPHA ; ISO 3166-1 code
469 * / 3DIGIT ; UN M.49 code
470 */
471 if (len < 0) {
472 len = (int32_t)uprv_strlen(s);
473 }
474 if (len == 2 && _isAlphaString(s, len)) {
475 return TRUE;
476 }
477 if (len == 3 && _isNumericString(s, len)) {
478 return TRUE;
479 }
480 return FALSE;
481 }
482
483 static UBool
_isVariantSubtag(const char * s,int32_t len)484 _isVariantSubtag(const char* s, int32_t len) {
485 /*
486 * variant = 5*8alphanum ; registered variants
487 * / (DIGIT 3alphanum)
488 */
489 if (len < 0) {
490 len = (int32_t)uprv_strlen(s);
491 }
492 if (_isAlphaNumericStringLimitedLength(s, len, 5, 8)) {
493 return TRUE;
494 }
495 if (len == 4 && ISNUMERIC(*s) && _isAlphaNumericString(s + 1, 3)) {
496 return TRUE;
497 }
498 return FALSE;
499 }
500
501 static UBool
_isSepListOf(UBool (* test)(const char *,int32_t),const char * s,int32_t len)502 _isSepListOf(UBool (*test)(const char*, int32_t), const char* s, int32_t len) {
503 const char *p = s;
504 const char *pSubtag = NULL;
505
506 if (len < 0) {
507 len = (int32_t)uprv_strlen(s);
508 }
509
510 while ((p - s) < len) {
511 if (*p == SEP) {
512 if (pSubtag == NULL) {
513 return FALSE;
514 }
515 if (!test(pSubtag, (int32_t)(p - pSubtag))) {
516 return FALSE;
517 }
518 pSubtag = NULL;
519 } else if (pSubtag == NULL) {
520 pSubtag = p;
521 }
522 p++;
523 }
524 if (pSubtag == NULL) {
525 return FALSE;
526 }
527 return test(pSubtag, (int32_t)(p - pSubtag));
528 }
529
530 U_CFUNC UBool
ultag_isVariantSubtags(const char * s,int32_t len)531 ultag_isVariantSubtags(const char* s, int32_t len) {
532 return _isSepListOf(&_isVariantSubtag, s, len);
533 }
534
535 // This is for the ICU-specific "lvariant" handling.
536 static UBool
_isPrivateuseVariantSubtag(const char * s,int32_t len)537 _isPrivateuseVariantSubtag(const char* s, int32_t len) {
538 /*
539 * variant = 1*8alphanum ; registered variants
540 * / (DIGIT 3alphanum)
541 */
542 return _isAlphaNumericStringLimitedLength(s, len , 1, 8);
543 }
544
545 static UBool
_isExtensionSingleton(const char * s,int32_t len)546 _isExtensionSingleton(const char* s, int32_t len) {
547 /*
548 * extension = singleton 1*("-" (2*8alphanum))
549 *
550 * singleton = DIGIT ; 0 - 9
551 * / %x41-57 ; A - W
552 * / %x59-5A ; Y - Z
553 * / %x61-77 ; a - w
554 * / %x79-7A ; y - z
555 */
556 if (len < 0) {
557 len = (int32_t)uprv_strlen(s);
558 }
559 if (len == 1 && (ISALPHA(*s) || ISNUMERIC(*s)) && (uprv_tolower(*s) != PRIVATEUSE)) {
560 return TRUE;
561 }
562 return FALSE;
563 }
564
565 static UBool
_isExtensionSubtag(const char * s,int32_t len)566 _isExtensionSubtag(const char* s, int32_t len) {
567 /*
568 * extension = singleton 1*("-" (2*8alphanum))
569 */
570 return _isAlphaNumericStringLimitedLength(s, len, 2, 8);
571 }
572
573 U_CFUNC UBool
ultag_isExtensionSubtags(const char * s,int32_t len)574 ultag_isExtensionSubtags(const char* s, int32_t len) {
575 return _isSepListOf(&_isExtensionSubtag, s, len);
576 }
577
578 static UBool
_isPrivateuseValueSubtag(const char * s,int32_t len)579 _isPrivateuseValueSubtag(const char* s, int32_t len) {
580 /*
581 * privateuse = "x" 1*("-" (1*8alphanum))
582 */
583 return _isAlphaNumericStringLimitedLength(s, len, 1, 8);
584 }
585
586 U_CFUNC UBool
ultag_isPrivateuseValueSubtags(const char * s,int32_t len)587 ultag_isPrivateuseValueSubtags(const char* s, int32_t len) {
588 return _isSepListOf(&_isPrivateuseValueSubtag, s, len);
589 }
590
591 U_CFUNC UBool
ultag_isUnicodeLocaleAttribute(const char * s,int32_t len)592 ultag_isUnicodeLocaleAttribute(const char* s, int32_t len) {
593 /*
594 * attribute = alphanum{3,8} ;
595 */
596 return _isAlphaNumericStringLimitedLength(s, len , 3, 8);
597 }
598
599 U_CFUNC UBool
ultag_isUnicodeLocaleAttributes(const char * s,int32_t len)600 ultag_isUnicodeLocaleAttributes(const char* s, int32_t len) {
601 return _isSepListOf(&ultag_isUnicodeLocaleAttribute, s, len);
602 }
603
604 U_CFUNC UBool
ultag_isUnicodeLocaleKey(const char * s,int32_t len)605 ultag_isUnicodeLocaleKey(const char* s, int32_t len) {
606 /*
607 * key = alphanum alpha ;
608 */
609 if (len < 0) {
610 len = (int32_t)uprv_strlen(s);
611 }
612 if (len == 2 && (ISALPHA(*s) || ISNUMERIC(*s)) && ISALPHA(s[1])) {
613 return TRUE;
614 }
615 return FALSE;
616 }
617
618 U_CFUNC UBool
_isUnicodeLocaleTypeSubtag(const char * s,int32_t len)619 _isUnicodeLocaleTypeSubtag(const char*s, int32_t len) {
620 /*
621 * alphanum{3,8}
622 */
623 return _isAlphaNumericStringLimitedLength(s, len , 3, 8);
624 }
625
626 U_CFUNC UBool
ultag_isUnicodeLocaleType(const char * s,int32_t len)627 ultag_isUnicodeLocaleType(const char*s, int32_t len) {
628 /*
629 * type = alphanum{3,8} (sep alphanum{3,8})* ;
630 */
631 return _isSepListOf(&_isUnicodeLocaleTypeSubtag, s, len);
632 }
633
634 static UBool
_isTKey(const char * s,int32_t len)635 _isTKey(const char* s, int32_t len)
636 {
637 /*
638 * tkey = alpha digit ;
639 */
640 if (len < 0) {
641 len = (int32_t)uprv_strlen(s);
642 }
643 if (len == 2 && ISALPHA(*s) && ISNUMERIC(*(s + 1))) {
644 return TRUE;
645 }
646 return FALSE;
647 }
648
649 U_CAPI const char * U_EXPORT2
ultag_getTKeyStart(const char * localeID)650 ultag_getTKeyStart(const char *localeID) {
651 const char *result = localeID;
652 const char *sep;
653 while((sep = uprv_strchr(result, SEP)) != nullptr) {
654 if (_isTKey(result, static_cast<int32_t>(sep - result))) {
655 return result;
656 }
657 result = ++sep;
658 }
659 if (_isTKey(result, -1)) {
660 return result;
661 }
662 return nullptr;
663 }
664
665 static UBool
_isTValue(const char * s,int32_t len)666 _isTValue(const char* s, int32_t len)
667 {
668 /*
669 * tvalue = (sep alphanum{3,8})+ ;
670 */
671 return _isAlphaNumericStringLimitedLength(s, len , 3, 8);
672 }
673
674 static UBool
_isTransformedExtensionSubtag(int32_t & state,const char * s,int32_t len)675 _isTransformedExtensionSubtag(int32_t& state, const char* s, int32_t len)
676 {
677 const int32_t kStart = 0; // Start, wait for unicode_language_subtag, tkey or end
678 const int32_t kGotLanguage = 1; // Got unicode_language_subtag, wait for unicode_script_subtag,
679 // unicode_region_subtag, unicode_variant_subtag, tkey or end
680 const int32_t kGotScript = 2; // Got unicode_script_subtag, wait for unicode_region_subtag,
681 // unicode_variant_subtag, tkey, or end
682 const int32_t kGotRegion = 3; // Got unicode_region_subtag, wait for unicode_variant_subtag,
683 // tkey, or end.
684 const int32_t kGotVariant = 4; // Got unicode_variant_subtag, wait for unicode_variant_subtag
685 // tkey or end.
686 const int32_t kGotTKey = -1; // Got tkey, wait for tvalue. ERROR if stop here.
687 const int32_t kGotTValue = 6; // Got tvalue, wait for tkey, tvalue or end
688
689
690 if (len < 0) {
691 len = (int32_t)uprv_strlen(s);
692 }
693 switch (state) {
694 case kStart:
695 if (ultag_isLanguageSubtag(s, len) && len != 4) {
696 state = kGotLanguage;
697 return TRUE;
698 }
699 if (_isTKey(s, len)) {
700 state = kGotTKey;
701 return TRUE;
702 }
703 return FALSE;
704 case kGotLanguage:
705 if (ultag_isScriptSubtag(s, len)) {
706 state = kGotScript;
707 return TRUE;
708 }
709 U_FALLTHROUGH;
710 case kGotScript:
711 if (ultag_isRegionSubtag(s, len)) {
712 state = kGotRegion;
713 return TRUE;
714 }
715 U_FALLTHROUGH;
716 case kGotRegion:
717 U_FALLTHROUGH;
718 case kGotVariant:
719 if (_isVariantSubtag(s, len)) {
720 state = kGotVariant;
721 return TRUE;
722 }
723 if (_isTKey(s, len)) {
724 state = kGotTKey;
725 return TRUE;
726 }
727 return FALSE;
728 case kGotTKey:
729 if (_isTValue(s, len)) {
730 state = kGotTValue;
731 return TRUE;
732 }
733 return FALSE;
734 case kGotTValue:
735 if (_isTKey(s, len)) {
736 state = kGotTKey;
737 return TRUE;
738 }
739 if (_isTValue(s, len)) {
740 return TRUE;
741 }
742 return FALSE;
743 }
744 return FALSE;
745 }
746
747 static UBool
_isUnicodeExtensionSubtag(int32_t & state,const char * s,int32_t len)748 _isUnicodeExtensionSubtag(int32_t& state, const char* s, int32_t len)
749 {
750 const int32_t kStart = 0; // Start, wait for a key or attribute or end
751 const int32_t kGotKey = 1; // Got a key, wait for type or key or end
752 const int32_t kGotType = 2; // Got a type, wait for key or end
753
754 switch (state) {
755 case kStart:
756 if (ultag_isUnicodeLocaleKey(s, len)) {
757 state = kGotKey;
758 return TRUE;
759 }
760 if (ultag_isUnicodeLocaleAttribute(s, len)) {
761 return TRUE;
762 }
763 return FALSE;
764 case kGotKey:
765 if (ultag_isUnicodeLocaleKey(s, len)) {
766 return TRUE;
767 }
768 if (_isUnicodeLocaleTypeSubtag(s, len)) {
769 state = kGotType;
770 return TRUE;
771 }
772 return FALSE;
773 case kGotType:
774 if (ultag_isUnicodeLocaleKey(s, len)) {
775 state = kGotKey;
776 return TRUE;
777 }
778 if (_isUnicodeLocaleTypeSubtag(s, len)) {
779 return TRUE;
780 }
781 return FALSE;
782 }
783 return FALSE;
784 }
785
786 static UBool
_isStatefulSepListOf(UBool (* test)(int32_t &,const char *,int32_t),const char * s,int32_t len)787 _isStatefulSepListOf(UBool (*test)(int32_t&, const char*, int32_t), const char* s, int32_t len)
788 {
789 int32_t state = 0;
790 const char* p;
791 const char* start = s;
792 int32_t subtagLen = 0;
793
794 if (len < 0) {
795 len = (int32_t)uprv_strlen(s);
796 }
797
798 for (p = s; len > 0; p++, len--) {
799 if (*p == SEP) {
800 if (!test(state, start, subtagLen)) {
801 return FALSE;
802 }
803 subtagLen = 0;
804 start = p + 1;
805 } else {
806 subtagLen++;
807 }
808 }
809
810 if (test(state, start, subtagLen) && state >= 0) {
811 return TRUE;
812 }
813 return FALSE;
814 }
815
816 U_CFUNC UBool
ultag_isTransformedExtensionSubtags(const char * s,int32_t len)817 ultag_isTransformedExtensionSubtags(const char* s, int32_t len)
818 {
819 return _isStatefulSepListOf(&_isTransformedExtensionSubtag, s, len);
820 }
821
822 U_CFUNC UBool
ultag_isUnicodeExtensionSubtags(const char * s,int32_t len)823 ultag_isUnicodeExtensionSubtags(const char* s, int32_t len) {
824 return _isStatefulSepListOf(&_isUnicodeExtensionSubtag, s, len);
825 }
826
827
828 /*
829 * -------------------------------------------------
830 *
831 * Helper functions
832 *
833 * -------------------------------------------------
834 */
835
836 static UBool
_addVariantToList(VariantListEntry ** first,VariantListEntry * var)837 _addVariantToList(VariantListEntry **first, VariantListEntry *var) {
838 UBool bAdded = TRUE;
839
840 if (*first == NULL) {
841 var->next = NULL;
842 *first = var;
843 } else {
844 VariantListEntry *prev, *cur;
845 int32_t cmp;
846
847 /* variants order should be preserved */
848 prev = NULL;
849 cur = *first;
850 while (TRUE) {
851 if (cur == NULL) {
852 prev->next = var;
853 var->next = NULL;
854 break;
855 }
856
857 /* Checking for duplicate variant */
858 cmp = uprv_compareInvCharsAsAscii(var->variant, cur->variant);
859 if (cmp == 0) {
860 /* duplicated variant */
861 bAdded = FALSE;
862 break;
863 }
864 prev = cur;
865 cur = cur->next;
866 }
867 }
868
869 return bAdded;
870 }
871
872 static UBool
_addAttributeToList(AttributeListEntry ** first,AttributeListEntry * attr)873 _addAttributeToList(AttributeListEntry **first, AttributeListEntry *attr) {
874 UBool bAdded = TRUE;
875
876 if (*first == NULL) {
877 attr->next = NULL;
878 *first = attr;
879 } else {
880 AttributeListEntry *prev, *cur;
881 int32_t cmp;
882
883 /* reorder variants in alphabetical order */
884 prev = NULL;
885 cur = *first;
886 while (TRUE) {
887 if (cur == NULL) {
888 prev->next = attr;
889 attr->next = NULL;
890 break;
891 }
892 cmp = uprv_compareInvCharsAsAscii(attr->attribute, cur->attribute);
893 if (cmp < 0) {
894 if (prev == NULL) {
895 *first = attr;
896 } else {
897 prev->next = attr;
898 }
899 attr->next = cur;
900 break;
901 }
902 if (cmp == 0) {
903 /* duplicated variant */
904 bAdded = FALSE;
905 break;
906 }
907 prev = cur;
908 cur = cur->next;
909 }
910 }
911
912 return bAdded;
913 }
914
915
916 static UBool
_addExtensionToList(ExtensionListEntry ** first,ExtensionListEntry * ext,UBool localeToBCP)917 _addExtensionToList(ExtensionListEntry **first, ExtensionListEntry *ext, UBool localeToBCP) {
918 UBool bAdded = TRUE;
919
920 if (*first == NULL) {
921 ext->next = NULL;
922 *first = ext;
923 } else {
924 ExtensionListEntry *prev, *cur;
925 int32_t cmp;
926
927 /* reorder variants in alphabetical order */
928 prev = NULL;
929 cur = *first;
930 while (TRUE) {
931 if (cur == NULL) {
932 prev->next = ext;
933 ext->next = NULL;
934 break;
935 }
936 if (localeToBCP) {
937 /* special handling for locale to bcp conversion */
938 int32_t len, curlen;
939
940 len = (int32_t)uprv_strlen(ext->key);
941 curlen = (int32_t)uprv_strlen(cur->key);
942
943 if (len == 1 && curlen == 1) {
944 if (*(ext->key) == *(cur->key)) {
945 cmp = 0;
946 } else if (*(ext->key) == PRIVATEUSE) {
947 cmp = 1;
948 } else if (*(cur->key) == PRIVATEUSE) {
949 cmp = -1;
950 } else {
951 cmp = *(ext->key) - *(cur->key);
952 }
953 } else if (len == 1) {
954 cmp = *(ext->key) - LDMLEXT;
955 } else if (curlen == 1) {
956 cmp = LDMLEXT - *(cur->key);
957 } else {
958 cmp = uprv_compareInvCharsAsAscii(ext->key, cur->key);
959 /* Both are u extension keys - we need special handling for 'attribute' */
960 if (cmp != 0) {
961 if (uprv_strcmp(cur->key, LOCALE_ATTRIBUTE_KEY) == 0) {
962 cmp = 1;
963 } else if (uprv_strcmp(ext->key, LOCALE_ATTRIBUTE_KEY) == 0) {
964 cmp = -1;
965 }
966 }
967 }
968 } else {
969 cmp = uprv_compareInvCharsAsAscii(ext->key, cur->key);
970 }
971 if (cmp < 0) {
972 if (prev == NULL) {
973 *first = ext;
974 } else {
975 prev->next = ext;
976 }
977 ext->next = cur;
978 break;
979 }
980 if (cmp == 0) {
981 /* duplicated extension key */
982 bAdded = FALSE;
983 break;
984 }
985 prev = cur;
986 cur = cur->next;
987 }
988 }
989
990 return bAdded;
991 }
992
993 static void
_initializeULanguageTag(ULanguageTag * langtag)994 _initializeULanguageTag(ULanguageTag* langtag) {
995 int32_t i;
996
997 langtag->buf = NULL;
998
999 langtag->language = EMPTY;
1000 for (i = 0; i < MAXEXTLANG; i++) {
1001 langtag->extlang[i] = NULL;
1002 }
1003
1004 langtag->script = EMPTY;
1005 langtag->region = EMPTY;
1006
1007 langtag->variants = NULL;
1008 langtag->extensions = NULL;
1009
1010 langtag->legacy = EMPTY;
1011 langtag->privateuse = EMPTY;
1012 }
1013
1014 static void
_appendLanguageToLanguageTag(const char * localeID,icu::ByteSink & sink,UBool strict,UErrorCode * status)1015 _appendLanguageToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UErrorCode* status) {
1016 char buf[ULOC_LANG_CAPACITY];
1017 UErrorCode tmpStatus = U_ZERO_ERROR;
1018 int32_t len, i;
1019
1020 if (U_FAILURE(*status)) {
1021 return;
1022 }
1023
1024 len = uloc_getLanguage(localeID, buf, sizeof(buf), &tmpStatus);
1025 if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
1026 if (strict) {
1027 *status = U_ILLEGAL_ARGUMENT_ERROR;
1028 return;
1029 }
1030 len = 0;
1031 }
1032
1033 /* Note: returned language code is in lower case letters */
1034
1035 if (len == 0) {
1036 sink.Append(LANG_UND, LANG_UND_LEN);
1037 } else if (!ultag_isLanguageSubtag(buf, len)) {
1038 /* invalid language code */
1039 if (strict) {
1040 *status = U_ILLEGAL_ARGUMENT_ERROR;
1041 return;
1042 }
1043 sink.Append(LANG_UND, LANG_UND_LEN);
1044 } else {
1045 /* resolve deprecated */
1046 for (i = 0; i < UPRV_LENGTHOF(DEPRECATEDLANGS); i += 2) {
1047 // 2-letter deprecated subtags are listede before 3-letter
1048 // ones in DEPRECATEDLANGS[]. Get out of loop on coming
1049 // across the 1st 3-letter subtag, if the input is a 2-letter code.
1050 // to avoid continuing to try when there's no match.
1051 if (uprv_strlen(buf) < uprv_strlen(DEPRECATEDLANGS[i])) break;
1052 if (uprv_compareInvCharsAsAscii(buf, DEPRECATEDLANGS[i]) == 0) {
1053 uprv_strcpy(buf, DEPRECATEDLANGS[i + 1]);
1054 len = (int32_t)uprv_strlen(buf);
1055 break;
1056 }
1057 }
1058 sink.Append(buf, len);
1059 }
1060 }
1061
1062 static void
_appendScriptToLanguageTag(const char * localeID,icu::ByteSink & sink,UBool strict,UErrorCode * status)1063 _appendScriptToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UErrorCode* status) {
1064 char buf[ULOC_SCRIPT_CAPACITY];
1065 UErrorCode tmpStatus = U_ZERO_ERROR;
1066 int32_t len;
1067
1068 if (U_FAILURE(*status)) {
1069 return;
1070 }
1071
1072 len = uloc_getScript(localeID, buf, sizeof(buf), &tmpStatus);
1073 if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
1074 if (strict) {
1075 *status = U_ILLEGAL_ARGUMENT_ERROR;
1076 }
1077 return;
1078 }
1079
1080 if (len > 0) {
1081 if (!ultag_isScriptSubtag(buf, len)) {
1082 /* invalid script code */
1083 if (strict) {
1084 *status = U_ILLEGAL_ARGUMENT_ERROR;
1085 }
1086 return;
1087 } else {
1088 sink.Append("-", 1);
1089 sink.Append(buf, len);
1090 }
1091 }
1092 }
1093
1094 static void
_appendRegionToLanguageTag(const char * localeID,icu::ByteSink & sink,UBool strict,UErrorCode * status)1095 _appendRegionToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UErrorCode* status) {
1096 char buf[ULOC_COUNTRY_CAPACITY];
1097 UErrorCode tmpStatus = U_ZERO_ERROR;
1098 int32_t len;
1099
1100 if (U_FAILURE(*status)) {
1101 return;
1102 }
1103
1104 len = uloc_getCountry(localeID, buf, sizeof(buf), &tmpStatus);
1105 if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
1106 if (strict) {
1107 *status = U_ILLEGAL_ARGUMENT_ERROR;
1108 }
1109 return;
1110 }
1111
1112 if (len > 0) {
1113 if (!ultag_isRegionSubtag(buf, len)) {
1114 /* invalid region code */
1115 if (strict) {
1116 *status = U_ILLEGAL_ARGUMENT_ERROR;
1117 }
1118 return;
1119 } else {
1120 sink.Append("-", 1);
1121 /* resolve deprecated */
1122 for (int i = 0; i < UPRV_LENGTHOF(DEPRECATEDREGIONS); i += 2) {
1123 if (uprv_compareInvCharsAsAscii(buf, DEPRECATEDREGIONS[i]) == 0) {
1124 uprv_strcpy(buf, DEPRECATEDREGIONS[i + 1]);
1125 len = (int32_t)uprv_strlen(buf);
1126 break;
1127 }
1128 }
1129 sink.Append(buf, len);
1130 }
1131 }
1132 }
1133
_sortVariants(VariantListEntry * first)1134 static void _sortVariants(VariantListEntry* first) {
1135 for (VariantListEntry* var1 = first; var1 != NULL; var1 = var1->next) {
1136 for (VariantListEntry* var2 = var1->next; var2 != NULL; var2 = var2->next) {
1137 // Swap var1->variant and var2->variant.
1138 if (uprv_compareInvCharsAsAscii(var1->variant, var2->variant) > 0) {
1139 const char* temp = var1->variant;
1140 var1->variant = var2->variant;
1141 var2->variant = temp;
1142 }
1143 }
1144 }
1145 }
1146
1147 static void
_appendVariantsToLanguageTag(const char * localeID,icu::ByteSink & sink,UBool strict,UBool * hadPosix,UErrorCode * status)1148 _appendVariantsToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UBool *hadPosix, UErrorCode* status) {
1149 char buf[ULOC_FULLNAME_CAPACITY];
1150 UErrorCode tmpStatus = U_ZERO_ERROR;
1151 int32_t len, i;
1152
1153 if (U_FAILURE(*status)) {
1154 return;
1155 }
1156
1157 len = uloc_getVariant(localeID, buf, sizeof(buf), &tmpStatus);
1158 if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
1159 if (strict) {
1160 *status = U_ILLEGAL_ARGUMENT_ERROR;
1161 }
1162 return;
1163 }
1164
1165 if (len > 0) {
1166 char *p, *pVar;
1167 UBool bNext = TRUE;
1168 VariantListEntry *var;
1169 VariantListEntry *varFirst = NULL;
1170
1171 pVar = NULL;
1172 p = buf;
1173 while (bNext) {
1174 if (*p == SEP || *p == LOCALE_SEP || *p == 0) {
1175 if (*p == 0) {
1176 bNext = FALSE;
1177 } else {
1178 *p = 0; /* terminate */
1179 }
1180 if (pVar == NULL) {
1181 if (strict) {
1182 *status = U_ILLEGAL_ARGUMENT_ERROR;
1183 break;
1184 }
1185 /* ignore empty variant */
1186 } else {
1187 /* ICU uses upper case letters for variants, but
1188 the canonical format is lowercase in BCP47 */
1189 for (i = 0; *(pVar + i) != 0; i++) {
1190 *(pVar + i) = uprv_tolower(*(pVar + i));
1191 }
1192
1193 /* validate */
1194 if (_isVariantSubtag(pVar, -1)) {
1195 if (uprv_strcmp(pVar,POSIX_VALUE) || len != (int32_t)uprv_strlen(POSIX_VALUE)) {
1196 /* emit the variant to the list */
1197 var = (VariantListEntry*)uprv_malloc(sizeof(VariantListEntry));
1198 if (var == NULL) {
1199 *status = U_MEMORY_ALLOCATION_ERROR;
1200 break;
1201 }
1202 var->variant = pVar;
1203 if (!_addVariantToList(&varFirst, var)) {
1204 /* duplicated variant */
1205 uprv_free(var);
1206 if (strict) {
1207 *status = U_ILLEGAL_ARGUMENT_ERROR;
1208 break;
1209 }
1210 }
1211 } else {
1212 /* Special handling for POSIX variant, need to remember that we had it and then */
1213 /* treat it like an extension later. */
1214 *hadPosix = TRUE;
1215 }
1216 } else if (strict) {
1217 *status = U_ILLEGAL_ARGUMENT_ERROR;
1218 break;
1219 } else if (_isPrivateuseValueSubtag(pVar, -1)) {
1220 /* Handle private use subtags separately */
1221 break;
1222 }
1223 }
1224 /* reset variant starting position */
1225 pVar = NULL;
1226 } else if (pVar == NULL) {
1227 pVar = p;
1228 }
1229 p++;
1230 }
1231
1232 if (U_SUCCESS(*status)) {
1233 if (varFirst != NULL) {
1234 int32_t varLen;
1235
1236 /* per UTS35, we should sort the variants */
1237 _sortVariants(varFirst);
1238
1239 /* write out validated/normalized variants to the target */
1240 var = varFirst;
1241 while (var != NULL) {
1242 sink.Append("-", 1);
1243 varLen = (int32_t)uprv_strlen(var->variant);
1244 sink.Append(var->variant, varLen);
1245 var = var->next;
1246 }
1247 }
1248 }
1249
1250 /* clean up */
1251 var = varFirst;
1252 while (var != NULL) {
1253 VariantListEntry *tmpVar = var->next;
1254 uprv_free(var);
1255 var = tmpVar;
1256 }
1257
1258 if (U_FAILURE(*status)) {
1259 return;
1260 }
1261 }
1262 }
1263
1264 static void
_appendKeywordsToLanguageTag(const char * localeID,icu::ByteSink & sink,UBool strict,UBool hadPosix,UErrorCode * status)1265 _appendKeywordsToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UBool hadPosix, UErrorCode* status) {
1266 char attrBuf[ULOC_KEYWORD_AND_VALUES_CAPACITY] = { 0 };
1267 int32_t attrBufLength = 0;
1268
1269 icu::MemoryPool<AttributeListEntry> attrPool;
1270 icu::MemoryPool<ExtensionListEntry> extPool;
1271 icu::MemoryPool<icu::CharString> strPool;
1272
1273 icu::LocalUEnumerationPointer keywordEnum(uloc_openKeywords(localeID, status));
1274 if (U_FAILURE(*status) && !hadPosix) {
1275 return;
1276 }
1277 if (keywordEnum.isValid() || hadPosix) {
1278 /* reorder extensions */
1279 int32_t len;
1280 const char *key;
1281 ExtensionListEntry *firstExt = NULL;
1282 ExtensionListEntry *ext;
1283 AttributeListEntry *firstAttr = NULL;
1284 AttributeListEntry *attr;
1285 icu::MemoryPool<icu::CharString> extBufPool;
1286 const char *bcpKey=nullptr, *bcpValue=nullptr;
1287 UErrorCode tmpStatus = U_ZERO_ERROR;
1288 int32_t keylen;
1289 UBool isBcpUExt;
1290
1291 while (TRUE) {
1292 key = uenum_next(keywordEnum.getAlias(), NULL, status);
1293 if (key == NULL) {
1294 break;
1295 }
1296
1297 icu::CharString buf;
1298 {
1299 icu::CharStringByteSink sink(&buf);
1300 ulocimp_getKeywordValue(localeID, key, sink, &tmpStatus);
1301 }
1302 len = buf.length();
1303
1304 if (U_FAILURE(tmpStatus)) {
1305 if (tmpStatus == U_MEMORY_ALLOCATION_ERROR) {
1306 *status = U_MEMORY_ALLOCATION_ERROR;
1307 break;
1308 }
1309 if (strict) {
1310 *status = U_ILLEGAL_ARGUMENT_ERROR;
1311 break;
1312 }
1313 /* ignore this keyword */
1314 tmpStatus = U_ZERO_ERROR;
1315 continue;
1316 }
1317
1318 keylen = (int32_t)uprv_strlen(key);
1319 isBcpUExt = (keylen > 1);
1320
1321 /* special keyword used for representing Unicode locale attributes */
1322 if (uprv_strcmp(key, LOCALE_ATTRIBUTE_KEY) == 0) {
1323 if (len > 0) {
1324 int32_t i = 0;
1325 while (TRUE) {
1326 attrBufLength = 0;
1327 for (; i < len; i++) {
1328 if (buf[i] != '-') {
1329 attrBuf[attrBufLength++] = buf[i];
1330 } else {
1331 i++;
1332 break;
1333 }
1334 }
1335 if (attrBufLength > 0) {
1336 attrBuf[attrBufLength] = 0;
1337
1338 } else if (i >= len){
1339 break;
1340 }
1341
1342 /* create AttributeListEntry */
1343 attr = attrPool.create();
1344 if (attr == NULL) {
1345 *status = U_MEMORY_ALLOCATION_ERROR;
1346 break;
1347 }
1348 icu::CharString* attrValue =
1349 strPool.create(attrBuf, attrBufLength, *status);
1350 if (attrValue == NULL) {
1351 *status = U_MEMORY_ALLOCATION_ERROR;
1352 break;
1353 }
1354 if (U_FAILURE(*status)) {
1355 break;
1356 }
1357 attr->attribute = attrValue->data();
1358
1359 if (!_addAttributeToList(&firstAttr, attr)) {
1360 if (strict) {
1361 *status = U_ILLEGAL_ARGUMENT_ERROR;
1362 break;
1363 }
1364 }
1365 }
1366 /* for a place holder ExtensionListEntry */
1367 bcpKey = LOCALE_ATTRIBUTE_KEY;
1368 bcpValue = NULL;
1369 }
1370 } else if (isBcpUExt) {
1371 bcpKey = uloc_toUnicodeLocaleKey(key);
1372 if (bcpKey == NULL) {
1373 if (strict) {
1374 *status = U_ILLEGAL_ARGUMENT_ERROR;
1375 break;
1376 }
1377 continue;
1378 }
1379
1380 /* we've checked buf is null-terminated above */
1381 bcpValue = uloc_toUnicodeLocaleType(key, buf.data());
1382 if (bcpValue == NULL) {
1383 if (strict) {
1384 *status = U_ILLEGAL_ARGUMENT_ERROR;
1385 break;
1386 }
1387 continue;
1388 }
1389 if (bcpValue == buf.data()) {
1390 /*
1391 When uloc_toUnicodeLocaleType(key, buf) returns the
1392 input value as is, the value is well-formed, but has
1393 no known mapping. This implementation normalizes the
1394 value to lower case
1395 */
1396 icu::CharString* extBuf = extBufPool.create(buf, tmpStatus);
1397
1398 if (extBuf == nullptr) {
1399 *status = U_MEMORY_ALLOCATION_ERROR;
1400 break;
1401 }
1402 if (U_FAILURE(tmpStatus)) {
1403 *status = tmpStatus;
1404 break;
1405 }
1406
1407 T_CString_toLowerCase(extBuf->data());
1408 bcpValue = extBuf->data();
1409 }
1410 } else {
1411 if (*key == PRIVATEUSE) {
1412 if (!ultag_isPrivateuseValueSubtags(buf.data(), len)) {
1413 if (strict) {
1414 *status = U_ILLEGAL_ARGUMENT_ERROR;
1415 break;
1416 }
1417 continue;
1418 }
1419 } else {
1420 if (!_isExtensionSingleton(key, keylen) || !ultag_isExtensionSubtags(buf.data(), len)) {
1421 if (strict) {
1422 *status = U_ILLEGAL_ARGUMENT_ERROR;
1423 break;
1424 }
1425 continue;
1426 }
1427 }
1428 bcpKey = key;
1429 icu::CharString* extBuf =
1430 extBufPool.create(buf.data(), len, tmpStatus);
1431 if (extBuf == nullptr) {
1432 *status = U_MEMORY_ALLOCATION_ERROR;
1433 break;
1434 }
1435 if (U_FAILURE(tmpStatus)) {
1436 *status = tmpStatus;
1437 break;
1438 }
1439 bcpValue = extBuf->data();
1440 }
1441
1442 /* create ExtensionListEntry */
1443 ext = extPool.create();
1444 if (ext == NULL) {
1445 *status = U_MEMORY_ALLOCATION_ERROR;
1446 break;
1447 }
1448 ext->key = bcpKey;
1449 ext->value = bcpValue;
1450
1451 if (!_addExtensionToList(&firstExt, ext, TRUE)) {
1452 if (strict) {
1453 *status = U_ILLEGAL_ARGUMENT_ERROR;
1454 break;
1455 }
1456 }
1457 }
1458
1459 /* Special handling for POSIX variant - add the keywords for POSIX */
1460 if (hadPosix) {
1461 /* create ExtensionListEntry for POSIX */
1462 ext = extPool.create();
1463 if (ext == NULL) {
1464 *status = U_MEMORY_ALLOCATION_ERROR;
1465 return;
1466 }
1467 ext->key = POSIX_KEY;
1468 ext->value = POSIX_VALUE;
1469
1470 if (!_addExtensionToList(&firstExt, ext, TRUE)) {
1471 // Silently ignore errors.
1472 }
1473 }
1474
1475 if (U_SUCCESS(*status) && (firstExt != NULL || firstAttr != NULL)) {
1476 UBool startLDMLExtension = FALSE;
1477 for (ext = firstExt; ext; ext = ext->next) {
1478 if (!startLDMLExtension && uprv_strlen(ext->key) > 1) {
1479 /* first LDML u singlton extension */
1480 sink.Append("-u", 2);
1481 startLDMLExtension = TRUE;
1482 }
1483
1484 /* write out the sorted BCP47 attributes, extensions and private use */
1485 if (uprv_strcmp(ext->key, LOCALE_ATTRIBUTE_KEY) == 0) {
1486 /* write the value for the attributes */
1487 for (attr = firstAttr; attr; attr = attr->next) {
1488 sink.Append("-", 1);
1489 sink.Append(
1490 attr->attribute, static_cast<int32_t>(uprv_strlen(attr->attribute)));
1491 }
1492 } else {
1493 sink.Append("-", 1);
1494 sink.Append(ext->key, static_cast<int32_t>(uprv_strlen(ext->key)));
1495 if (uprv_strcmp(ext->value, "true") != 0 &&
1496 uprv_strcmp(ext->value, "yes") != 0) {
1497 sink.Append("-", 1);
1498 sink.Append(ext->value, static_cast<int32_t>(uprv_strlen(ext->value)));
1499 }
1500 }
1501 }
1502 }
1503 }
1504 }
1505
1506 /**
1507 * Append keywords parsed from LDML extension value
1508 * e.g. "u-ca-gregory-co-trad" -> {calendar = gregorian} {collation = traditional}
1509 * Note: char* buf is used for storing keywords
1510 */
1511 static void
_appendLDMLExtensionAsKeywords(const char * ldmlext,ExtensionListEntry ** appendTo,icu::MemoryPool<ExtensionListEntry> & extPool,icu::MemoryPool<icu::CharString> & kwdBuf,UBool * posixVariant,UErrorCode * status)1512 _appendLDMLExtensionAsKeywords(const char* ldmlext, ExtensionListEntry** appendTo, icu::MemoryPool<ExtensionListEntry>& extPool, icu::MemoryPool<icu::CharString>& kwdBuf, UBool *posixVariant, UErrorCode *status) {
1513 const char *pTag; /* beginning of current subtag */
1514 const char *pKwds; /* beginning of key-type pairs */
1515 UBool variantExists = *posixVariant;
1516
1517 ExtensionListEntry *kwdFirst = NULL; /* first LDML keyword */
1518 ExtensionListEntry *kwd, *nextKwd;
1519
1520 int32_t len;
1521
1522 /* Reset the posixVariant value */
1523 *posixVariant = FALSE;
1524
1525 pTag = ldmlext;
1526 pKwds = NULL;
1527
1528 {
1529 AttributeListEntry *attrFirst = NULL; /* first attribute */
1530 AttributeListEntry *attr, *nextAttr;
1531
1532 char attrBuf[ULOC_KEYWORD_AND_VALUES_CAPACITY];
1533 int32_t attrBufIdx = 0;
1534
1535 icu::MemoryPool<AttributeListEntry> attrPool;
1536
1537 /* Iterate through u extension attributes */
1538 while (*pTag) {
1539 /* locate next separator char */
1540 for (len = 0; *(pTag + len) && *(pTag + len) != SEP; len++);
1541
1542 if (ultag_isUnicodeLocaleKey(pTag, len)) {
1543 pKwds = pTag;
1544 break;
1545 }
1546
1547 /* add this attribute to the list */
1548 attr = attrPool.create();
1549 if (attr == NULL) {
1550 *status = U_MEMORY_ALLOCATION_ERROR;
1551 return;
1552 }
1553
1554 if (len < (int32_t)sizeof(attrBuf) - attrBufIdx) {
1555 uprv_memcpy(&attrBuf[attrBufIdx], pTag, len);
1556 attrBuf[attrBufIdx + len] = 0;
1557 attr->attribute = &attrBuf[attrBufIdx];
1558 attrBufIdx += (len + 1);
1559 } else {
1560 *status = U_ILLEGAL_ARGUMENT_ERROR;
1561 return;
1562 }
1563
1564 // duplicate attribute is ignored, causes no error.
1565 _addAttributeToList(&attrFirst, attr);
1566
1567 /* next tag */
1568 pTag += len;
1569 if (*pTag) {
1570 /* next to the separator */
1571 pTag++;
1572 }
1573 }
1574
1575 if (attrFirst) {
1576 /* emit attributes as an LDML keyword, e.g. attribute=attr1-attr2 */
1577
1578 kwd = extPool.create();
1579 if (kwd == NULL) {
1580 *status = U_MEMORY_ALLOCATION_ERROR;
1581 return;
1582 }
1583
1584 icu::CharString* value = kwdBuf.create();
1585 if (value == NULL) {
1586 *status = U_MEMORY_ALLOCATION_ERROR;
1587 return;
1588 }
1589
1590 /* attribute subtags sorted in alphabetical order as type */
1591 attr = attrFirst;
1592 while (attr != NULL) {
1593 nextAttr = attr->next;
1594 if (attr != attrFirst) {
1595 value->append('-', *status);
1596 }
1597 value->append(attr->attribute, *status);
1598 attr = nextAttr;
1599 }
1600 if (U_FAILURE(*status)) {
1601 return;
1602 }
1603
1604 kwd->key = LOCALE_ATTRIBUTE_KEY;
1605 kwd->value = value->data();
1606
1607 if (!_addExtensionToList(&kwdFirst, kwd, FALSE)) {
1608 *status = U_ILLEGAL_ARGUMENT_ERROR;
1609 return;
1610 }
1611 }
1612 }
1613
1614 if (pKwds) {
1615 const char *pBcpKey = NULL; /* u extenstion key subtag */
1616 const char *pBcpType = NULL; /* beginning of u extension type subtag(s) */
1617 int32_t bcpKeyLen = 0;
1618 int32_t bcpTypeLen = 0;
1619 UBool isDone = FALSE;
1620
1621 pTag = pKwds;
1622 /* BCP47 representation of LDML key/type pairs */
1623 while (!isDone) {
1624 const char *pNextBcpKey = NULL;
1625 int32_t nextBcpKeyLen = 0;
1626 UBool emitKeyword = FALSE;
1627
1628 if (*pTag) {
1629 /* locate next separator char */
1630 for (len = 0; *(pTag + len) && *(pTag + len) != SEP; len++);
1631
1632 if (ultag_isUnicodeLocaleKey(pTag, len)) {
1633 if (pBcpKey) {
1634 emitKeyword = TRUE;
1635 pNextBcpKey = pTag;
1636 nextBcpKeyLen = len;
1637 } else {
1638 pBcpKey = pTag;
1639 bcpKeyLen = len;
1640 }
1641 } else {
1642 U_ASSERT(pBcpKey != NULL);
1643 /* within LDML type subtags */
1644 if (pBcpType) {
1645 bcpTypeLen += (len + 1);
1646 } else {
1647 pBcpType = pTag;
1648 bcpTypeLen = len;
1649 }
1650 }
1651
1652 /* next tag */
1653 pTag += len;
1654 if (*pTag) {
1655 /* next to the separator */
1656 pTag++;
1657 }
1658 } else {
1659 /* processing last one */
1660 emitKeyword = TRUE;
1661 isDone = TRUE;
1662 }
1663
1664 if (emitKeyword) {
1665 const char *pKey = NULL; /* LDML key */
1666 const char *pType = NULL; /* LDML type */
1667
1668 char bcpKeyBuf[3]; /* BCP key length is always 2 for now */
1669
1670 U_ASSERT(pBcpKey != NULL);
1671
1672 if (bcpKeyLen >= (int32_t)sizeof(bcpKeyBuf)) {
1673 /* the BCP key is invalid */
1674 *status = U_ILLEGAL_ARGUMENT_ERROR;
1675 return;
1676 }
1677 U_ASSERT(bcpKeyLen <= 2);
1678
1679 uprv_strncpy(bcpKeyBuf, pBcpKey, bcpKeyLen);
1680 bcpKeyBuf[bcpKeyLen] = 0;
1681
1682 /* u extension key to LDML key */
1683 pKey = uloc_toLegacyKey(bcpKeyBuf);
1684 if (pKey == NULL) {
1685 *status = U_ILLEGAL_ARGUMENT_ERROR;
1686 return;
1687 }
1688 if (pKey == bcpKeyBuf) {
1689 /*
1690 The key returned by toLegacyKey points to the input buffer.
1691 We normalize the result key to lower case.
1692 */
1693 T_CString_toLowerCase(bcpKeyBuf);
1694 icu::CharString* key = kwdBuf.create(bcpKeyBuf, bcpKeyLen, *status);
1695 if (key == NULL) {
1696 *status = U_MEMORY_ALLOCATION_ERROR;
1697 return;
1698 }
1699 if (U_FAILURE(*status)) {
1700 return;
1701 }
1702 pKey = key->data();
1703 }
1704
1705 if (pBcpType) {
1706 char bcpTypeBuf[128]; /* practically long enough even considering multiple subtag type */
1707 if (bcpTypeLen >= (int32_t)sizeof(bcpTypeBuf)) {
1708 /* the BCP type is too long */
1709 *status = U_ILLEGAL_ARGUMENT_ERROR;
1710 return;
1711 }
1712
1713 uprv_strncpy(bcpTypeBuf, pBcpType, bcpTypeLen);
1714 bcpTypeBuf[bcpTypeLen] = 0;
1715
1716 /* BCP type to locale type */
1717 pType = uloc_toLegacyType(pKey, bcpTypeBuf);
1718 if (pType == NULL) {
1719 *status = U_ILLEGAL_ARGUMENT_ERROR;
1720 return;
1721 }
1722 if (pType == bcpTypeBuf) {
1723 /*
1724 The type returned by toLegacyType points to the input buffer.
1725 We normalize the result type to lower case.
1726 */
1727 /* normalize to lower case */
1728 T_CString_toLowerCase(bcpTypeBuf);
1729 icu::CharString* type = kwdBuf.create(bcpTypeBuf, bcpTypeLen, *status);
1730 if (type == NULL) {
1731 *status = U_MEMORY_ALLOCATION_ERROR;
1732 return;
1733 }
1734 if (U_FAILURE(*status)) {
1735 return;
1736 }
1737 pType = type->data();
1738 }
1739 } else {
1740 /* typeless - default type value is "yes" */
1741 pType = LOCALE_TYPE_YES;
1742 }
1743
1744 /* Special handling for u-va-posix, since we want to treat this as a variant,
1745 not as a keyword */
1746 if (!variantExists && !uprv_strcmp(pKey, POSIX_KEY) && !uprv_strcmp(pType, POSIX_VALUE) ) {
1747 *posixVariant = TRUE;
1748 } else {
1749 /* create an ExtensionListEntry for this keyword */
1750 kwd = extPool.create();
1751 if (kwd == NULL) {
1752 *status = U_MEMORY_ALLOCATION_ERROR;
1753 return;
1754 }
1755
1756 kwd->key = pKey;
1757 kwd->value = pType;
1758
1759 if (!_addExtensionToList(&kwdFirst, kwd, FALSE)) {
1760 // duplicate keyword is allowed, Only the first
1761 // is honored.
1762 }
1763 }
1764
1765 pBcpKey = pNextBcpKey;
1766 bcpKeyLen = pNextBcpKey != NULL ? nextBcpKeyLen : 0;
1767 pBcpType = NULL;
1768 bcpTypeLen = 0;
1769 }
1770 }
1771 }
1772
1773 kwd = kwdFirst;
1774 while (kwd != NULL) {
1775 nextKwd = kwd->next;
1776 _addExtensionToList(appendTo, kwd, FALSE);
1777 kwd = nextKwd;
1778 }
1779 }
1780
1781
1782 static void
_appendKeywords(ULanguageTag * langtag,icu::ByteSink & sink,UErrorCode * status)1783 _appendKeywords(ULanguageTag* langtag, icu::ByteSink& sink, UErrorCode* status) {
1784 int32_t i, n;
1785 int32_t len;
1786 ExtensionListEntry *kwdFirst = NULL;
1787 ExtensionListEntry *kwd;
1788 const char *key, *type;
1789 icu::MemoryPool<ExtensionListEntry> extPool;
1790 icu::MemoryPool<icu::CharString> kwdBuf;
1791 UBool posixVariant = FALSE;
1792
1793 if (U_FAILURE(*status)) {
1794 return;
1795 }
1796
1797 n = ultag_getExtensionsSize(langtag);
1798
1799 /* resolve locale keywords and reordering keys */
1800 for (i = 0; i < n; i++) {
1801 key = ultag_getExtensionKey(langtag, i);
1802 type = ultag_getExtensionValue(langtag, i);
1803 if (*key == LDMLEXT) {
1804 /* Determine if variants already exists */
1805 if (ultag_getVariantsSize(langtag)) {
1806 posixVariant = TRUE;
1807 }
1808
1809 _appendLDMLExtensionAsKeywords(type, &kwdFirst, extPool, kwdBuf, &posixVariant, status);
1810 if (U_FAILURE(*status)) {
1811 break;
1812 }
1813 } else {
1814 kwd = extPool.create();
1815 if (kwd == NULL) {
1816 *status = U_MEMORY_ALLOCATION_ERROR;
1817 break;
1818 }
1819 kwd->key = key;
1820 kwd->value = type;
1821 if (!_addExtensionToList(&kwdFirst, kwd, FALSE)) {
1822 *status = U_ILLEGAL_ARGUMENT_ERROR;
1823 break;
1824 }
1825 }
1826 }
1827
1828 if (U_SUCCESS(*status)) {
1829 type = ultag_getPrivateUse(langtag);
1830 if ((int32_t)uprv_strlen(type) > 0) {
1831 /* add private use as a keyword */
1832 kwd = extPool.create();
1833 if (kwd == NULL) {
1834 *status = U_MEMORY_ALLOCATION_ERROR;
1835 } else {
1836 kwd->key = PRIVATEUSE_KEY;
1837 kwd->value = type;
1838 if (!_addExtensionToList(&kwdFirst, kwd, FALSE)) {
1839 *status = U_ILLEGAL_ARGUMENT_ERROR;
1840 }
1841 }
1842 }
1843 }
1844
1845 /* If a POSIX variant was in the extensions, write it out before writing the keywords. */
1846
1847 if (U_SUCCESS(*status) && posixVariant) {
1848 len = (int32_t) uprv_strlen(_POSIX);
1849 sink.Append(_POSIX, len);
1850 }
1851
1852 if (U_SUCCESS(*status) && kwdFirst != NULL) {
1853 /* write out the sorted keywords */
1854 UBool firstValue = TRUE;
1855 kwd = kwdFirst;
1856 do {
1857 if (firstValue) {
1858 sink.Append("@", 1);
1859 firstValue = FALSE;
1860 } else {
1861 sink.Append(";", 1);
1862 }
1863
1864 /* key */
1865 len = (int32_t)uprv_strlen(kwd->key);
1866 sink.Append(kwd->key, len);
1867 sink.Append("=", 1);
1868
1869 /* type */
1870 len = (int32_t)uprv_strlen(kwd->value);
1871 sink.Append(kwd->value, len);
1872
1873 kwd = kwd->next;
1874 } while (kwd);
1875 }
1876 }
1877
1878 static void
_appendPrivateuseToLanguageTag(const char * localeID,icu::ByteSink & sink,UBool strict,UBool hadPosix,UErrorCode * status)1879 _appendPrivateuseToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UBool hadPosix, UErrorCode* status) {
1880 (void)hadPosix;
1881 char buf[ULOC_FULLNAME_CAPACITY];
1882 char tmpAppend[ULOC_FULLNAME_CAPACITY];
1883 UErrorCode tmpStatus = U_ZERO_ERROR;
1884 int32_t len, i;
1885 int32_t reslen = 0;
1886 int32_t capacity = sizeof tmpAppend;
1887
1888 if (U_FAILURE(*status)) {
1889 return;
1890 }
1891
1892 len = uloc_getVariant(localeID, buf, sizeof(buf), &tmpStatus);
1893 if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
1894 if (strict) {
1895 *status = U_ILLEGAL_ARGUMENT_ERROR;
1896 }
1897 return;
1898 }
1899
1900 if (len > 0) {
1901 char *p, *pPriv;
1902 UBool bNext = TRUE;
1903 UBool firstValue = TRUE;
1904 UBool writeValue;
1905
1906 pPriv = NULL;
1907 p = buf;
1908 while (bNext) {
1909 writeValue = FALSE;
1910 if (*p == SEP || *p == LOCALE_SEP || *p == 0) {
1911 if (*p == 0) {
1912 bNext = FALSE;
1913 } else {
1914 *p = 0; /* terminate */
1915 }
1916 if (pPriv != NULL) {
1917 /* Private use in the canonical format is lowercase in BCP47 */
1918 for (i = 0; *(pPriv + i) != 0; i++) {
1919 *(pPriv + i) = uprv_tolower(*(pPriv + i));
1920 }
1921
1922 /* validate */
1923 if (_isPrivateuseValueSubtag(pPriv, -1)) {
1924 if (firstValue) {
1925 if (!_isVariantSubtag(pPriv, -1)) {
1926 writeValue = TRUE;
1927 }
1928 } else {
1929 writeValue = TRUE;
1930 }
1931 } else if (strict) {
1932 *status = U_ILLEGAL_ARGUMENT_ERROR;
1933 break;
1934 } else {
1935 break;
1936 }
1937
1938 if (writeValue) {
1939 if (reslen < capacity) {
1940 tmpAppend[reslen++] = SEP;
1941 }
1942
1943 if (firstValue) {
1944 if (reslen < capacity) {
1945 tmpAppend[reslen++] = *PRIVATEUSE_KEY;
1946 }
1947
1948 if (reslen < capacity) {
1949 tmpAppend[reslen++] = SEP;
1950 }
1951
1952 len = (int32_t)uprv_strlen(PRIVUSE_VARIANT_PREFIX);
1953 if (reslen < capacity) {
1954 uprv_memcpy(tmpAppend + reslen, PRIVUSE_VARIANT_PREFIX, uprv_min(len, capacity - reslen));
1955 }
1956 reslen += len;
1957
1958 if (reslen < capacity) {
1959 tmpAppend[reslen++] = SEP;
1960 }
1961
1962 firstValue = FALSE;
1963 }
1964
1965 len = (int32_t)uprv_strlen(pPriv);
1966 if (reslen < capacity) {
1967 uprv_memcpy(tmpAppend + reslen, pPriv, uprv_min(len, capacity - reslen));
1968 }
1969 reslen += len;
1970 }
1971 }
1972 /* reset private use starting position */
1973 pPriv = NULL;
1974 } else if (pPriv == NULL) {
1975 pPriv = p;
1976 }
1977 p++;
1978 }
1979
1980 if (U_FAILURE(*status)) {
1981 return;
1982 }
1983 }
1984
1985 if (U_SUCCESS(*status)) {
1986 len = reslen;
1987 sink.Append(tmpAppend, len);
1988 }
1989 }
1990
1991 /*
1992 * -------------------------------------------------
1993 *
1994 * ultag_ functions
1995 *
1996 * -------------------------------------------------
1997 */
1998
/*
 * Bit flags used by the parser. ultag_parse() keeps a combination of these
 * in `next` to record which kinds of subtag are acceptable at the current
 * position.
 */
#define LANG 0x0001 /* language subtag */
#define EXTL 0x0002 /* extlang subtag */
#define SCRT 0x0004 /* script subtag */
#define REGN 0x0008 /* region subtag */
#define VART 0x0010 /* variant subtag */
#define EXTS 0x0020 /* extension singleton */
#define EXTV 0x0040 /* extension value subtag */
#define PRIV 0x0080 /* private use singleton ('x') */
2009 /**
2010 * Ticket #12705 - The optimizer in Visual Studio 2015 Update 3 has problems optimizing this function.
2011 * As a work-around, optimization is disabled for this function on VS2015 and VS2017.
2012 * This work-around should be removed once the following versions of Visual Studio are no
2013 * longer supported: All versions of VS2015/VS2017, and versions of VS2019 below 16.4.
2014 */
2015 #if defined(_MSC_VER) && (_MSC_VER >= 1900) && (_MSC_VER < 1924)
2016 #pragma optimize( "", off )
2017 #endif
2018
2019 static ULanguageTag*
ultag_parse(const char * tag,int32_t tagLen,int32_t * parsedLen,UErrorCode * status)2020 ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* status) {
2021 char *tagBuf;
2022 int16_t next;
2023 char *pSubtag, *pNext, *pLastGoodPosition;
2024 int32_t subtagLen;
2025 int32_t extlangIdx;
2026 ExtensionListEntry *pExtension;
2027 char *pExtValueSubtag, *pExtValueSubtagEnd;
2028 int32_t i;
2029 UBool privateuseVar = FALSE;
2030 int32_t legacyLen = 0;
2031
2032 if (parsedLen != NULL) {
2033 *parsedLen = 0;
2034 }
2035
2036 if (U_FAILURE(*status)) {
2037 return NULL;
2038 }
2039
2040 if (tagLen < 0) {
2041 tagLen = (int32_t)uprv_strlen(tag);
2042 }
2043
2044 /* copy the entire string */
2045 tagBuf = (char*)uprv_malloc(tagLen + 1);
2046 if (tagBuf == NULL) {
2047 *status = U_MEMORY_ALLOCATION_ERROR;
2048 return NULL;
2049 }
2050
2051 if (tagLen > 0) {
2052 uprv_memcpy(tagBuf, tag, tagLen);
2053 }
2054 *(tagBuf + tagLen) = 0;
2055
2056 /* create a ULanguageTag */
2057 icu::LocalULanguageTagPointer t(
2058 (ULanguageTag*)uprv_malloc(sizeof(ULanguageTag)));
2059 if (t.isNull()) {
2060 uprv_free(tagBuf);
2061 *status = U_MEMORY_ALLOCATION_ERROR;
2062 return NULL;
2063 }
2064 _initializeULanguageTag(t.getAlias());
2065 t->buf = tagBuf;
2066
2067 if (tagLen < MINLEN) {
2068 /* the input tag is too short - return empty ULanguageTag */
2069 return t.orphan();
2070 }
2071
2072 size_t parsedLenDelta = 0;
2073 // Legacy tag will be consider together. Legacy tag with intervening
2074 // script and region such as art-DE-lojban or art-Latn-lojban won't be
2075 // matched.
2076 /* check if the tag is legacy */
2077 for (i = 0; i < UPRV_LENGTHOF(LEGACY); i += 2) {
2078 int32_t checkLegacyLen = static_cast<int32_t>(uprv_strlen(LEGACY[i]));
2079 if (tagLen < checkLegacyLen) {
2080 continue;
2081 }
2082 if (tagLen > checkLegacyLen && tagBuf[checkLegacyLen] != '-') {
2083 // make sure next char is '-'.
2084 continue;
2085 }
2086 if (uprv_strnicmp(LEGACY[i], tagBuf, checkLegacyLen) == 0) {
2087 int32_t newTagLength;
2088
2089 legacyLen = checkLegacyLen; /* back up for output parsedLen */
2090 int32_t replacementLen = static_cast<int32_t>(uprv_strlen(LEGACY[i+1]));
2091 newTagLength = replacementLen + tagLen - checkLegacyLen;
2092 if (tagLen < newTagLength) {
2093 uprv_free(tagBuf);
2094 tagBuf = (char*)uprv_malloc(newTagLength + 1);
2095 if (tagBuf == NULL) {
2096 *status = U_MEMORY_ALLOCATION_ERROR;
2097 return NULL;
2098 }
2099 t->buf = tagBuf;
2100 tagLen = newTagLength;
2101 }
2102 parsedLenDelta = checkLegacyLen - replacementLen;
2103 uprv_strcpy(t->buf, LEGACY[i + 1]);
2104 if (checkLegacyLen != tagLen) {
2105 uprv_strcpy(t->buf + replacementLen, tag + checkLegacyLen);
2106 }
2107 break;
2108 }
2109 }
2110
2111 if (legacyLen == 0) {
2112 for (i = 0; i < UPRV_LENGTHOF(REDUNDANT); i += 2) {
2113 const char* redundantTag = REDUNDANT[i];
2114 size_t redundantTagLen = uprv_strlen(redundantTag);
2115 // The preferred tag for a redundant tag is always shorter than redundant
2116 // tag. A redundant tag may or may not be followed by other subtags.
2117 // (i.e. "zh-yue" or "zh-yue-u-co-pinyin").
2118 if (uprv_strnicmp(redundantTag, tagBuf, static_cast<uint32_t>(redundantTagLen)) == 0) {
2119 const char* redundantTagEnd = tagBuf + redundantTagLen;
2120 if (*redundantTagEnd == '\0' || *redundantTagEnd == SEP) {
2121 const char* preferredTag = REDUNDANT[i + 1];
2122 size_t preferredTagLen = uprv_strlen(preferredTag);
2123 uprv_strncpy(t->buf, preferredTag, preferredTagLen);
2124 if (*redundantTagEnd == SEP) {
2125 uprv_memmove(tagBuf + preferredTagLen,
2126 redundantTagEnd,
2127 tagLen - redundantTagLen + 1);
2128 } else {
2129 tagBuf[preferredTagLen] = '\0';
2130 }
2131 // parsedLen should be the length of the input
2132 // before redundantTag is replaced by preferredTag.
2133 // Save the delta to add it back later.
2134 parsedLenDelta = redundantTagLen - preferredTagLen;
2135 break;
2136 }
2137 }
2138 }
2139 }
2140
2141 /*
2142 * langtag = language
2143 * ["-" script]
2144 * ["-" region]
2145 * *("-" variant)
2146 * *("-" extension)
2147 * ["-" privateuse]
2148 */
2149
2150 next = LANG | PRIV;
2151 pNext = pLastGoodPosition = tagBuf;
2152 extlangIdx = 0;
2153 pExtension = NULL;
2154 pExtValueSubtag = NULL;
2155 pExtValueSubtagEnd = NULL;
2156
2157 while (pNext) {
2158 char *pSep;
2159
2160 pSubtag = pNext;
2161
2162 /* locate next separator char */
2163 pSep = pSubtag;
2164 while (*pSep) {
2165 if (*pSep == SEP) {
2166 break;
2167 }
2168 pSep++;
2169 }
2170 if (*pSep == 0) {
2171 /* last subtag */
2172 pNext = NULL;
2173 } else {
2174 pNext = pSep + 1;
2175 }
2176 subtagLen = (int32_t)(pSep - pSubtag);
2177
2178 if (next & LANG) {
2179 if (ultag_isLanguageSubtag(pSubtag, subtagLen)) {
2180 *pSep = 0; /* terminate */
2181 // TODO: move deprecated language code handling here.
2182 t->language = T_CString_toLowerCase(pSubtag);
2183
2184 pLastGoodPosition = pSep;
2185 next = SCRT | REGN | VART | EXTS | PRIV;
2186 if (subtagLen <= 3)
2187 next |= EXTL;
2188 continue;
2189 }
2190 }
2191 if (next & EXTL) {
2192 if (_isExtlangSubtag(pSubtag, subtagLen)) {
2193 *pSep = 0;
2194 t->extlang[extlangIdx++] = T_CString_toLowerCase(pSubtag);
2195
2196 pLastGoodPosition = pSep;
2197 if (extlangIdx < 3) {
2198 next = EXTL | SCRT | REGN | VART | EXTS | PRIV;
2199 } else {
2200 next = SCRT | REGN | VART | EXTS | PRIV;
2201 }
2202 continue;
2203 }
2204 }
2205 if (next & SCRT) {
2206 if (ultag_isScriptSubtag(pSubtag, subtagLen)) {
2207 char *p = pSubtag;
2208
2209 *pSep = 0;
2210
2211 /* to title case */
2212 *p = uprv_toupper(*p);
2213 p++;
2214 for (; *p; p++) {
2215 *p = uprv_tolower(*p);
2216 }
2217
2218 t->script = pSubtag;
2219
2220 pLastGoodPosition = pSep;
2221 next = REGN | VART | EXTS | PRIV;
2222 continue;
2223 }
2224 }
2225 if (next & REGN) {
2226 if (ultag_isRegionSubtag(pSubtag, subtagLen)) {
2227 *pSep = 0;
2228 // TODO: move deprecated region code handling here.
2229 t->region = T_CString_toUpperCase(pSubtag);
2230
2231 pLastGoodPosition = pSep;
2232 next = VART | EXTS | PRIV;
2233 continue;
2234 }
2235 }
2236 if (next & VART) {
2237 if (_isVariantSubtag(pSubtag, subtagLen) ||
2238 (privateuseVar && _isPrivateuseVariantSubtag(pSubtag, subtagLen))) {
2239 VariantListEntry *var;
2240 UBool isAdded;
2241
2242 var = (VariantListEntry*)uprv_malloc(sizeof(VariantListEntry));
2243 if (var == NULL) {
2244 *status = U_MEMORY_ALLOCATION_ERROR;
2245 return NULL;
2246 }
2247 *pSep = 0;
2248 var->variant = T_CString_toUpperCase(pSubtag);
2249 isAdded = _addVariantToList(&(t->variants), var);
2250 if (!isAdded) {
2251 /* duplicated variant entry */
2252 uprv_free(var);
2253 break;
2254 }
2255 pLastGoodPosition = pSep;
2256 next = VART | EXTS | PRIV;
2257 continue;
2258 }
2259 }
2260 if (next & EXTS) {
2261 if (_isExtensionSingleton(pSubtag, subtagLen)) {
2262 if (pExtension != NULL) {
2263 if (pExtValueSubtag == NULL || pExtValueSubtagEnd == NULL) {
2264 /* the previous extension is incomplete */
2265 uprv_free(pExtension);
2266 pExtension = NULL;
2267 break;
2268 }
2269
2270 /* terminate the previous extension value */
2271 *pExtValueSubtagEnd = 0;
2272 pExtension->value = T_CString_toLowerCase(pExtValueSubtag);
2273
2274 /* insert the extension to the list */
2275 if (_addExtensionToList(&(t->extensions), pExtension, FALSE)) {
2276 pLastGoodPosition = pExtValueSubtagEnd;
2277 } else {
2278 /* stop parsing here */
2279 uprv_free(pExtension);
2280 pExtension = NULL;
2281 break;
2282 }
2283 }
2284
2285 /* create a new extension */
2286 pExtension = (ExtensionListEntry*)uprv_malloc(sizeof(ExtensionListEntry));
2287 if (pExtension == NULL) {
2288 *status = U_MEMORY_ALLOCATION_ERROR;
2289 return NULL;
2290 }
2291 *pSep = 0;
2292 pExtension->key = T_CString_toLowerCase(pSubtag);
2293 pExtension->value = NULL; /* will be set later */
2294
2295 /*
2296 * reset the start and the end location of extension value
2297 * subtags for this extension
2298 */
2299 pExtValueSubtag = NULL;
2300 pExtValueSubtagEnd = NULL;
2301
2302 next = EXTV;
2303 continue;
2304 }
2305 }
2306 if (next & EXTV) {
2307 if (_isExtensionSubtag(pSubtag, subtagLen)) {
2308 if (pExtValueSubtag == NULL) {
2309 /* if the start postion of this extension's value is not yet,
2310 this one is the first value subtag */
2311 pExtValueSubtag = pSubtag;
2312 }
2313
2314 /* Mark the end of this subtag */
2315 pExtValueSubtagEnd = pSep;
2316 next = EXTS | EXTV | PRIV;
2317
2318 continue;
2319 }
2320 }
2321 if (next & PRIV) {
2322 if (uprv_tolower(*pSubtag) == PRIVATEUSE && subtagLen == 1) {
2323 char *pPrivuseVal;
2324
2325 if (pExtension != NULL) {
2326 /* Process the last extension */
2327 if (pExtValueSubtag == NULL || pExtValueSubtagEnd == NULL) {
2328 /* the previous extension is incomplete */
2329 uprv_free(pExtension);
2330 pExtension = NULL;
2331 break;
2332 } else {
2333 /* terminate the previous extension value */
2334 *pExtValueSubtagEnd = 0;
2335 pExtension->value = T_CString_toLowerCase(pExtValueSubtag);
2336
2337 /* insert the extension to the list */
2338 if (_addExtensionToList(&(t->extensions), pExtension, FALSE)) {
2339 pLastGoodPosition = pExtValueSubtagEnd;
2340 pExtension = NULL;
2341 } else {
2342 /* stop parsing here */
2343 uprv_free(pExtension);
2344 pExtension = NULL;
2345 break;
2346 }
2347 }
2348 }
2349
2350 /* The rest of part will be private use value subtags */
2351 if (pNext == NULL) {
2352 /* empty private use subtag */
2353 break;
2354 }
2355 /* back up the private use value start position */
2356 pPrivuseVal = pNext;
2357
2358 /* validate private use value subtags */
2359 while (pNext) {
2360 pSubtag = pNext;
2361 pSep = pSubtag;
2362 while (*pSep) {
2363 if (*pSep == SEP) {
2364 break;
2365 }
2366 pSep++;
2367 }
2368 if (*pSep == 0) {
2369 /* last subtag */
2370 pNext = NULL;
2371 } else {
2372 pNext = pSep + 1;
2373 }
2374 subtagLen = (int32_t)(pSep - pSubtag);
2375
2376 if (uprv_strncmp(pSubtag, PRIVUSE_VARIANT_PREFIX, uprv_strlen(PRIVUSE_VARIANT_PREFIX)) == 0) {
2377 *pSep = 0;
2378 next = VART;
2379 privateuseVar = TRUE;
2380 break;
2381 } else if (_isPrivateuseValueSubtag(pSubtag, subtagLen)) {
2382 pLastGoodPosition = pSep;
2383 } else {
2384 break;
2385 }
2386 }
2387
2388 if (next == VART) {
2389 continue;
2390 }
2391
2392 if (pLastGoodPosition - pPrivuseVal > 0) {
2393 *pLastGoodPosition = 0;
2394 t->privateuse = T_CString_toLowerCase(pPrivuseVal);
2395 }
2396 /* No more subtags, exiting the parse loop */
2397 break;
2398 }
2399 break;
2400 }
2401
2402 /* If we fell through here, it means this subtag is illegal - quit parsing */
2403 break;
2404 }
2405
2406 if (pExtension != NULL) {
2407 /* Process the last extension */
2408 if (pExtValueSubtag == NULL || pExtValueSubtagEnd == NULL) {
2409 /* the previous extension is incomplete */
2410 uprv_free(pExtension);
2411 } else {
2412 /* terminate the previous extension value */
2413 *pExtValueSubtagEnd = 0;
2414 pExtension->value = T_CString_toLowerCase(pExtValueSubtag);
2415 /* insert the extension to the list */
2416 if (_addExtensionToList(&(t->extensions), pExtension, FALSE)) {
2417 pLastGoodPosition = pExtValueSubtagEnd;
2418 } else {
2419 uprv_free(pExtension);
2420 }
2421 }
2422 }
2423
2424 if (parsedLen != NULL) {
2425 *parsedLen = (int32_t)(pLastGoodPosition - t->buf + parsedLenDelta);
2426 }
2427
2428 return t.orphan();
2429 }
2430
2431 // Ticket #12705 - Turn optimization back on.
2432 #if defined(_MSC_VER) && (_MSC_VER >= 1900) && (_MSC_VER < 1924)
2433 #pragma optimize( "", on )
2434 #endif
2435
2436 static void
ultag_close(ULanguageTag * langtag)2437 ultag_close(ULanguageTag* langtag) {
2438
2439 if (langtag == NULL) {
2440 return;
2441 }
2442
2443 uprv_free(langtag->buf);
2444
2445 if (langtag->variants) {
2446 VariantListEntry *curVar = langtag->variants;
2447 while (curVar) {
2448 VariantListEntry *nextVar = curVar->next;
2449 uprv_free(curVar);
2450 curVar = nextVar;
2451 }
2452 }
2453
2454 if (langtag->extensions) {
2455 ExtensionListEntry *curExt = langtag->extensions;
2456 while (curExt) {
2457 ExtensionListEntry *nextExt = curExt->next;
2458 uprv_free(curExt);
2459 curExt = nextExt;
2460 }
2461 }
2462
2463 uprv_free(langtag);
2464 }
2465
2466 static const char*
ultag_getLanguage(const ULanguageTag * langtag)2467 ultag_getLanguage(const ULanguageTag* langtag) {
2468 return langtag->language;
2469 }
2470
#if 0
/*
 * Disabled code. Scans DEPRECATEDLANGS two entries at a time until a NULL
 * entry; when the first entry of a pair matches the tag's language, the
 * second entry of that pair is returned. Otherwise the tag's own language
 * is returned.
 */
static const char*
ultag_getJDKLanguage(const ULanguageTag* langtag) {
    const char *lang = langtag->language;
    for (int32_t pair = 0; DEPRECATEDLANGS[pair] != NULL; pair += 2) {
        if (uprv_compareInvCharsAsAscii(DEPRECATEDLANGS[pair], lang) == 0) {
            return DEPRECATEDLANGS[pair + 1];
        }
    }
    return lang;
}
#endif
2483
2484 static const char*
ultag_getExtlang(const ULanguageTag * langtag,int32_t idx)2485 ultag_getExtlang(const ULanguageTag* langtag, int32_t idx) {
2486 if (idx >= 0 && idx < MAXEXTLANG) {
2487 return langtag->extlang[idx];
2488 }
2489 return NULL;
2490 }
2491
2492 static int32_t
ultag_getExtlangSize(const ULanguageTag * langtag)2493 ultag_getExtlangSize(const ULanguageTag* langtag) {
2494 int32_t size = 0;
2495 int32_t i;
2496 for (i = 0; i < MAXEXTLANG; i++) {
2497 if (langtag->extlang[i]) {
2498 size++;
2499 }
2500 }
2501 return size;
2502 }
2503
2504 static const char*
ultag_getScript(const ULanguageTag * langtag)2505 ultag_getScript(const ULanguageTag* langtag) {
2506 return langtag->script;
2507 }
2508
2509 static const char*
ultag_getRegion(const ULanguageTag * langtag)2510 ultag_getRegion(const ULanguageTag* langtag) {
2511 return langtag->region;
2512 }
2513
2514 static const char*
ultag_getVariant(const ULanguageTag * langtag,int32_t idx)2515 ultag_getVariant(const ULanguageTag* langtag, int32_t idx) {
2516 const char *var = NULL;
2517 VariantListEntry *cur = langtag->variants;
2518 int32_t i = 0;
2519 while (cur) {
2520 if (i == idx) {
2521 var = cur->variant;
2522 break;
2523 }
2524 cur = cur->next;
2525 i++;
2526 }
2527 return var;
2528 }
2529
2530 static int32_t
ultag_getVariantsSize(const ULanguageTag * langtag)2531 ultag_getVariantsSize(const ULanguageTag* langtag) {
2532 int32_t size = 0;
2533 VariantListEntry *cur = langtag->variants;
2534 while (TRUE) {
2535 if (cur == NULL) {
2536 break;
2537 }
2538 size++;
2539 cur = cur->next;
2540 }
2541 return size;
2542 }
2543
2544 static const char*
ultag_getExtensionKey(const ULanguageTag * langtag,int32_t idx)2545 ultag_getExtensionKey(const ULanguageTag* langtag, int32_t idx) {
2546 const char *key = NULL;
2547 ExtensionListEntry *cur = langtag->extensions;
2548 int32_t i = 0;
2549 while (cur) {
2550 if (i == idx) {
2551 key = cur->key;
2552 break;
2553 }
2554 cur = cur->next;
2555 i++;
2556 }
2557 return key;
2558 }
2559
2560 static const char*
ultag_getExtensionValue(const ULanguageTag * langtag,int32_t idx)2561 ultag_getExtensionValue(const ULanguageTag* langtag, int32_t idx) {
2562 const char *val = NULL;
2563 ExtensionListEntry *cur = langtag->extensions;
2564 int32_t i = 0;
2565 while (cur) {
2566 if (i == idx) {
2567 val = cur->value;
2568 break;
2569 }
2570 cur = cur->next;
2571 i++;
2572 }
2573 return val;
2574 }
2575
2576 static int32_t
ultag_getExtensionsSize(const ULanguageTag * langtag)2577 ultag_getExtensionsSize(const ULanguageTag* langtag) {
2578 int32_t size = 0;
2579 ExtensionListEntry *cur = langtag->extensions;
2580 while (TRUE) {
2581 if (cur == NULL) {
2582 break;
2583 }
2584 size++;
2585 cur = cur->next;
2586 }
2587 return size;
2588 }
2589
2590 static const char*
ultag_getPrivateUse(const ULanguageTag * langtag)2591 ultag_getPrivateUse(const ULanguageTag* langtag) {
2592 return langtag->privateuse;
2593 }
2594
#if 0
/* Disabled code. Returns the legacy field of the tag. */
static const char*
ultag_getLegacy(const ULanguageTag* langtag) {
    const char *legacy = langtag->legacy;
    return legacy;
}
#endif
2601
2602
2603 /*
2604 * -------------------------------------------------
2605 *
2606 * Locale/BCP47 conversion APIs, exposed as uloc_*
2607 *
2608 * -------------------------------------------------
2609 */
2610 U_CAPI int32_t U_EXPORT2
uloc_toLanguageTag(const char * localeID,char * langtag,int32_t langtagCapacity,UBool strict,UErrorCode * status)2611 uloc_toLanguageTag(const char* localeID,
2612 char* langtag,
2613 int32_t langtagCapacity,
2614 UBool strict,
2615 UErrorCode* status) {
2616 if (U_FAILURE(*status)) {
2617 return 0;
2618 }
2619
2620 icu::CheckedArrayByteSink sink(langtag, langtagCapacity);
2621 ulocimp_toLanguageTag(localeID, sink, strict, status);
2622
2623 int32_t reslen = sink.NumberOfBytesAppended();
2624
2625 if (U_FAILURE(*status)) {
2626 return reslen;
2627 }
2628
2629 if (sink.Overflowed()) {
2630 *status = U_BUFFER_OVERFLOW_ERROR;
2631 } else {
2632 u_terminateChars(langtag, langtagCapacity, reslen, status);
2633 }
2634
2635 return reslen;
2636 }
2637
2638
2639 U_CAPI void U_EXPORT2
ulocimp_toLanguageTag(const char * localeID,icu::ByteSink & sink,UBool strict,UErrorCode * status)2640 ulocimp_toLanguageTag(const char* localeID,
2641 icu::ByteSink& sink,
2642 UBool strict,
2643 UErrorCode* status) {
2644 icu::CharString canonical;
2645 int32_t reslen;
2646 UErrorCode tmpStatus = U_ZERO_ERROR;
2647 UBool hadPosix = FALSE;
2648 const char* pKeywordStart;
2649
2650 /* Note: uloc_canonicalize returns "en_US_POSIX" for input locale ID "". See #6835 */
2651 int32_t resultCapacity = static_cast<int32_t>(uprv_strlen(localeID));
2652 if (resultCapacity > 0) {
2653 char* buffer;
2654
2655 for (;;) {
2656 buffer = canonical.getAppendBuffer(
2657 /*minCapacity=*/resultCapacity,
2658 /*desiredCapacityHint=*/resultCapacity,
2659 resultCapacity,
2660 tmpStatus);
2661
2662 if (U_FAILURE(tmpStatus)) {
2663 *status = tmpStatus;
2664 return;
2665 }
2666
2667 reslen =
2668 uloc_canonicalize(localeID, buffer, resultCapacity, &tmpStatus);
2669
2670 if (tmpStatus != U_BUFFER_OVERFLOW_ERROR) {
2671 break;
2672 }
2673
2674 resultCapacity = reslen;
2675 tmpStatus = U_ZERO_ERROR;
2676 }
2677
2678 if (U_FAILURE(tmpStatus)) {
2679 *status = U_ILLEGAL_ARGUMENT_ERROR;
2680 return;
2681 }
2682
2683 canonical.append(buffer, reslen, tmpStatus);
2684 if (tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
2685 tmpStatus = U_ZERO_ERROR; // Terminators provided by CharString.
2686 }
2687
2688 if (U_FAILURE(tmpStatus)) {
2689 *status = tmpStatus;
2690 return;
2691 }
2692 }
2693
2694 /* For handling special case - private use only tag */
2695 pKeywordStart = locale_getKeywordsStart(canonical.data());
2696 if (pKeywordStart == canonical.data()) {
2697 int kwdCnt = 0;
2698 UBool done = FALSE;
2699
2700 icu::LocalUEnumerationPointer kwdEnum(uloc_openKeywords(canonical.data(), &tmpStatus));
2701 if (U_SUCCESS(tmpStatus)) {
2702 kwdCnt = uenum_count(kwdEnum.getAlias(), &tmpStatus);
2703 if (kwdCnt == 1) {
2704 const char *key;
2705 int32_t len = 0;
2706
2707 key = uenum_next(kwdEnum.getAlias(), &len, &tmpStatus);
2708 if (len == 1 && *key == PRIVATEUSE) {
2709 icu::CharString buf;
2710 {
2711 icu::CharStringByteSink sink(&buf);
2712 ulocimp_getKeywordValue(localeID, key, sink, &tmpStatus);
2713 }
2714 if (U_SUCCESS(tmpStatus)) {
2715 if (ultag_isPrivateuseValueSubtags(buf.data(), buf.length())) {
2716 /* return private use only tag */
2717 sink.Append("und-x-", 6);
2718 sink.Append(buf.data(), buf.length());
2719 done = TRUE;
2720 } else if (strict) {
2721 *status = U_ILLEGAL_ARGUMENT_ERROR;
2722 done = TRUE;
2723 }
2724 /* if not strict mode, then "und" will be returned */
2725 } else {
2726 *status = U_ILLEGAL_ARGUMENT_ERROR;
2727 done = TRUE;
2728 }
2729 }
2730 }
2731 if (done) {
2732 return;
2733 }
2734 }
2735 }
2736
2737 _appendLanguageToLanguageTag(canonical.data(), sink, strict, status);
2738 _appendScriptToLanguageTag(canonical.data(), sink, strict, status);
2739 _appendRegionToLanguageTag(canonical.data(), sink, strict, status);
2740 _appendVariantsToLanguageTag(canonical.data(), sink, strict, &hadPosix, status);
2741 _appendKeywordsToLanguageTag(canonical.data(), sink, strict, hadPosix, status);
2742 _appendPrivateuseToLanguageTag(canonical.data(), sink, strict, hadPosix, status);
2743 }
2744
2745
2746 U_CAPI int32_t U_EXPORT2
uloc_forLanguageTag(const char * langtag,char * localeID,int32_t localeIDCapacity,int32_t * parsedLength,UErrorCode * status)2747 uloc_forLanguageTag(const char* langtag,
2748 char* localeID,
2749 int32_t localeIDCapacity,
2750 int32_t* parsedLength,
2751 UErrorCode* status) {
2752 if (U_FAILURE(*status)) {
2753 return 0;
2754 }
2755
2756 icu::CheckedArrayByteSink sink(localeID, localeIDCapacity);
2757 ulocimp_forLanguageTag(langtag, -1, sink, parsedLength, status);
2758
2759 int32_t reslen = sink.NumberOfBytesAppended();
2760
2761 if (U_FAILURE(*status)) {
2762 return reslen;
2763 }
2764
2765 if (sink.Overflowed()) {
2766 *status = U_BUFFER_OVERFLOW_ERROR;
2767 } else {
2768 u_terminateChars(localeID, localeIDCapacity, reslen, status);
2769 }
2770
2771 return reslen;
2772 }
2773
2774
2775 U_CAPI void U_EXPORT2
ulocimp_forLanguageTag(const char * langtag,int32_t tagLen,icu::ByteSink & sink,int32_t * parsedLength,UErrorCode * status)2776 ulocimp_forLanguageTag(const char* langtag,
2777 int32_t tagLen,
2778 icu::ByteSink& sink,
2779 int32_t* parsedLength,
2780 UErrorCode* status) {
2781 UBool isEmpty = TRUE;
2782 const char *subtag, *p;
2783 int32_t len;
2784 int32_t i, n;
2785 UBool noRegion = TRUE;
2786
2787 icu::LocalULanguageTagPointer lt(ultag_parse(langtag, tagLen, parsedLength, status));
2788 if (U_FAILURE(*status)) {
2789 return;
2790 }
2791
2792 /* language */
2793 subtag = ultag_getExtlangSize(lt.getAlias()) > 0 ? ultag_getExtlang(lt.getAlias(), 0) : ultag_getLanguage(lt.getAlias());
2794 if (uprv_compareInvCharsAsAscii(subtag, LANG_UND) != 0) {
2795 len = (int32_t)uprv_strlen(subtag);
2796 if (len > 0) {
2797 sink.Append(subtag, len);
2798 isEmpty = FALSE;
2799 }
2800 }
2801
2802 /* script */
2803 subtag = ultag_getScript(lt.getAlias());
2804 len = (int32_t)uprv_strlen(subtag);
2805 if (len > 0) {
2806 sink.Append("_", 1);
2807 isEmpty = FALSE;
2808
2809 /* write out the script in title case */
2810 char c = uprv_toupper(*subtag);
2811 sink.Append(&c, 1);
2812 sink.Append(subtag + 1, len - 1);
2813 }
2814
2815 /* region */
2816 subtag = ultag_getRegion(lt.getAlias());
2817 len = (int32_t)uprv_strlen(subtag);
2818 if (len > 0) {
2819 sink.Append("_", 1);
2820 isEmpty = FALSE;
2821
2822 /* write out the region in upper case */
2823 p = subtag;
2824 while (*p) {
2825 char c = uprv_toupper(*p);
2826 sink.Append(&c, 1);
2827 p++;
2828 }
2829 noRegion = FALSE;
2830 }
2831
2832 /* variants */
2833 _sortVariants(lt.getAlias()->variants);
2834 n = ultag_getVariantsSize(lt.getAlias());
2835 if (n > 0) {
2836 if (noRegion) {
2837 sink.Append("_", 1);
2838 isEmpty = FALSE;
2839 }
2840
2841 for (i = 0; i < n; i++) {
2842 subtag = ultag_getVariant(lt.getAlias(), i);
2843 sink.Append("_", 1);
2844
2845 /* write out the variant in upper case */
2846 p = subtag;
2847 while (*p) {
2848 char c = uprv_toupper(*p);
2849 sink.Append(&c, 1);
2850 p++;
2851 }
2852 }
2853 }
2854
2855 /* keywords */
2856 n = ultag_getExtensionsSize(lt.getAlias());
2857 subtag = ultag_getPrivateUse(lt.getAlias());
2858 if (n > 0 || uprv_strlen(subtag) > 0) {
2859 if (isEmpty && n > 0) {
2860 /* need a language */
2861 sink.Append(LANG_UND, LANG_UND_LEN);
2862 }
2863 _appendKeywords(lt.getAlias(), sink, status);
2864 }
2865 }
2866