1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 1997-2016, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  loclikely.cpp
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2010feb25
16 *   created by: Markus W. Scherer
17 *
18 *   Code for likely and minimized locale subtags, separated out from other .cpp files
19 *   that then do not depend on resource bundle code and likely-subtags data.
20 */
21 
22 #include "unicode/bytestream.h"
23 #include "unicode/utypes.h"
24 #include "unicode/locid.h"
25 #include "unicode/putil.h"
26 #include "unicode/uchar.h"
27 #include "unicode/uloc.h"
28 #include "unicode/ures.h"
29 #include "unicode/uscript.h"
30 #include "bytesinkutil.h"
31 #include "charstr.h"
32 #include "cmemory.h"
33 #include "cstring.h"
34 #include "ulocimp.h"
35 #include "ustr_imp.h"
36 
/**
 * Canonical subtag strings that stand for an unknown language, script and
 * region.  The lookup and parsing helpers in this file compare against these
 * values (case-insensitively) to detect and suppress "unknown" subtags.
 **/
static const char* const unknownLanguage = "und";
static const char* const unknownScript = "Zzzz";
static const char* const unknownRegion = "ZZ";
43 
44 /**
45  * This function looks for the localeID in the likelySubtags resource.
46  *
47  * @param localeID The tag to find.
48  * @param buffer A buffer to hold the matching entry
49  * @param bufferLength The length of the output buffer
50  * @return A pointer to "buffer" if found, or a null pointer if not.
51  */
52 static const char*  U_CALLCONV
findLikelySubtags(const char * localeID,char * buffer,int32_t bufferLength,UErrorCode * err)53 findLikelySubtags(const char* localeID,
54                   char* buffer,
55                   int32_t bufferLength,
56                   UErrorCode* err) {
57     const char* result = NULL;
58 
59     if (!U_FAILURE(*err)) {
60         int32_t resLen = 0;
61         const UChar* s = NULL;
62         UErrorCode tmpErr = U_ZERO_ERROR;
63         icu::LocalUResourceBundlePointer subtags(ures_openDirect(NULL, "likelySubtags", &tmpErr));
64         if (U_SUCCESS(tmpErr)) {
65             icu::CharString und;
66             if (localeID != NULL) {
67                 if (*localeID == '\0') {
68                     localeID = unknownLanguage;
69                 } else if (*localeID == '_') {
70                     und.append(unknownLanguage, *err);
71                     und.append(localeID, *err);
72                     if (U_FAILURE(*err)) {
73                         return NULL;
74                     }
75                     localeID = und.data();
76                 }
77             }
78             s = ures_getStringByKey(subtags.getAlias(), localeID, &resLen, &tmpErr);
79 
80             if (U_FAILURE(tmpErr)) {
81                 /*
82                  * If a resource is missing, it's not really an error, it's
83                  * just that we don't have any data for that particular locale ID.
84                  */
85                 if (tmpErr != U_MISSING_RESOURCE_ERROR) {
86                     *err = tmpErr;
87                 }
88             }
89             else if (resLen >= bufferLength) {
90                 /* The buffer should never overflow. */
91                 *err = U_INTERNAL_PROGRAM_ERROR;
92             }
93             else {
94                 u_UCharsToChars(s, buffer, resLen + 1);
95                 if (resLen >= 3 &&
96                     uprv_strnicmp(buffer, unknownLanguage, 3) == 0 &&
97                     (resLen == 3 || buffer[3] == '_')) {
98                     uprv_memmove(buffer, buffer + 3, resLen - 3 + 1);
99                 }
100                 result = buffer;
101             }
102         } else {
103             *err = tmpErr;
104         }
105     }
106 
107     return result;
108 }
109 
110 /**
111  * Append a tag to a buffer, adding the separator if necessary.  The buffer
112  * must be large enough to contain the resulting tag plus any separator
113  * necessary. The tag must not be a zero-length string.
114  *
115  * @param tag The tag to add.
116  * @param tagLength The length of the tag.
117  * @param buffer The output buffer.
118  * @param bufferLength The length of the output buffer.  This is an input/ouput parameter.
119  **/
120 static void U_CALLCONV
appendTag(const char * tag,int32_t tagLength,char * buffer,int32_t * bufferLength,UBool withSeparator)121 appendTag(
122     const char* tag,
123     int32_t tagLength,
124     char* buffer,
125     int32_t* bufferLength,
126     UBool withSeparator) {
127 
128     if (withSeparator) {
129         buffer[*bufferLength] = '_';
130         ++(*bufferLength);
131     }
132 
133     uprv_memmove(
134         &buffer[*bufferLength],
135         tag,
136         tagLength);
137 
138     *bufferLength += tagLength;
139 }
140 
/**
 * Create a tag string from the supplied parameters and append it to a sink.
 * The lang, script and region parameters may be NULL pointers. If they are,
 * their corresponding length parameters must be less than or equal to 0.
 *
 * If any of the language, script or region parameters are empty, and the alternateTags
 * parameter is not NULL, it will be parsed for the corresponding language, script or
 * region subtag to be used when constructing the new tag.  If the alternateTags
 * parameter is NULL, or it contains no language tag, the language part is left empty
 * (the unknown language is represented by the empty string, not by "und").
 *
 * The language/script/region part is assembled in a local buffer and appended to
 * the sink in one piece.  If trailingLength is greater than 0, the trailing data is
 * appended afterwards; unless it starts with '@', it is preceded by a '_' separator,
 * and by a second '_' when no region was written.
 *
 * If an explicit subtag length is not smaller than its ULOC_*_CAPACITY, or a subtag
 * parsed from alternateTags fails to parse or does not fit, nothing is appended and
 * the function sets U_ILLEGAL_ARGUMENT_ERROR.  A U_BUFFER_OVERFLOW_ERROR (including
 * one already present in *err on entry) is also converted to
 * U_ILLEGAL_ARGUMENT_ERROR; any other failure code is left unchanged.
 *
 * @param lang The language tag to use.
 * @param langLength The length of the language tag.
 * @param script The script tag to use.
 * @param scriptLength The length of the script tag.
 * @param region The region tag to use.
 * @param regionLength The length of the region tag.
 * @param trailing Any trailing data to append to the new tag.
 * @param trailingLength The length of the trailing data.
 * @param alternateTags A string containing any alternate tags.
 * @param sink The output sink receiving the tag string.
 * @param err A pointer to a UErrorCode for error reporting.
 **/
static void U_CALLCONV
createTagStringWithAlternates(
    const char* lang,
    int32_t langLength,
    const char* script,
    int32_t scriptLength,
    const char* region,
    int32_t regionLength,
    const char* trailing,
    int32_t trailingLength,
    const char* alternateTags,
    icu::ByteSink& sink,
    UErrorCode* err) {

    if (U_FAILURE(*err)) {
        goto error;
    }
    else if (langLength >= ULOC_LANG_CAPACITY ||
             scriptLength >= ULOC_SCRIPT_CAPACITY ||
             regionLength >= ULOC_COUNTRY_CAPACITY) {
        /* An over-long subtag means the input is ill-formed. */
        goto error;
    }
    else {
        /**
         * ULOC_FULLNAME_CAPACITY will provide enough capacity
         * that we can build a string that contains the language,
         * script and region code without worrying about overrunning
         * the local buffer (each subtag length was bounded above).
         **/
        char tagBuffer[ULOC_FULLNAME_CAPACITY];
        int32_t tagLength = 0;
        /* Remembers whether a region was written; decides the separator count below. */
        UBool regionAppended = FALSE;

        if (langLength > 0) {
            appendTag(
                lang,
                langLength,
                tagBuffer,
                &tagLength,
                /*withSeparator=*/FALSE);
        }
        else if (alternateTags == NULL) {
            /*
             * Use the empty string for an unknown language, if
             * we found no language.
             */
        }
        else {
            /*
             * Parse the alternateTags string for the language.
             */
            char alternateLang[ULOC_LANG_CAPACITY];
            int32_t alternateLangLength = sizeof(alternateLang);

            alternateLangLength =
                uloc_getLanguage(
                    alternateTags,
                    alternateLang,
                    alternateLangLength,
                    err);
            if(U_FAILURE(*err) ||
                alternateLangLength >= ULOC_LANG_CAPACITY) {
                goto error;
            }
            else if (alternateLangLength == 0) {
                /*
                 * Use the empty string for an unknown language, if
                 * we found no language.
                 */
            }
            else {
                appendTag(
                    alternateLang,
                    alternateLangLength,
                    tagBuffer,
                    &tagLength,
                    /*withSeparator=*/FALSE);
            }
        }

        if (scriptLength > 0) {
            appendTag(
                script,
                scriptLength,
                tagBuffer,
                &tagLength,
                /*withSeparator=*/TRUE);
        }
        else if (alternateTags != NULL) {
            /*
             * Parse the alternateTags string for the script.
             */
            char alternateScript[ULOC_SCRIPT_CAPACITY];

            const int32_t alternateScriptLength =
                uloc_getScript(
                    alternateTags,
                    alternateScript,
                    sizeof(alternateScript),
                    err);

            if (U_FAILURE(*err) ||
                alternateScriptLength >= ULOC_SCRIPT_CAPACITY) {
                goto error;
            }
            else if (alternateScriptLength > 0) {
                appendTag(
                    alternateScript,
                    alternateScriptLength,
                    tagBuffer,
                    &tagLength,
                    /*withSeparator=*/TRUE);
            }
        }

        if (regionLength > 0) {
            appendTag(
                region,
                regionLength,
                tagBuffer,
                &tagLength,
                /*withSeparator=*/TRUE);

            regionAppended = TRUE;
        }
        else if (alternateTags != NULL) {
            /*
             * Parse the alternateTags string for the region.
             */
            char alternateRegion[ULOC_COUNTRY_CAPACITY];

            const int32_t alternateRegionLength =
                uloc_getCountry(
                    alternateTags,
                    alternateRegion,
                    sizeof(alternateRegion),
                    err);
            if (U_FAILURE(*err) ||
                alternateRegionLength >= ULOC_COUNTRY_CAPACITY) {
                goto error;
            }
            else if (alternateRegionLength > 0) {
                appendTag(
                    alternateRegion,
                    alternateRegionLength,
                    tagBuffer,
                    &tagLength,
                    /*withSeparator=*/TRUE);

                regionAppended = TRUE;
            }
        }

        /**
         * Copy the partial tag from our internal buffer to the supplied
         * sink.
         **/
        sink.Append(tagBuffer, tagLength);

        if (trailingLength > 0) {
            /* Trailing data that begins with '@' is appended without a separator. */
            if (*trailing != '@') {
                sink.Append("_", 1);
                if (!regionAppended) {
                    /* extra separator is required */
                    sink.Append("_", 1);
                }
            }

            /*
             * Copy the trailing data into the supplied sink.
             */
            sink.Append(trailing, trailingLength);
        }

        return;
    }

error:

    /**
     * An overflow indicates the locale ID passed in
     * is ill-formed.  If we got here, and there was
     * no previous error, it's an implicit overflow.
     **/
    if (*err ==  U_BUFFER_OVERFLOW_ERROR ||
        U_SUCCESS(*err)) {
        *err = U_ILLEGAL_ARGUMENT_ERROR;
    }
}
362 
/**
 * Create a tag string from the supplied parameters and append it to a sink.
 * The lang, script and region parameters may be NULL pointers. If they are,
 * their corresponding length parameters must be less than or equal to 0.
 *
 * This is a convenience wrapper that forwards to createTagStringWithAlternates()
 * with a NULL alternateTags argument, so no alternate subtags are consulted: a
 * missing language, script or region is simply left out of the result.
 *
 * If an illegal argument is provided, the function returns the error
 * U_ILLEGAL_ARGUMENT_ERROR.
 *
 * @param lang The language tag to use.
 * @param langLength The length of the language tag.
 * @param script The script tag to use.
 * @param scriptLength The length of the script tag.
 * @param region The region tag to use.
 * @param regionLength The length of the region tag.
 * @param trailing Any trailing data to append to the new tag.
 * @param trailingLength The length of the trailing data.
 * @param sink The output sink receiving the tag string.
 * @param err A pointer to a UErrorCode for error reporting.
 **/
static void U_CALLCONV
createTagString(
    const char* lang,
    int32_t langLength,
    const char* script,
    int32_t scriptLength,
    const char* region,
    int32_t regionLength,
    const char* trailing,
    int32_t trailingLength,
    icu::ByteSink& sink,
    UErrorCode* err)
{
    createTagStringWithAlternates(
                lang,
                langLength,
                script,
                scriptLength,
                region,
                regionLength,
                trailing,
                trailingLength,
                NULL,
                sink,
                err);
}
413 
/**
 * Parse the language, script, and region subtags from a tag string, and copy the
 * results into the corresponding output parameters.
 *
 * The langLength, scriptLength, and regionLength parameters are input/output
 * parameters, and must contain the capacity of their corresponding buffers on
 * input.  On output, they will contain the actual length of the subtags, not
 * including any null terminator.
 *
 * A script equal to the "unknown" script (Zzzz) or a region equal to the
 * "unknown" region (ZZ), compared case-insensitively, is reported with a
 * length of 0.
 *
 * Parsing stops at the first subtag that produces a failure in *err; later
 * length parameters are then left at their input values.
 *
 * If an illegal argument is provided (any pointer argument is NULL), the
 * function returns the error U_ILLEGAL_ARGUMENT_ERROR.  A failure already
 * present in *err on entry is left unchanged.
 *
 * @param localeID The locale ID to parse.
 * @param lang The language tag buffer.
 * @param langLength The length of the language tag.
 * @param script The script tag buffer.
 * @param scriptLength The length of the script tag.
 * @param region The region tag buffer.
 * @param regionLength The length of the region tag.
 * @param err A pointer to a UErrorCode for error reporting.
 * @return The number of chars of the localeID parameter consumed.  On error
 *         this is the offset reached before the failure was detected.
 **/
static int32_t U_CALLCONV
parseTagString(
    const char* localeID,
    char* lang,
    int32_t* langLength,
    char* script,
    int32_t* scriptLength,
    char* region,
    int32_t* regionLength,
    UErrorCode* err)
{
    const char* position = localeID;
    int32_t subtagLength = 0;

    if(U_FAILURE(*err) ||
       localeID == NULL ||
       lang == NULL ||
       langLength == NULL ||
       script == NULL ||
       scriptLength == NULL ||
       region == NULL ||
       regionLength == NULL) {
        goto error;
    }

    subtagLength = ulocimp_getLanguage(position, &position, *err).extract(lang, *langLength, *err);

    /*
     * A failure here indicates the user-supplied tag is not well-formed.
     *
     * NOTE(review): this comment used to state that
     * U_STRING_NOT_TERMINATED_WARNING is explicitly treated as an error,
     * but only U_FAILURE(*err) is tested below - confirm whether warnings
     * are meant to be rejected as well.
     */
    if(U_FAILURE(*err)) {
        goto error;
    }

    *langLength = subtagLength;

    /*
     * If no language was present, use the empty string instead.
     * Otherwise, move past any separator.
     */
    if (_isIDSeparator(*position)) {
        ++position;
    }

    subtagLength = ulocimp_getScript(position, &position, *err).extract(script, *scriptLength, *err);

    if(U_FAILURE(*err)) {
        goto error;
    }

    *scriptLength = subtagLength;

    if (*scriptLength > 0) {
        if (uprv_strnicmp(script, unknownScript, *scriptLength) == 0) {
            /**
             * If the script part is the "unknown" script, then don't return it.
             **/
            *scriptLength = 0;
        }

        /*
         * Move past any separator.
         */
        if (_isIDSeparator(*position)) {
            ++position;
        }
    }

    subtagLength = ulocimp_getCountry(position, &position, *err).extract(region, *regionLength, *err);

    if(U_FAILURE(*err)) {
        goto error;
    }

    *regionLength = subtagLength;

    if (*regionLength > 0) {
        if (uprv_strnicmp(region, unknownRegion, *regionLength) == 0) {
            /**
             * If the region part is the "unknown" region, then don't return it.
             **/
            *regionLength = 0;
        }
    } else if (*position != 0 && *position != '@') {
        /*
         * No region was found but more text follows: back up over the
         * consumed trailing separator so the reported offset points at it.
         */
        --position;
    }

exit:

    /* Both the success path and the error path report the offset reached. */
    return (int32_t)(position - localeID);

error:

    /**
     * If we get here, we have no explicit error, it's the result of an
     * illegal argument.
     **/
    if (!U_FAILURE(*err)) {
        *err = U_ILLEGAL_ARGUMENT_ERROR;
    }

    goto exit;
}
548 
/**
 * Look up the likely subtags for the supplied language, script and region,
 * and append the resulting (maximized) tag to a sink.
 *
 * Up to four lookups are attempted via findLikelySubtags(), from most to
 * least specific, stopping at the first one that finds an entry:
 *   1. language + script + region (only if both script and region are given)
 *   2. language + script          (only if a script is given)
 *   3. language + region          (only if a region is given)
 *   4. language alone
 * On a match, the output is built with createTagStringWithAlternates(): the
 * language always comes from the matched entry, subtags that were NOT part of
 * the successful lookup key are taken from the caller's arguments, and any
 * subtag still missing is filled in from the matched entry.  The variants are
 * appended as trailing data.
 *
 * @param lang The language tag to use.
 * @param langLength The length of the language tag.
 * @param script The script tag to use.
 * @param scriptLength The length of the script tag.
 * @param region The region tag to use.
 * @param regionLength The length of the region tag.
 * @param variants Any trailing data (variants, keywords) to append.
 * @param variantsLength The length of the trailing data.
 * @param sink The output sink receiving the tag string.
 * @param err A pointer to a UErrorCode for error reporting.
 * @return TRUE if a lookup matched and the tag was handed to
 *         createTagStringWithAlternates(); note that *err is not re-checked
 *         after that call, so callers must still test *err.  FALSE if no
 *         lookup matched (nothing is appended to the sink) or an error
 *         occurred before a match was found.
 **/
static UBool U_CALLCONV
createLikelySubtagsString(
    const char* lang,
    int32_t langLength,
    const char* script,
    int32_t scriptLength,
    const char* region,
    int32_t regionLength,
    const char* variants,
    int32_t variantsLength,
    icu::ByteSink& sink,
    UErrorCode* err) {
    /**
     * Receives the entry found by findLikelySubtags(), which reports
     * U_INTERNAL_PROGRAM_ERROR if an entry does not fit into
     * ULOC_FULLNAME_CAPACITY chars.
     **/
    char likelySubtagsBuffer[ULOC_FULLNAME_CAPACITY];

    if(U_FAILURE(*err)) {
        goto error;
    }

    /**
     * Try the language with the script and region first.
     **/
    if (scriptLength > 0 && regionLength > 0) {

        const char* likelySubtags = NULL;

        icu::CharString tagBuffer;
        {
            /* This local sink shadows the function parameter inside this scope. */
            icu::CharStringByteSink sink(&tagBuffer);
            createTagString(
                lang,
                langLength,
                script,
                scriptLength,
                region,
                regionLength,
                NULL,
                0,
                sink,
                err);
        }
        if(U_FAILURE(*err)) {
            goto error;
        }

        likelySubtags =
            findLikelySubtags(
                tagBuffer.data(),
                likelySubtagsBuffer,
                sizeof(likelySubtagsBuffer),
                err);
        if(U_FAILURE(*err)) {
            goto error;
        }

        if (likelySubtags != NULL) {
            /* Always use the language tag from the
               maximal string, since it may be more
               specific than the one provided. */
            createTagStringWithAlternates(
                        NULL,
                        0,
                        NULL,
                        0,
                        NULL,
                        0,
                        variants,
                        variantsLength,
                        likelySubtags,
                        sink,
                        err);
            return TRUE;
        }
    }

    /**
     * Try the language with just the script.
     **/
    if (scriptLength > 0) {

        const char* likelySubtags = NULL;

        icu::CharString tagBuffer;
        {
            /* This local sink shadows the function parameter inside this scope. */
            icu::CharStringByteSink sink(&tagBuffer);
            createTagString(
                lang,
                langLength,
                script,
                scriptLength,
                NULL,
                0,
                NULL,
                0,
                sink,
                err);
        }
        if(U_FAILURE(*err)) {
            goto error;
        }

        likelySubtags =
            findLikelySubtags(
                tagBuffer.data(),
                likelySubtagsBuffer,
                sizeof(likelySubtagsBuffer),
                err);
        if(U_FAILURE(*err)) {
            goto error;
        }

        if (likelySubtags != NULL) {
            /* Always use the language tag from the
               maximal string, since it may be more
               specific than the one provided.
               The region was not part of the lookup key,
               so the caller's region is kept. */
            createTagStringWithAlternates(
                        NULL,
                        0,
                        NULL,
                        0,
                        region,
                        regionLength,
                        variants,
                        variantsLength,
                        likelySubtags,
                        sink,
                        err);
            return TRUE;
        }
    }

    /**
     * Try the language with just the region.
     **/
    if (regionLength > 0) {

        const char* likelySubtags = NULL;

        icu::CharString tagBuffer;
        {
            /* This local sink shadows the function parameter inside this scope. */
            icu::CharStringByteSink sink(&tagBuffer);
            createTagString(
                lang,
                langLength,
                NULL,
                0,
                region,
                regionLength,
                NULL,
                0,
                sink,
                err);
        }
        if(U_FAILURE(*err)) {
            goto error;
        }

        likelySubtags =
            findLikelySubtags(
                tagBuffer.data(),
                likelySubtagsBuffer,
                sizeof(likelySubtagsBuffer),
                err);
        if(U_FAILURE(*err)) {
            goto error;
        }

        if (likelySubtags != NULL) {
            /* Always use the language tag from the
               maximal string, since it may be more
               specific than the one provided.
               The script was not part of the lookup key,
               so the caller's script is kept. */
            createTagStringWithAlternates(
                        NULL,
                        0,
                        script,
                        scriptLength,
                        NULL,
                        0,
                        variants,
                        variantsLength,
                        likelySubtags,
                        sink,
                        err);
            return TRUE;
        }
    }

    /**
     * Finally, try just the language.
     **/
    {
        const char* likelySubtags = NULL;

        icu::CharString tagBuffer;
        {
            /* This local sink shadows the function parameter inside this scope. */
            icu::CharStringByteSink sink(&tagBuffer);
            createTagString(
                lang,
                langLength,
                NULL,
                0,
                NULL,
                0,
                NULL,
                0,
                sink,
                err);
        }
        if(U_FAILURE(*err)) {
            goto error;
        }

        likelySubtags =
            findLikelySubtags(
                tagBuffer.data(),
                likelySubtagsBuffer,
                sizeof(likelySubtagsBuffer),
                err);
        if(U_FAILURE(*err)) {
            goto error;
        }

        if (likelySubtags != NULL) {
            /* Always use the language tag from the
               maximal string, since it may be more
               specific than the one provided.
               Neither script nor region was part of the
               lookup key, so the caller's values are kept. */
            createTagStringWithAlternates(
                        NULL,
                        0,
                        script,
                        scriptLength,
                        region,
                        regionLength,
                        variants,
                        variantsLength,
                        likelySubtags,
                        sink,
                        err);
            return TRUE;
        }
    }

    /* No lookup matched; nothing has been appended to the caller's sink. */
    return FALSE;

error:

    if (!U_FAILURE(*err)) {
        *err = U_ILLEGAL_ARGUMENT_ERROR;
    }

    return FALSE;
}
806 
/**
 * Reject a trailing (variant) portion of a locale ID that contains an
 * over-long subtag.
 *
 * The macro scans `trailing` up to `trailingLength` chars, restarting the
 * subtag length counter at every '-' or '_' and stopping at '@' (where the
 * keywords begin).  The length test (count > 8) runs before the counter is
 * incremented, so a subtag is rejected when it reaches its tenth character.
 *
 * On rejection the macro executes `goto error;`, so it may only be used in a
 * function that defines an `error` label.  The arguments are evaluated more
 * than once and must therefore be free of side effects.
 *
 * @param trailing The trailing part of the locale ID to check.
 * @param trailingLength The number of chars of `trailing` to examine.
 */
#define CHECK_TRAILING_VARIANT_SIZE(trailing, trailingLength) UPRV_BLOCK_MACRO_BEGIN { \
    int32_t count = 0; \
    int32_t i; \
    for (i = 0; i < (trailingLength); i++) { \
        if ((trailing)[i] == '-' || (trailing)[i] == '_') { \
            count = 0; \
        } else if ((trailing)[i] == '@') { \
            break; \
        } else if (count > 8) { \
            goto error; \
        } else { \
            count++; \
        } \
    } \
} UPRV_BLOCK_MACRO_END
825 
826 static UBool
_uloc_addLikelySubtags(const char * localeID,icu::ByteSink & sink,UErrorCode * err)827 _uloc_addLikelySubtags(const char* localeID,
828                        icu::ByteSink& sink,
829                        UErrorCode* err) {
830     char lang[ULOC_LANG_CAPACITY];
831     int32_t langLength = sizeof(lang);
832     char script[ULOC_SCRIPT_CAPACITY];
833     int32_t scriptLength = sizeof(script);
834     char region[ULOC_COUNTRY_CAPACITY];
835     int32_t regionLength = sizeof(region);
836     const char* trailing = "";
837     int32_t trailingLength = 0;
838     int32_t trailingIndex = 0;
839     UBool success = FALSE;
840 
841     if(U_FAILURE(*err)) {
842         goto error;
843     }
844     if (localeID == NULL) {
845         goto error;
846     }
847 
848     trailingIndex = parseTagString(
849         localeID,
850         lang,
851         &langLength,
852         script,
853         &scriptLength,
854         region,
855         &regionLength,
856         err);
857     if(U_FAILURE(*err)) {
858         /* Overflow indicates an illegal argument error */
859         if (*err == U_BUFFER_OVERFLOW_ERROR) {
860             *err = U_ILLEGAL_ARGUMENT_ERROR;
861         }
862 
863         goto error;
864     }
865 
866     /* Find the length of the trailing portion. */
867     while (_isIDSeparator(localeID[trailingIndex])) {
868         trailingIndex++;
869     }
870     trailing = &localeID[trailingIndex];
871     trailingLength = (int32_t)uprv_strlen(trailing);
872 
873     CHECK_TRAILING_VARIANT_SIZE(trailing, trailingLength);
874 
875     success =
876         createLikelySubtagsString(
877             lang,
878             langLength,
879             script,
880             scriptLength,
881             region,
882             regionLength,
883             trailing,
884             trailingLength,
885             sink,
886             err);
887 
888     if (!success) {
889         const int32_t localIDLength = (int32_t)uprv_strlen(localeID);
890 
891         /*
892          * If we get here, we need to return localeID.
893          */
894         sink.Append(localeID, localIDLength);
895     }
896 
897     return success;
898 
899 error:
900 
901     if (!U_FAILURE(*err)) {
902         *err = U_ILLEGAL_ARGUMENT_ERROR;
903     }
904     return FALSE;
905 }
906 
// Add likely subtags to the sink.
// Returns true if the value in the sink was produced by a match during the lookup.
// Returns false if the value in the sink is the same as the input because the
// lookup found no match.
static UBool _ulocimp_addLikelySubtags(const char*, icu::ByteSink&, UErrorCode*);
912 
913 static void
_uloc_minimizeSubtags(const char * localeID,icu::ByteSink & sink,UErrorCode * err)914 _uloc_minimizeSubtags(const char* localeID,
915                       icu::ByteSink& sink,
916                       UErrorCode* err) {
917     icu::CharString maximizedTagBuffer;
918 
919     char lang[ULOC_LANG_CAPACITY];
920     int32_t langLength = sizeof(lang);
921     char script[ULOC_SCRIPT_CAPACITY];
922     int32_t scriptLength = sizeof(script);
923     char region[ULOC_COUNTRY_CAPACITY];
924     int32_t regionLength = sizeof(region);
925     const char* trailing = "";
926     int32_t trailingLength = 0;
927     int32_t trailingIndex = 0;
928     UBool successGetMax = FALSE;
929 
930     if(U_FAILURE(*err)) {
931         goto error;
932     }
933     else if (localeID == NULL) {
934         goto error;
935     }
936 
937     trailingIndex =
938         parseTagString(
939             localeID,
940             lang,
941             &langLength,
942             script,
943             &scriptLength,
944             region,
945             &regionLength,
946             err);
947     if(U_FAILURE(*err)) {
948 
949         /* Overflow indicates an illegal argument error */
950         if (*err == U_BUFFER_OVERFLOW_ERROR) {
951             *err = U_ILLEGAL_ARGUMENT_ERROR;
952         }
953 
954         goto error;
955     }
956 
957     /* Find the spot where the variants or the keywords begin, if any. */
958     while (_isIDSeparator(localeID[trailingIndex])) {
959         trailingIndex++;
960     }
961     trailing = &localeID[trailingIndex];
962     trailingLength = (int32_t)uprv_strlen(trailing);
963 
964     CHECK_TRAILING_VARIANT_SIZE(trailing, trailingLength);
965 
966     {
967         icu::CharString base;
968         {
969             icu::CharStringByteSink baseSink(&base);
970             createTagString(
971                 lang,
972                 langLength,
973                 script,
974                 scriptLength,
975                 region,
976                 regionLength,
977                 NULL,
978                 0,
979                 baseSink,
980                 err);
981         }
982 
983         /**
984          * First, we need to first get the maximization
985          * from AddLikelySubtags.
986          **/
987         {
988             icu::CharStringByteSink maxSink(&maximizedTagBuffer);
989             successGetMax = _ulocimp_addLikelySubtags(base.data(), maxSink, err);
990         }
991     }
992 
993     if(U_FAILURE(*err)) {
994         goto error;
995     }
996 
997     if (!successGetMax) {
998         /**
999          * If we got here, return the locale ID parameter unchanged.
1000          **/
1001         const int32_t localeIDLength = (int32_t)uprv_strlen(localeID);
1002         sink.Append(localeID, localeIDLength);
1003         return;
1004     }
1005 
1006     // In the following, the lang, script, region are referring to those in
1007     // the maximizedTagBuffer, not the one in the localeID.
1008     langLength = sizeof(lang);
1009     scriptLength = sizeof(script);
1010     regionLength = sizeof(region);
1011     parseTagString(
1012         maximizedTagBuffer.data(),
1013         lang,
1014         &langLength,
1015         script,
1016         &scriptLength,
1017         region,
1018         &regionLength,
1019         err);
1020     if(U_FAILURE(*err)) {
1021         goto error;
1022     }
1023 
1024     /**
1025      * Start first with just the language.
1026      **/
1027     {
1028         icu::CharString tagBuffer;
1029         {
1030             icu::CharStringByteSink tagSink(&tagBuffer);
1031             createLikelySubtagsString(
1032                 lang,
1033                 langLength,
1034                 NULL,
1035                 0,
1036                 NULL,
1037                 0,
1038                 NULL,
1039                 0,
1040                 tagSink,
1041                 err);
1042         }
1043 
1044         if(U_FAILURE(*err)) {
1045             goto error;
1046         }
1047         else if (!tagBuffer.isEmpty() &&
1048                  uprv_strnicmp(
1049                     maximizedTagBuffer.data(),
1050                     tagBuffer.data(),
1051                     tagBuffer.length()) == 0) {
1052 
1053             createTagString(
1054                         lang,
1055                         langLength,
1056                         NULL,
1057                         0,
1058                         NULL,
1059                         0,
1060                         trailing,
1061                         trailingLength,
1062                         sink,
1063                         err);
1064             return;
1065         }
1066     }
1067 
1068     /**
1069      * Next, try the language and region.
1070      **/
1071     if (regionLength > 0) {
1072 
1073         icu::CharString tagBuffer;
1074         {
1075             icu::CharStringByteSink tagSink(&tagBuffer);
1076             createLikelySubtagsString(
1077                 lang,
1078                 langLength,
1079                 NULL,
1080                 0,
1081                 region,
1082                 regionLength,
1083                 NULL,
1084                 0,
1085                 tagSink,
1086                 err);
1087         }
1088 
1089         if(U_FAILURE(*err)) {
1090             goto error;
1091         }
1092         else if (!tagBuffer.isEmpty() &&
1093                  uprv_strnicmp(
1094                     maximizedTagBuffer.data(),
1095                     tagBuffer.data(),
1096                     tagBuffer.length()) == 0) {
1097 
1098             createTagString(
1099                         lang,
1100                         langLength,
1101                         NULL,
1102                         0,
1103                         region,
1104                         regionLength,
1105                         trailing,
1106                         trailingLength,
1107                         sink,
1108                         err);
1109             return;
1110         }
1111     }
1112 
1113     /**
1114      * Finally, try the language and script.  This is our last chance,
1115      * since trying with all three subtags would only yield the
1116      * maximal version that we already have.
1117      **/
1118     if (scriptLength > 0) {
1119         icu::CharString tagBuffer;
1120         {
1121             icu::CharStringByteSink tagSink(&tagBuffer);
1122             createLikelySubtagsString(
1123                 lang,
1124                 langLength,
1125                 script,
1126                 scriptLength,
1127                 NULL,
1128                 0,
1129                 NULL,
1130                 0,
1131                 tagSink,
1132                 err);
1133         }
1134 
1135         if(U_FAILURE(*err)) {
1136             goto error;
1137         }
1138         else if (!tagBuffer.isEmpty() &&
1139                  uprv_strnicmp(
1140                     maximizedTagBuffer.data(),
1141                     tagBuffer.data(),
1142                     tagBuffer.length()) == 0) {
1143 
1144             createTagString(
1145                         lang,
1146                         langLength,
1147                         script,
1148                         scriptLength,
1149                         NULL,
1150                         0,
1151                         trailing,
1152                         trailingLength,
1153                         sink,
1154                         err);
1155             return;
1156         }
1157     }
1158 
1159     {
1160         /**
1161          * If we got here, return the max + trail.
1162          **/
1163         createTagString(
1164                     lang,
1165                     langLength,
1166                     script,
1167                     scriptLength,
1168                     region,
1169                     regionLength,
1170                     trailing,
1171                     trailingLength,
1172                     sink,
1173                     err);
1174         return;
1175     }
1176 
1177 error:
1178 
1179     if (!U_FAILURE(*err)) {
1180         *err = U_ILLEGAL_ARGUMENT_ERROR;
1181     }
1182 }
1183 
1184 static UBool
do_canonicalize(const char * localeID,char * buffer,int32_t bufferCapacity,UErrorCode * err)1185 do_canonicalize(const char*    localeID,
1186          char* buffer,
1187          int32_t bufferCapacity,
1188          UErrorCode* err)
1189 {
1190     uloc_canonicalize(
1191         localeID,
1192         buffer,
1193         bufferCapacity,
1194         err);
1195 
1196     if (*err == U_STRING_NOT_TERMINATED_WARNING ||
1197         *err == U_BUFFER_OVERFLOW_ERROR) {
1198         *err = U_ILLEGAL_ARGUMENT_ERROR;
1199 
1200         return FALSE;
1201     }
1202     else if (U_FAILURE(*err)) {
1203 
1204         return FALSE;
1205     }
1206     else {
1207         return TRUE;
1208     }
1209 }
1210 
1211 U_CAPI int32_t U_EXPORT2
uloc_addLikelySubtags(const char * localeID,char * maximizedLocaleID,int32_t maximizedLocaleIDCapacity,UErrorCode * status)1212 uloc_addLikelySubtags(const char* localeID,
1213                       char* maximizedLocaleID,
1214                       int32_t maximizedLocaleIDCapacity,
1215                       UErrorCode* status) {
1216     if (U_FAILURE(*status)) {
1217         return 0;
1218     }
1219 
1220     icu::CheckedArrayByteSink sink(
1221             maximizedLocaleID, maximizedLocaleIDCapacity);
1222 
1223     ulocimp_addLikelySubtags(localeID, sink, status);
1224     int32_t reslen = sink.NumberOfBytesAppended();
1225 
1226     if (U_FAILURE(*status)) {
1227         return sink.Overflowed() ? reslen : -1;
1228     }
1229 
1230     if (sink.Overflowed()) {
1231         *status = U_BUFFER_OVERFLOW_ERROR;
1232     } else {
1233         u_terminateChars(
1234                 maximizedLocaleID, maximizedLocaleIDCapacity, reslen, status);
1235     }
1236 
1237     return reslen;
1238 }
1239 
1240 static UBool
_ulocimp_addLikelySubtags(const char * localeID,icu::ByteSink & sink,UErrorCode * status)1241 _ulocimp_addLikelySubtags(const char* localeID,
1242                           icu::ByteSink& sink,
1243                           UErrorCode* status) {
1244     char localeBuffer[ULOC_FULLNAME_CAPACITY];
1245 
1246     if (do_canonicalize(localeID, localeBuffer, sizeof localeBuffer, status)) {
1247         return _uloc_addLikelySubtags(localeBuffer, sink, status);
1248     }
1249     return FALSE;
1250 }
1251 
1252 U_CAPI void U_EXPORT2
ulocimp_addLikelySubtags(const char * localeID,icu::ByteSink & sink,UErrorCode * status)1253 ulocimp_addLikelySubtags(const char* localeID,
1254                          icu::ByteSink& sink,
1255                          UErrorCode* status) {
1256     _ulocimp_addLikelySubtags(localeID, sink, status);
1257 }
1258 
1259 U_CAPI int32_t U_EXPORT2
uloc_minimizeSubtags(const char * localeID,char * minimizedLocaleID,int32_t minimizedLocaleIDCapacity,UErrorCode * status)1260 uloc_minimizeSubtags(const char* localeID,
1261                      char* minimizedLocaleID,
1262                      int32_t minimizedLocaleIDCapacity,
1263                      UErrorCode* status) {
1264     if (U_FAILURE(*status)) {
1265         return 0;
1266     }
1267 
1268     icu::CheckedArrayByteSink sink(
1269             minimizedLocaleID, minimizedLocaleIDCapacity);
1270 
1271     ulocimp_minimizeSubtags(localeID, sink, status);
1272     int32_t reslen = sink.NumberOfBytesAppended();
1273 
1274     if (U_FAILURE(*status)) {
1275         return sink.Overflowed() ? reslen : -1;
1276     }
1277 
1278     if (sink.Overflowed()) {
1279         *status = U_BUFFER_OVERFLOW_ERROR;
1280     } else {
1281         u_terminateChars(
1282                 minimizedLocaleID, minimizedLocaleIDCapacity, reslen, status);
1283     }
1284 
1285     return reslen;
1286 }
1287 
1288 U_CAPI void U_EXPORT2
ulocimp_minimizeSubtags(const char * localeID,icu::ByteSink & sink,UErrorCode * status)1289 ulocimp_minimizeSubtags(const char* localeID,
1290                         icu::ByteSink& sink,
1291                         UErrorCode* status) {
1292     char localeBuffer[ULOC_FULLNAME_CAPACITY];
1293 
1294     if (do_canonicalize(localeID, localeBuffer, sizeof localeBuffer, status)) {
1295         _uloc_minimizeSubtags(localeBuffer, sink, status);
1296     }
1297 }
1298 
1299 // Pairs of (language subtag, + or -) for finding out fast if common languages
1300 // are LTR (minus) or RTL (plus).
1301 static const char LANG_DIR_STRING[] =
1302         "root-en-es-pt-zh-ja-ko-de-fr-it-ar+he+fa+ru-nl-pl-th-tr-";
1303 
1304 // Implemented here because this calls ulocimp_addLikelySubtags().
1305 U_CAPI UBool U_EXPORT2
uloc_isRightToLeft(const char * locale)1306 uloc_isRightToLeft(const char *locale) {
1307     UErrorCode errorCode = U_ZERO_ERROR;
1308     char script[8];
1309     int32_t scriptLength = uloc_getScript(locale, script, UPRV_LENGTHOF(script), &errorCode);
1310     if (U_FAILURE(errorCode) || errorCode == U_STRING_NOT_TERMINATED_WARNING ||
1311             scriptLength == 0) {
1312         // Fastpath: We know the likely scripts and their writing direction
1313         // for some common languages.
1314         errorCode = U_ZERO_ERROR;
1315         char lang[8];
1316         int32_t langLength = uloc_getLanguage(locale, lang, UPRV_LENGTHOF(lang), &errorCode);
1317         if (U_FAILURE(errorCode) || errorCode == U_STRING_NOT_TERMINATED_WARNING) {
1318             return FALSE;
1319         }
1320         if (langLength > 0) {
1321             const char* langPtr = uprv_strstr(LANG_DIR_STRING, lang);
1322             if (langPtr != NULL) {
1323                 switch (langPtr[langLength]) {
1324                 case '-': return FALSE;
1325                 case '+': return TRUE;
1326                 default: break;  // partial match of a longer code
1327                 }
1328             }
1329         }
1330         // Otherwise, find the likely script.
1331         errorCode = U_ZERO_ERROR;
1332         icu::CharString likely;
1333         {
1334             icu::CharStringByteSink sink(&likely);
1335             ulocimp_addLikelySubtags(locale, sink, &errorCode);
1336         }
1337         if (U_FAILURE(errorCode) || errorCode == U_STRING_NOT_TERMINATED_WARNING) {
1338             return FALSE;
1339         }
1340         scriptLength = uloc_getScript(likely.data(), script, UPRV_LENGTHOF(script), &errorCode);
1341         if (U_FAILURE(errorCode) || errorCode == U_STRING_NOT_TERMINATED_WARNING ||
1342                 scriptLength == 0) {
1343             return FALSE;
1344         }
1345     }
1346     UScriptCode scriptCode = (UScriptCode)u_getPropertyValueEnum(UCHAR_SCRIPT, script);
1347     return uscript_isRightToLeft(scriptCode);
1348 }
1349 
1350 U_NAMESPACE_BEGIN
1351 
1352 UBool
isRightToLeft() const1353 Locale::isRightToLeft() const {
1354     return uloc_isRightToLeft(getBaseName());
1355 }
1356 
1357 U_NAMESPACE_END
1358 
1359 // The following must at least allow for rg key value (6) plus terminator (1).
1360 #define ULOC_RG_BUFLEN 8
1361 
1362 U_CAPI int32_t U_EXPORT2
ulocimp_getRegionForSupplementalData(const char * localeID,UBool inferRegion,char * region,int32_t regionCapacity,UErrorCode * status)1363 ulocimp_getRegionForSupplementalData(const char *localeID, UBool inferRegion,
1364                                      char *region, int32_t regionCapacity, UErrorCode* status) {
1365     if (U_FAILURE(*status)) {
1366         return 0;
1367     }
1368     char rgBuf[ULOC_RG_BUFLEN];
1369     UErrorCode rgStatus = U_ZERO_ERROR;
1370 
1371     // First check for rg keyword value
1372     int32_t rgLen = uloc_getKeywordValue(localeID, "rg", rgBuf, ULOC_RG_BUFLEN, &rgStatus);
1373     if (U_FAILURE(rgStatus) || rgLen != 6) {
1374         rgLen = 0;
1375     } else {
1376         // rgBuf guaranteed to be zero terminated here, with text len 6
1377         char *rgPtr = rgBuf;
1378         for (; *rgPtr!= 0; rgPtr++) {
1379             *rgPtr = uprv_toupper(*rgPtr);
1380         }
1381         rgLen = (uprv_strcmp(rgBuf+2, "ZZZZ") == 0)? 2: 0;
1382     }
1383 
1384     if (rgLen == 0) {
1385         // No valid rg keyword value, try for unicode_region_subtag
1386         rgLen = uloc_getCountry(localeID, rgBuf, ULOC_RG_BUFLEN, status);
1387         if (U_FAILURE(*status)) {
1388             rgLen = 0;
1389         } else if (rgLen == 0 && inferRegion) {
1390             // no unicode_region_subtag but inferRegion TRUE, try likely subtags
1391             rgStatus = U_ZERO_ERROR;
1392             icu::CharString locBuf;
1393             {
1394                 icu::CharStringByteSink sink(&locBuf);
1395                 ulocimp_addLikelySubtags(localeID, sink, &rgStatus);
1396             }
1397             if (U_SUCCESS(rgStatus)) {
1398                 rgLen = uloc_getCountry(locBuf.data(), rgBuf, ULOC_RG_BUFLEN, status);
1399                 if (U_FAILURE(*status)) {
1400                     rgLen = 0;
1401                 }
1402             }
1403         }
1404     }
1405 
1406     rgBuf[rgLen] = 0;
1407     uprv_strncpy(region, rgBuf, regionCapacity);
1408     return u_terminateChars(region, regionCapacity, rgLen, status);
1409 }
1410 
1411