1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2005-2016, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  ucasemap.cpp
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2005may06
16 *   created by: Markus W. Scherer
17 *
18 *   Case mapping service object and functions using it.
19 */
20 
21 #include "unicode/utypes.h"
22 #include "unicode/brkiter.h"
23 #include "unicode/bytestream.h"
24 #include "unicode/casemap.h"
25 #include "unicode/edits.h"
26 #include "unicode/stringoptions.h"
27 #include "unicode/stringpiece.h"
28 #include "unicode/ubrk.h"
29 #include "unicode/uloc.h"
30 #include "unicode/ustring.h"
31 #include "unicode/ucasemap.h"
32 #if !UCONFIG_NO_BREAK_ITERATION
33 #include "unicode/utext.h"
34 #endif
35 #include "unicode/utf.h"
36 #include "unicode/utf8.h"
37 #include "unicode/utf16.h"
38 #include "bytesinkutil.h"
39 #include "cmemory.h"
40 #include "cstring.h"
41 #include "uassert.h"
42 #include "ucase.h"
43 #include "ucasemap_imp.h"
44 #include "ustr_imp.h"
45 
46 U_NAMESPACE_USE
47 
48 /* UCaseMap service object -------------------------------------------------- */
49 
UCaseMap(const char * localeID,uint32_t opts,UErrorCode * pErrorCode)50 UCaseMap::UCaseMap(const char *localeID, uint32_t opts, UErrorCode *pErrorCode) :
51 #if !UCONFIG_NO_BREAK_ITERATION
52         iter(NULL),
53 #endif
54         caseLocale(UCASE_LOC_UNKNOWN), options(opts) {
55     ucasemap_setLocale(this, localeID, pErrorCode);
56 }
57 
~UCaseMap()58 UCaseMap::~UCaseMap() {
59 #if !UCONFIG_NO_BREAK_ITERATION
60     delete iter;
61 #endif
62 }
63 
64 U_CAPI UCaseMap * U_EXPORT2
ucasemap_open(const char * locale,uint32_t options,UErrorCode * pErrorCode)65 ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode) {
66     if(U_FAILURE(*pErrorCode)) {
67         return NULL;
68     }
69     UCaseMap *csm = new UCaseMap(locale, options, pErrorCode);
70     if(csm==NULL) {
71         *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
72         return NULL;
73     } else if (U_FAILURE(*pErrorCode)) {
74         delete csm;
75         return NULL;
76     }
77     return csm;
78 }
79 
80 U_CAPI void U_EXPORT2
ucasemap_close(UCaseMap * csm)81 ucasemap_close(UCaseMap *csm) {
82     delete csm;
83 }
84 
85 U_CAPI const char * U_EXPORT2
ucasemap_getLocale(const UCaseMap * csm)86 ucasemap_getLocale(const UCaseMap *csm) {
87     return csm->locale;
88 }
89 
90 U_CAPI uint32_t U_EXPORT2
ucasemap_getOptions(const UCaseMap * csm)91 ucasemap_getOptions(const UCaseMap *csm) {
92     return csm->options;
93 }
94 
95 U_CAPI void U_EXPORT2
ucasemap_setLocale(UCaseMap * csm,const char * locale,UErrorCode * pErrorCode)96 ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) {
97     if(U_FAILURE(*pErrorCode)) {
98         return;
99     }
100     if (locale != NULL && *locale == 0) {
101         csm->locale[0] = 0;
102         csm->caseLocale = UCASE_LOC_ROOT;
103         return;
104     }
105 
106     int32_t length=uloc_getName(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
107     if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR || length==sizeof(csm->locale)) {
108         *pErrorCode=U_ZERO_ERROR;
109         /* we only really need the language code for case mappings */
110         length=uloc_getLanguage(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
111     }
112     if(length==sizeof(csm->locale)) {
113         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
114     }
115     if(U_SUCCESS(*pErrorCode)) {
116         csm->caseLocale=UCASE_LOC_UNKNOWN;
117         csm->caseLocale = ucase_getCaseLocale(csm->locale);
118     } else {
119         csm->locale[0]=0;
120         csm->caseLocale = UCASE_LOC_ROOT;
121     }
122 }
123 
124 U_CAPI void U_EXPORT2
ucasemap_setOptions(UCaseMap * csm,uint32_t options,UErrorCode * pErrorCode)125 ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode) {
126     if(U_FAILURE(*pErrorCode)) {
127         return;
128     }
129     csm->options=options;
130 }
131 
132 /* UTF-8 string case mappings ----------------------------------------------- */
133 
134 /* TODO(markus): Move to a new, separate utf8case.cpp file. */
135 
136 namespace {
137 
138 /* append a full case mapping result, see UCASE_MAX_STRING_LENGTH */
139 inline UBool
appendResult(int32_t cpLength,int32_t result,const UChar * s,ByteSink & sink,uint32_t options,icu::Edits * edits,UErrorCode & errorCode)140 appendResult(int32_t cpLength, int32_t result, const UChar *s,
141              ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) {
142     U_ASSERT(U_SUCCESS(errorCode));
143 
144     /* decode the result */
145     if(result<0) {
146         /* (not) original code point */
147         if(edits!=NULL) {
148             edits->addUnchanged(cpLength);
149         }
150         if((options & U_OMIT_UNCHANGED_TEXT) == 0) {
151             ByteSinkUtil::appendCodePoint(cpLength, ~result, sink);
152         }
153     } else {
154         if(result<=UCASE_MAX_STRING_LENGTH) {
155             // string: "result" is the UTF-16 length
156             return ByteSinkUtil::appendChange(cpLength, s, result, sink, edits, errorCode);
157         } else {
158             ByteSinkUtil::appendCodePoint(cpLength, result, sink, edits);
159         }
160     }
161     return TRUE;
162 }
163 
164 // See unicode/utf8.h U8_APPEND_UNSAFE().
getTwoByteLead(UChar32 c)165 inline uint8_t getTwoByteLead(UChar32 c) { return (uint8_t)((c >> 6) | 0xc0); }
getTwoByteTrail(UChar32 c)166 inline uint8_t getTwoByteTrail(UChar32 c) { return (uint8_t)((c & 0x3f) | 0x80); }
167 
168 UChar32 U_CALLCONV
utf8_caseContextIterator(void * context,int8_t dir)169 utf8_caseContextIterator(void *context, int8_t dir) {
170     UCaseContext *csc=(UCaseContext *)context;
171     UChar32 c;
172 
173     if(dir<0) {
174         /* reset for backward iteration */
175         csc->index=csc->cpStart;
176         csc->dir=dir;
177     } else if(dir>0) {
178         /* reset for forward iteration */
179         csc->index=csc->cpLimit;
180         csc->dir=dir;
181     } else {
182         /* continue current iteration direction */
183         dir=csc->dir;
184     }
185 
186     if(dir<0) {
187         if(csc->start<csc->index) {
188             U8_PREV((const uint8_t *)csc->p, csc->start, csc->index, c);
189             return c;
190         }
191     } else {
192         if(csc->index<csc->limit) {
193             U8_NEXT((const uint8_t *)csc->p, csc->index, csc->limit, c);
194             return c;
195         }
196     }
197     return U_SENTINEL;
198 }
199 
200 /**
201  * caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account.
202  * caseLocale < 0: Case-folds [srcStart..srcLimit[.
203  */
toLower(int32_t caseLocale,uint32_t options,const uint8_t * src,UCaseContext * csc,int32_t srcStart,int32_t srcLimit,icu::ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)204 void toLower(int32_t caseLocale, uint32_t options,
205              const uint8_t *src, UCaseContext *csc, int32_t srcStart, int32_t srcLimit,
206              icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) {
207     const int8_t *latinToLower;
208     if (caseLocale == UCASE_LOC_ROOT ||
209             (caseLocale >= 0 ?
210                 !(caseLocale == UCASE_LOC_TURKISH || caseLocale == UCASE_LOC_LITHUANIAN) :
211                 (options & _FOLD_CASE_OPTIONS_MASK) == U_FOLD_CASE_DEFAULT)) {
212         latinToLower = LatinCase::TO_LOWER_NORMAL;
213     } else {
214         latinToLower = LatinCase::TO_LOWER_TR_LT;
215     }
216     const UTrie2 *trie = ucase_getTrie();
217     int32_t prev = srcStart;
218     int32_t srcIndex = srcStart;
219     for (;;) {
220         // fast path for simple cases
221         int32_t cpStart;
222         UChar32 c;
223         for (;;) {
224             if (U_FAILURE(errorCode) || srcIndex >= srcLimit) {
225                 c = U_SENTINEL;
226                 break;
227             }
228             uint8_t lead = src[srcIndex++];
229             if (lead <= 0x7f) {
230                 int8_t d = latinToLower[lead];
231                 if (d == LatinCase::EXC) {
232                     cpStart = srcIndex - 1;
233                     c = lead;
234                     break;
235                 }
236                 if (d == 0) { continue; }
237                 ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 1 - prev,
238                                               sink, options, edits, errorCode);
239                 char ascii = (char)(lead + d);
240                 sink.Append(&ascii, 1);
241                 if (edits != nullptr) {
242                     edits->addReplace(1, 1);
243                 }
244                 prev = srcIndex;
245                 continue;
246             } else if (lead < 0xe3) {
247                 uint8_t t;
248                 if (0xc2 <= lead && lead <= 0xc5 && srcIndex < srcLimit &&
249                         (t = src[srcIndex] - 0x80) <= 0x3f) {
250                     // U+0080..U+017F
251                     ++srcIndex;
252                     c = ((lead - 0xc0) << 6) | t;
253                     int8_t d = latinToLower[c];
254                     if (d == LatinCase::EXC) {
255                         cpStart = srcIndex - 2;
256                         break;
257                     }
258                     if (d == 0) { continue; }
259                     ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 2 - prev,
260                                                   sink, options, edits, errorCode);
261                     ByteSinkUtil::appendTwoBytes(c + d, sink);
262                     if (edits != nullptr) {
263                         edits->addReplace(2, 2);
264                     }
265                     prev = srcIndex;
266                     continue;
267                 }
268             } else if ((lead <= 0xe9 || lead == 0xeb || lead == 0xec) &&
269                     (srcIndex + 2) <= srcLimit &&
270                     U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + 1])) {
271                 // most of CJK: no case mappings
272                 srcIndex += 2;
273                 continue;
274             }
275             cpStart = --srcIndex;
276             U8_NEXT(src, srcIndex, srcLimit, c);
277             if (c < 0) {
278                 // ill-formed UTF-8
279                 continue;
280             }
281             uint16_t props = UTRIE2_GET16(trie, c);
282             if (UCASE_HAS_EXCEPTION(props)) { break; }
283             int32_t delta;
284             if (!UCASE_IS_UPPER_OR_TITLE(props) || (delta = UCASE_GET_DELTA(props)) == 0) {
285                 continue;
286             }
287             ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
288                                           sink, options, edits, errorCode);
289             ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits);
290             prev = srcIndex;
291         }
292         if (c < 0) {
293             break;
294         }
295         // slow path
296         const UChar *s;
297         if (caseLocale >= 0) {
298             csc->cpStart = cpStart;
299             csc->cpLimit = srcIndex;
300             c = ucase_toFullLower(c, utf8_caseContextIterator, csc, &s, caseLocale);
301         } else {
302             c = ucase_toFullFolding(c, &s, options);
303         }
304         if (c >= 0) {
305             ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
306                                           sink, options, edits, errorCode);
307             appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
308             prev = srcIndex;
309         }
310     }
311     ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev,
312                                   sink, options, edits, errorCode);
313 }
314 
toUpper(int32_t caseLocale,uint32_t options,const uint8_t * src,UCaseContext * csc,int32_t srcLength,icu::ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)315 void toUpper(int32_t caseLocale, uint32_t options,
316              const uint8_t *src, UCaseContext *csc, int32_t srcLength,
317              icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) {
318     const int8_t *latinToUpper;
319     if (caseLocale == UCASE_LOC_TURKISH) {
320         latinToUpper = LatinCase::TO_UPPER_TR;
321     } else {
322         latinToUpper = LatinCase::TO_UPPER_NORMAL;
323     }
324     const UTrie2 *trie = ucase_getTrie();
325     int32_t prev = 0;
326     int32_t srcIndex = 0;
327     for (;;) {
328         // fast path for simple cases
329         int32_t cpStart;
330         UChar32 c;
331         for (;;) {
332             if (U_FAILURE(errorCode) || srcIndex >= srcLength) {
333                 c = U_SENTINEL;
334                 break;
335             }
336             uint8_t lead = src[srcIndex++];
337             if (lead <= 0x7f) {
338                 int8_t d = latinToUpper[lead];
339                 if (d == LatinCase::EXC) {
340                     cpStart = srcIndex - 1;
341                     c = lead;
342                     break;
343                 }
344                 if (d == 0) { continue; }
345                 ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 1 - prev,
346                                               sink, options, edits, errorCode);
347                 char ascii = (char)(lead + d);
348                 sink.Append(&ascii, 1);
349                 if (edits != nullptr) {
350                     edits->addReplace(1, 1);
351                 }
352                 prev = srcIndex;
353                 continue;
354             } else if (lead < 0xe3) {
355                 uint8_t t;
356                 if (0xc2 <= lead && lead <= 0xc5 && srcIndex < srcLength &&
357                         (t = src[srcIndex] - 0x80) <= 0x3f) {
358                     // U+0080..U+017F
359                     ++srcIndex;
360                     c = ((lead - 0xc0) << 6) | t;
361                     int8_t d = latinToUpper[c];
362                     if (d == LatinCase::EXC) {
363                         cpStart = srcIndex - 2;
364                         break;
365                     }
366                     if (d == 0) { continue; }
367                     ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 2 - prev,
368                                                   sink, options, edits, errorCode);
369                     ByteSinkUtil::appendTwoBytes(c + d, sink);
370                     if (edits != nullptr) {
371                         edits->addReplace(2, 2);
372                     }
373                     prev = srcIndex;
374                     continue;
375                 }
376             } else if ((lead <= 0xe9 || lead == 0xeb || lead == 0xec) &&
377                     (srcIndex + 2) <= srcLength &&
378                     U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + 1])) {
379                 // most of CJK: no case mappings
380                 srcIndex += 2;
381                 continue;
382             }
383             cpStart = --srcIndex;
384             U8_NEXT(src, srcIndex, srcLength, c);
385             if (c < 0) {
386                 // ill-formed UTF-8
387                 continue;
388             }
389             uint16_t props = UTRIE2_GET16(trie, c);
390             if (UCASE_HAS_EXCEPTION(props)) { break; }
391             int32_t delta;
392             if (UCASE_GET_TYPE(props) != UCASE_LOWER || (delta = UCASE_GET_DELTA(props)) == 0) {
393                 continue;
394             }
395             ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
396                                           sink, options, edits, errorCode);
397             ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits);
398             prev = srcIndex;
399         }
400         if (c < 0) {
401             break;
402         }
403         // slow path
404         csc->cpStart = cpStart;
405         csc->cpLimit = srcIndex;
406         const UChar *s;
407         c = ucase_toFullUpper(c, utf8_caseContextIterator, csc, &s, caseLocale);
408         if (c >= 0) {
409             ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
410                                           sink, options, edits, errorCode);
411             appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
412             prev = srcIndex;
413         }
414     }
415     ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev,
416                                   sink, options, edits, errorCode);
417 }
418 
419 }  // namespace
420 
421 #if !UCONFIG_NO_BREAK_ITERATION
422 
423 U_CFUNC void U_CALLCONV
ucasemap_internalUTF8ToTitle(int32_t caseLocale,uint32_t options,BreakIterator * iter,const uint8_t * src,int32_t srcLength,ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)424 ucasemap_internalUTF8ToTitle(
425         int32_t caseLocale, uint32_t options, BreakIterator *iter,
426         const uint8_t *src, int32_t srcLength,
427         ByteSink &sink, icu::Edits *edits,
428         UErrorCode &errorCode) {
429     if (!ustrcase_checkTitleAdjustmentOptions(options, errorCode)) {
430         return;
431     }
432 
433     /* set up local variables */
434     UCaseContext csc=UCASECONTEXT_INITIALIZER;
435     csc.p=(void *)src;
436     csc.limit=srcLength;
437     int32_t prev=0;
438     UBool isFirstIndex=TRUE;
439 
440     /* titlecasing loop */
441     while(prev<srcLength) {
442         /* find next index where to titlecase */
443         int32_t index;
444         if(isFirstIndex) {
445             isFirstIndex=FALSE;
446             index=iter->first();
447         } else {
448             index=iter->next();
449         }
450         if(index==UBRK_DONE || index>srcLength) {
451             index=srcLength;
452         }
453 
454         /*
455          * Segment [prev..index[ into 3 parts:
456          * a) skipped characters (copy as-is) [prev..titleStart[
457          * b) first letter (titlecase)              [titleStart..titleLimit[
458          * c) subsequent characters (lowercase)                 [titleLimit..index[
459          */
460         if(prev<index) {
461             /* find and copy skipped characters [prev..titleStart[ */
462             int32_t titleStart=prev;
463             int32_t titleLimit=prev;
464             UChar32 c;
465             U8_NEXT(src, titleLimit, index, c);
466             if ((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0) {
467                 // Adjust the titlecasing index to the next cased character,
468                 // or to the next letter/number/symbol/private use.
469                 // Stop with titleStart<titleLimit<=index
470                 // if there is a character to be titlecased,
471                 // or else stop with titleStart==titleLimit==index.
472                 UBool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0;
473                 while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) {
474                     titleStart=titleLimit;
475                     if(titleLimit==index) {
476                         break;
477                     }
478                     U8_NEXT(src, titleLimit, index, c);
479                 }
480                 if (prev < titleStart) {
481                     if (!ByteSinkUtil::appendUnchanged(src+prev, titleStart-prev,
482                                                        sink, options, edits, errorCode)) {
483                         return;
484                     }
485                 }
486             }
487 
488             if(titleStart<titleLimit) {
489                 /* titlecase c which is from [titleStart..titleLimit[ */
490                 if(c>=0) {
491                     csc.cpStart=titleStart;
492                     csc.cpLimit=titleLimit;
493                     const UChar *s;
494                     c=ucase_toFullTitle(c, utf8_caseContextIterator, &csc, &s, caseLocale);
495                     if (!appendResult(titleLimit-titleStart, c, s, sink, options, edits, errorCode)) {
496                         return;
497                     }
498                 } else {
499                     // Malformed UTF-8.
500                     if (!ByteSinkUtil::appendUnchanged(src+titleStart, titleLimit-titleStart,
501                                                        sink, options, edits, errorCode)) {
502                         return;
503                     }
504                 }
505 
506                 /* Special case Dutch IJ titlecasing */
507                 if (titleStart+1 < index &&
508                         caseLocale == UCASE_LOC_DUTCH &&
509                         (src[titleStart] == 0x0049 || src[titleStart] == 0x0069)) {
510                     if (src[titleStart+1] == 0x006A) {
511                         ByteSinkUtil::appendCodePoint(1, 0x004A, sink, edits);
512                         titleLimit++;
513                     } else if (src[titleStart+1] == 0x004A) {
514                         // Keep the capital J from getting lowercased.
515                         if (!ByteSinkUtil::appendUnchanged(src+titleStart+1, 1,
516                                                            sink, options, edits, errorCode)) {
517                             return;
518                         }
519                         titleLimit++;
520                     }
521                 }
522 
523                 /* lowercase [titleLimit..index[ */
524                 if(titleLimit<index) {
525                     if((options&U_TITLECASE_NO_LOWERCASE)==0) {
526                         /* Normal operation: Lowercase the rest of the word. */
527                         toLower(caseLocale, options,
528                                 src, &csc, titleLimit, index,
529                                 sink, edits, errorCode);
530                         if(U_FAILURE(errorCode)) {
531                             return;
532                         }
533                     } else {
534                         /* Optionally just copy the rest of the word unchanged. */
535                         if (!ByteSinkUtil::appendUnchanged(src+titleLimit, index-titleLimit,
536                                                            sink, options, edits, errorCode)) {
537                             return;
538                         }
539                     }
540                 }
541             }
542         }
543 
544         prev=index;
545     }
546 }
547 
548 #endif
549 
550 U_NAMESPACE_BEGIN
551 namespace GreekUpper {
552 
isFollowedByCasedLetter(const uint8_t * s,int32_t i,int32_t length)553 UBool isFollowedByCasedLetter(const uint8_t *s, int32_t i, int32_t length) {
554     while (i < length) {
555         UChar32 c;
556         U8_NEXT(s, i, length, c);
557         int32_t type = ucase_getTypeOrIgnorable(c);
558         if ((type & UCASE_IGNORABLE) != 0) {
559             // Case-ignorable, continue with the loop.
560         } else if (type != UCASE_NONE) {
561             return TRUE;  // Followed by cased letter.
562         } else {
563             return FALSE;  // Uncased and not case-ignorable.
564         }
565     }
566     return FALSE;  // Not followed by cased letter.
567 }
568 
569 // Keep this consistent with the UTF-16 version in ustrcase.cpp and the Java version in CaseMap.java.
toUpper(uint32_t options,const uint8_t * src,int32_t srcLength,ByteSink & sink,Edits * edits,UErrorCode & errorCode)570 void toUpper(uint32_t options,
571              const uint8_t *src, int32_t srcLength,
572              ByteSink &sink, Edits *edits,
573              UErrorCode &errorCode) {
574     uint32_t state = 0;
575     for (int32_t i = 0; i < srcLength;) {
576         int32_t nextIndex = i;
577         UChar32 c;
578         U8_NEXT(src, nextIndex, srcLength, c);
579         uint32_t nextState = 0;
580         int32_t type = ucase_getTypeOrIgnorable(c);
581         if ((type & UCASE_IGNORABLE) != 0) {
582             // c is case-ignorable
583             nextState |= (state & AFTER_CASED);
584         } else if (type != UCASE_NONE) {
585             // c is cased
586             nextState |= AFTER_CASED;
587         }
588         uint32_t data = getLetterData(c);
589         if (data > 0) {
590             uint32_t upper = data & UPPER_MASK;
591             // Add a dialytika to this iota or ypsilon vowel
592             // if we removed a tonos from the previous vowel,
593             // and that previous vowel did not also have (or gain) a dialytika.
594             // Adding one only to the final vowel in a longer sequence
595             // (which does not occur in normal writing) would require lookahead.
596             // Set the same flag as for preserving an existing dialytika.
597             if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 &&
598                     (upper == 0x399 || upper == 0x3A5)) {
599                 data |= HAS_DIALYTIKA;
600             }
601             int32_t numYpogegrammeni = 0;  // Map each one to a trailing, spacing, capital iota.
602             if ((data & HAS_YPOGEGRAMMENI) != 0) {
603                 numYpogegrammeni = 1;
604             }
605             // Skip combining diacritics after this Greek letter.
606             int32_t nextNextIndex = nextIndex;
607             while (nextIndex < srcLength) {
608                 UChar32 c2;
609                 U8_NEXT(src, nextNextIndex, srcLength, c2);
610                 uint32_t diacriticData = getDiacriticData(c2);
611                 if (diacriticData != 0) {
612                     data |= diacriticData;
613                     if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {
614                         ++numYpogegrammeni;
615                     }
616                     nextIndex = nextNextIndex;
617                 } else {
618                     break;  // not a Greek diacritic
619                 }
620             }
621             if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
622                 nextState |= AFTER_VOWEL_WITH_ACCENT;
623             }
624             // Map according to Greek rules.
625             UBool addTonos = FALSE;
626             if (upper == 0x397 &&
627                     (data & HAS_ACCENT) != 0 &&
628                     numYpogegrammeni == 0 &&
629                     (state & AFTER_CASED) == 0 &&
630                     !isFollowedByCasedLetter(src, nextIndex, srcLength)) {
631                 // Keep disjunctive "or" with (only) a tonos.
632                 // We use the same "word boundary" conditions as for the Final_Sigma test.
633                 if (i == nextIndex) {
634                     upper = 0x389;  // Preserve the precomposed form.
635                 } else {
636                     addTonos = TRUE;
637                 }
638             } else if ((data & HAS_DIALYTIKA) != 0) {
639                 // Preserve a vowel with dialytika in precomposed form if it exists.
640                 if (upper == 0x399) {
641                     upper = 0x3AA;
642                     data &= ~HAS_EITHER_DIALYTIKA;
643                 } else if (upper == 0x3A5) {
644                     upper = 0x3AB;
645                     data &= ~HAS_EITHER_DIALYTIKA;
646                 }
647             }
648 
649             UBool change;
650             if (edits == nullptr && (options & U_OMIT_UNCHANGED_TEXT) == 0) {
651                 change = TRUE;  // common, simple usage
652             } else {
653                 // Find out first whether we are changing the text.
654                 U_ASSERT(0x370 <= upper && upper <= 0x3ff);  // 2-byte UTF-8, main Greek block
655                 change = (i + 2) > nextIndex ||
656                         src[i] != getTwoByteLead(upper) || src[i + 1] != getTwoByteTrail(upper) ||
657                         numYpogegrammeni > 0;
658                 int32_t i2 = i + 2;
659                 if ((data & HAS_EITHER_DIALYTIKA) != 0) {
660                     change |= (i2 + 2) > nextIndex ||
661                             src[i2] != (uint8_t)u8"\u0308"[0] ||
662                             src[i2 + 1] != (uint8_t)u8"\u0308"[1];
663                     i2 += 2;
664                 }
665                 if (addTonos) {
666                     change |= (i2 + 2) > nextIndex ||
667                             src[i2] != (uint8_t)u8"\u0301"[0] ||
668                             src[i2 + 1] != (uint8_t)u8"\u0301"[1];
669                     i2 += 2;
670                 }
671                 int32_t oldLength = nextIndex - i;
672                 int32_t newLength = (i2 - i) + numYpogegrammeni * 2;  // 2 bytes per U+0399
673                 change |= oldLength != newLength;
674                 if (change) {
675                     if (edits != NULL) {
676                         edits->addReplace(oldLength, newLength);
677                     }
678                 } else {
679                     if (edits != NULL) {
680                         edits->addUnchanged(oldLength);
681                     }
682                     // Write unchanged text?
683                     change = (options & U_OMIT_UNCHANGED_TEXT) == 0;
684                 }
685             }
686 
687             if (change) {
688                 ByteSinkUtil::appendTwoBytes(upper, sink);
689                 if ((data & HAS_EITHER_DIALYTIKA) != 0) {
690                     sink.AppendU8(u8"\u0308", 2);  // restore or add a dialytika
691                 }
692                 if (addTonos) {
693                     sink.AppendU8(u8"\u0301", 2);
694                 }
695                 while (numYpogegrammeni > 0) {
696                     sink.AppendU8(u8"\u0399", 2);
697                     --numYpogegrammeni;
698                 }
699             }
700         } else if(c>=0) {
701             const UChar *s;
702             c=ucase_toFullUpper(c, NULL, NULL, &s, UCASE_LOC_GREEK);
703             if (!appendResult(nextIndex - i, c, s, sink, options, edits, errorCode)) {
704                 return;
705             }
706         } else {
707             // Malformed UTF-8.
708             if (!ByteSinkUtil::appendUnchanged(src+i, nextIndex-i,
709                                                sink, options, edits, errorCode)) {
710                 return;
711             }
712         }
713         i = nextIndex;
714         state = nextState;
715     }
716 }
717 
718 }  // namespace GreekUpper
719 U_NAMESPACE_END
720 
721 static void U_CALLCONV
ucasemap_internalUTF8ToLower(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_UNUSED const uint8_t * src,int32_t srcLength,icu::ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)722 ucasemap_internalUTF8ToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
723                              const uint8_t *src, int32_t srcLength,
724                              icu::ByteSink &sink, icu::Edits *edits,
725                              UErrorCode &errorCode) {
726     UCaseContext csc=UCASECONTEXT_INITIALIZER;
727     csc.p=(void *)src;
728     csc.limit=srcLength;
729     toLower(
730         caseLocale, options,
731         src, &csc, 0, srcLength,
732         sink, edits, errorCode);
733 }
734 
735 static void U_CALLCONV
ucasemap_internalUTF8ToUpper(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_UNUSED const uint8_t * src,int32_t srcLength,icu::ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)736 ucasemap_internalUTF8ToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
737                              const uint8_t *src, int32_t srcLength,
738                              icu::ByteSink &sink, icu::Edits *edits,
739                              UErrorCode &errorCode) {
740     if (caseLocale == UCASE_LOC_GREEK) {
741         GreekUpper::toUpper(options, src, srcLength, sink, edits, errorCode);
742     } else {
743         UCaseContext csc=UCASECONTEXT_INITIALIZER;
744         csc.p=(void *)src;
745         csc.limit=srcLength;
746         toUpper(
747             caseLocale, options,
748             src, &csc, srcLength,
749             sink, edits, errorCode);
750     }
751 }
752 
753 static void U_CALLCONV
ucasemap_internalUTF8Fold(int32_t,uint32_t options,UCASEMAP_BREAK_ITERATOR_UNUSED const uint8_t * src,int32_t srcLength,icu::ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)754 ucasemap_internalUTF8Fold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
755                           const uint8_t *src, int32_t srcLength,
756                           icu::ByteSink &sink, icu::Edits *edits,
757                           UErrorCode &errorCode) {
758     toLower(
759         -1, options,
760         src, nullptr, 0, srcLength,
761         sink, edits, errorCode);
762 }
763 
764 void
ucasemap_mapUTF8(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_PARAM const char * src,int32_t srcLength,UTF8CaseMapper * stringCaseMapper,icu::ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)765 ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
766                  const char *src, int32_t srcLength,
767                  UTF8CaseMapper *stringCaseMapper,
768                  icu::ByteSink &sink, icu::Edits *edits,
769                  UErrorCode &errorCode) {
770     /* check argument values */
771     if (U_FAILURE(errorCode)) {
772         return;
773     }
774     if ((src == nullptr && srcLength != 0) || srcLength < -1) {
775         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
776         return;
777     }
778 
779     // Get the string length.
780     if (srcLength == -1) {
781         srcLength = (int32_t)uprv_strlen((const char *)src);
782     }
783 
784     if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
785         edits->reset();
786     }
787     stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
788                      (const uint8_t *)src, srcLength, sink, edits, errorCode);
789     sink.Flush();
790     if (U_SUCCESS(errorCode)) {
791         if (edits != nullptr) {
792             edits->copyErrorTo(errorCode);
793         }
794     }
795 }
796 
797 int32_t
ucasemap_mapUTF8(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_PARAM char * dest,int32_t destCapacity,const char * src,int32_t srcLength,UTF8CaseMapper * stringCaseMapper,icu::Edits * edits,UErrorCode & errorCode)798 ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
799                  char *dest, int32_t destCapacity,
800                  const char *src, int32_t srcLength,
801                  UTF8CaseMapper *stringCaseMapper,
802                  icu::Edits *edits,
803                  UErrorCode &errorCode) {
804     /* check argument values */
805     if(U_FAILURE(errorCode)) {
806         return 0;
807     }
808     if( destCapacity<0 ||
809         (dest==NULL && destCapacity>0) ||
810         (src==NULL && srcLength!=0) || srcLength<-1
811     ) {
812         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
813         return 0;
814     }
815 
816     /* get the string length */
817     if(srcLength==-1) {
818         srcLength=(int32_t)uprv_strlen((const char *)src);
819     }
820 
821     /* check for overlapping source and destination */
822     if( dest!=NULL &&
823         ((src>=dest && src<(dest+destCapacity)) ||
824          (dest>=src && dest<(src+srcLength)))
825     ) {
826         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
827         return 0;
828     }
829 
830     CheckedArrayByteSink sink(dest, destCapacity);
831     if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
832         edits->reset();
833     }
834     stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
835                      (const uint8_t *)src, srcLength, sink, edits, errorCode);
836     sink.Flush();
837     if (U_SUCCESS(errorCode)) {
838         if (sink.Overflowed()) {
839             errorCode = U_BUFFER_OVERFLOW_ERROR;
840         } else if (edits != nullptr) {
841             edits->copyErrorTo(errorCode);
842         }
843     }
844     return u_terminateChars(dest, destCapacity, sink.NumberOfBytesAppended(), &errorCode);
845 }
846 
847 /* public API functions */
848 
849 U_CAPI int32_t U_EXPORT2
ucasemap_utf8ToLower(const UCaseMap * csm,char * dest,int32_t destCapacity,const char * src,int32_t srcLength,UErrorCode * pErrorCode)850 ucasemap_utf8ToLower(const UCaseMap *csm,
851                      char *dest, int32_t destCapacity,
852                      const char *src, int32_t srcLength,
853                      UErrorCode *pErrorCode) {
854     return ucasemap_mapUTF8(
855         csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
856         dest, destCapacity,
857         src, srcLength,
858         ucasemap_internalUTF8ToLower, NULL, *pErrorCode);
859 }
860 
861 U_CAPI int32_t U_EXPORT2
ucasemap_utf8ToUpper(const UCaseMap * csm,char * dest,int32_t destCapacity,const char * src,int32_t srcLength,UErrorCode * pErrorCode)862 ucasemap_utf8ToUpper(const UCaseMap *csm,
863                      char *dest, int32_t destCapacity,
864                      const char *src, int32_t srcLength,
865                      UErrorCode *pErrorCode) {
866     return ucasemap_mapUTF8(
867         csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
868         dest, destCapacity,
869         src, srcLength,
870         ucasemap_internalUTF8ToUpper, NULL, *pErrorCode);
871 }
872 
873 U_CAPI int32_t U_EXPORT2
ucasemap_utf8FoldCase(const UCaseMap * csm,char * dest,int32_t destCapacity,const char * src,int32_t srcLength,UErrorCode * pErrorCode)874 ucasemap_utf8FoldCase(const UCaseMap *csm,
875                       char *dest, int32_t destCapacity,
876                       const char *src, int32_t srcLength,
877                       UErrorCode *pErrorCode) {
878     return ucasemap_mapUTF8(
879         UCASE_LOC_ROOT, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
880         dest, destCapacity,
881         src, srcLength,
882         ucasemap_internalUTF8Fold, NULL, *pErrorCode);
883 }
884 
885 U_NAMESPACE_BEGIN
886 
utf8ToLower(const char * locale,uint32_t options,StringPiece src,ByteSink & sink,Edits * edits,UErrorCode & errorCode)887 void CaseMap::utf8ToLower(
888         const char *locale, uint32_t options,
889         StringPiece src, ByteSink &sink, Edits *edits,
890         UErrorCode &errorCode) {
891     ucasemap_mapUTF8(
892         ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
893         src.data(), src.length(),
894         ucasemap_internalUTF8ToLower, sink, edits, errorCode);
895 }
896 
utf8ToUpper(const char * locale,uint32_t options,StringPiece src,ByteSink & sink,Edits * edits,UErrorCode & errorCode)897 void CaseMap::utf8ToUpper(
898         const char *locale, uint32_t options,
899         StringPiece src, ByteSink &sink, Edits *edits,
900         UErrorCode &errorCode) {
901     ucasemap_mapUTF8(
902         ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
903         src.data(), src.length(),
904         ucasemap_internalUTF8ToUpper, sink, edits, errorCode);
905 }
906 
utf8Fold(uint32_t options,StringPiece src,ByteSink & sink,Edits * edits,UErrorCode & errorCode)907 void CaseMap::utf8Fold(
908         uint32_t options,
909         StringPiece src, ByteSink &sink, Edits *edits,
910         UErrorCode &errorCode) {
911     ucasemap_mapUTF8(
912         UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
913         src.data(), src.length(),
914         ucasemap_internalUTF8Fold, sink, edits, errorCode);
915 }
916 
utf8ToLower(const char * locale,uint32_t options,const char * src,int32_t srcLength,char * dest,int32_t destCapacity,Edits * edits,UErrorCode & errorCode)917 int32_t CaseMap::utf8ToLower(
918         const char *locale, uint32_t options,
919         const char *src, int32_t srcLength,
920         char *dest, int32_t destCapacity, Edits *edits,
921         UErrorCode &errorCode) {
922     return ucasemap_mapUTF8(
923         ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
924         dest, destCapacity,
925         src, srcLength,
926         ucasemap_internalUTF8ToLower, edits, errorCode);
927 }
928 
utf8ToUpper(const char * locale,uint32_t options,const char * src,int32_t srcLength,char * dest,int32_t destCapacity,Edits * edits,UErrorCode & errorCode)929 int32_t CaseMap::utf8ToUpper(
930         const char *locale, uint32_t options,
931         const char *src, int32_t srcLength,
932         char *dest, int32_t destCapacity, Edits *edits,
933         UErrorCode &errorCode) {
934     return ucasemap_mapUTF8(
935         ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
936         dest, destCapacity,
937         src, srcLength,
938         ucasemap_internalUTF8ToUpper, edits, errorCode);
939 }
940 
utf8Fold(uint32_t options,const char * src,int32_t srcLength,char * dest,int32_t destCapacity,Edits * edits,UErrorCode & errorCode)941 int32_t CaseMap::utf8Fold(
942         uint32_t options,
943         const char *src, int32_t srcLength,
944         char *dest, int32_t destCapacity, Edits *edits,
945         UErrorCode &errorCode) {
946     return ucasemap_mapUTF8(
947         UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
948         dest, destCapacity,
949         src, srcLength,
950         ucasemap_internalUTF8Fold, edits, errorCode);
951 }
952 
953 U_NAMESPACE_END
954