1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2001-2015, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  ustrcase.cpp
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2002feb20
16 *   created by: Markus W. Scherer
17 *
18 *   Implementation file for string casing C API functions.
19 *   Uses functions from uchar.c for basic functionality that requires access
20 *   to the Unicode Character Database (uprops.dat).
21 */
22 
23 #include "unicode/utypes.h"
24 #include "unicode/brkiter.h"
25 #include "unicode/casemap.h"
26 #include "unicode/edits.h"
27 #include "unicode/stringoptions.h"
28 #include "unicode/ustring.h"
29 #include "unicode/ucasemap.h"
30 #include "unicode/ubrk.h"
31 #include "unicode/utf.h"
32 #include "unicode/utf16.h"
33 #include "cmemory.h"
34 #include "ucase.h"
35 #include "ucasemap_imp.h"
36 #include "ustr_imp.h"
37 #include "uassert.h"
38 
39 U_NAMESPACE_BEGIN
40 
41 namespace {
42 
checkOverflowAndEditsError(int32_t destIndex,int32_t destCapacity,Edits * edits,UErrorCode & errorCode)43 int32_t checkOverflowAndEditsError(int32_t destIndex, int32_t destCapacity,
44                                    Edits *edits, UErrorCode &errorCode) {
45     if (U_SUCCESS(errorCode)) {
46         if (destIndex > destCapacity) {
47             errorCode = U_BUFFER_OVERFLOW_ERROR;
48         } else if (edits != NULL) {
49             edits->copyErrorTo(errorCode);
50         }
51     }
52     return destIndex;
53 }
54 
55 /* Appends a full case mapping result, see UCASE_MAX_STRING_LENGTH. */
56 inline int32_t
appendResult(UChar * dest,int32_t destIndex,int32_t destCapacity,int32_t result,const UChar * s,int32_t cpLength,uint32_t options,icu::Edits * edits)57 appendResult(UChar *dest, int32_t destIndex, int32_t destCapacity,
58              int32_t result, const UChar *s,
59              int32_t cpLength, uint32_t options, icu::Edits *edits) {
60     UChar32 c;
61     int32_t length;
62 
63     /* decode the result */
64     if(result<0) {
65         /* (not) original code point */
66         if(edits!=NULL) {
67             edits->addUnchanged(cpLength);
68         }
69         if(options & U_OMIT_UNCHANGED_TEXT) {
70             return destIndex;
71         }
72         c=~result;
73         if(destIndex<destCapacity && c<=0xffff) {  // BMP slightly-fastpath
74             dest[destIndex++]=(UChar)c;
75             return destIndex;
76         }
77         length=cpLength;
78     } else {
79         if(result<=UCASE_MAX_STRING_LENGTH) {
80             c=U_SENTINEL;
81             length=result;
82         } else if(destIndex<destCapacity && result<=0xffff) {  // BMP slightly-fastpath
83             dest[destIndex++]=(UChar)result;
84             if(edits!=NULL) {
85                 edits->addReplace(cpLength, 1);
86             }
87             return destIndex;
88         } else {
89             c=result;
90             length=U16_LENGTH(c);
91         }
92         if(edits!=NULL) {
93             edits->addReplace(cpLength, length);
94         }
95     }
96     if(length>(INT32_MAX-destIndex)) {
97         return -1;  // integer overflow
98     }
99 
100     if(destIndex<destCapacity) {
101         /* append the result */
102         if(c>=0) {
103             /* code point */
104             UBool isError=FALSE;
105             U16_APPEND(dest, destIndex, destCapacity, c, isError);
106             if(isError) {
107                 /* overflow, nothing written */
108                 destIndex+=length;
109             }
110         } else {
111             /* string */
112             if((destIndex+length)<=destCapacity) {
113                 while(length>0) {
114                     dest[destIndex++]=*s++;
115                     --length;
116                 }
117             } else {
118                 /* overflow */
119                 destIndex+=length;
120             }
121         }
122     } else {
123         /* preflight */
124         destIndex+=length;
125     }
126     return destIndex;
127 }
128 
129 inline int32_t
appendUChar(UChar * dest,int32_t destIndex,int32_t destCapacity,UChar c)130 appendUChar(UChar *dest, int32_t destIndex, int32_t destCapacity, UChar c) {
131     if(destIndex<destCapacity) {
132         dest[destIndex]=c;
133     } else if(destIndex==INT32_MAX) {
134         return -1;  // integer overflow
135     }
136     return destIndex+1;
137 }
138 
139 int32_t
appendNonEmptyUnchanged(UChar * dest,int32_t destIndex,int32_t destCapacity,const UChar * s,int32_t length,uint32_t options,icu::Edits * edits)140 appendNonEmptyUnchanged(UChar *dest, int32_t destIndex, int32_t destCapacity,
141                         const UChar *s, int32_t length, uint32_t options, icu::Edits *edits) {
142     if(edits!=NULL) {
143         edits->addUnchanged(length);
144     }
145     if(options & U_OMIT_UNCHANGED_TEXT) {
146         return destIndex;
147     }
148     if(length>(INT32_MAX-destIndex)) {
149         return -1;  // integer overflow
150     }
151     if((destIndex+length)<=destCapacity) {
152         u_memcpy(dest+destIndex, s, length);
153     }
154     return destIndex + length;
155 }
156 
157 inline int32_t
appendUnchanged(UChar * dest,int32_t destIndex,int32_t destCapacity,const UChar * s,int32_t length,uint32_t options,icu::Edits * edits)158 appendUnchanged(UChar *dest, int32_t destIndex, int32_t destCapacity,
159                 const UChar *s, int32_t length, uint32_t options, icu::Edits *edits) {
160     if (length <= 0) {
161         return destIndex;
162     }
163     return appendNonEmptyUnchanged(dest, destIndex, destCapacity, s, length, options, edits);
164 }
165 
166 UChar32 U_CALLCONV
utf16_caseContextIterator(void * context,int8_t dir)167 utf16_caseContextIterator(void *context, int8_t dir) {
168     UCaseContext *csc=(UCaseContext *)context;
169     UChar32 c;
170 
171     if(dir<0) {
172         /* reset for backward iteration */
173         csc->index=csc->cpStart;
174         csc->dir=dir;
175     } else if(dir>0) {
176         /* reset for forward iteration */
177         csc->index=csc->cpLimit;
178         csc->dir=dir;
179     } else {
180         /* continue current iteration direction */
181         dir=csc->dir;
182     }
183 
184     if(dir<0) {
185         if(csc->start<csc->index) {
186             U16_PREV((const UChar *)csc->p, csc->start, csc->index, c);
187             return c;
188         }
189     } else {
190         if(csc->index<csc->limit) {
191             U16_NEXT((const UChar *)csc->p, csc->index, csc->limit, c);
192             return c;
193         }
194     }
195     return U_SENTINEL;
196 }
197 
198 /**
199  * caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account.
200  * caseLocale < 0: Case-folds [srcStart..srcLimit[.
201  */
toLower(int32_t caseLocale,uint32_t options,UChar * dest,int32_t destCapacity,const UChar * src,UCaseContext * csc,int32_t srcStart,int32_t srcLimit,icu::Edits * edits,UErrorCode & errorCode)202 int32_t toLower(int32_t caseLocale, uint32_t options,
203                 UChar *dest, int32_t destCapacity,
204                 const UChar *src, UCaseContext *csc, int32_t srcStart, int32_t srcLimit,
205                 icu::Edits *edits, UErrorCode &errorCode) {
206     const int8_t *latinToLower;
207     if (caseLocale == UCASE_LOC_ROOT ||
208             (caseLocale >= 0 ?
209                 !(caseLocale == UCASE_LOC_TURKISH || caseLocale == UCASE_LOC_LITHUANIAN) :
210                 (options & _FOLD_CASE_OPTIONS_MASK) == U_FOLD_CASE_DEFAULT)) {
211         latinToLower = LatinCase::TO_LOWER_NORMAL;
212     } else {
213         latinToLower = LatinCase::TO_LOWER_TR_LT;
214     }
215     const UTrie2 *trie = ucase_getTrie();
216     int32_t destIndex = 0;
217     int32_t prev = srcStart;
218     int32_t srcIndex = srcStart;
219     for (;;) {
220         // fast path for simple cases
221         UChar lead = 0;
222         while (srcIndex < srcLimit) {
223             lead = src[srcIndex];
224             int32_t delta;
225             if (lead < LatinCase::LONG_S) {
226                 int8_t d = latinToLower[lead];
227                 if (d == LatinCase::EXC) { break; }
228                 ++srcIndex;
229                 if (d == 0) { continue; }
230                 delta = d;
231             } else if (lead >= 0xd800) {
232                 break;  // surrogate or higher
233             } else {
234                 uint16_t props = UTRIE2_GET16_FROM_U16_SINGLE_LEAD(trie, lead);
235                 if (UCASE_HAS_EXCEPTION(props)) { break; }
236                 ++srcIndex;
237                 if (!UCASE_IS_UPPER_OR_TITLE(props) || (delta = UCASE_GET_DELTA(props)) == 0) {
238                     continue;
239                 }
240             }
241             lead += static_cast<UChar>(delta);
242             destIndex = appendUnchanged(dest, destIndex, destCapacity,
243                                         src + prev, srcIndex - 1 - prev, options, edits);
244             if (destIndex >= 0) {
245                 destIndex = appendUChar(dest, destIndex, destCapacity, lead);
246                 if (edits != nullptr) {
247                     edits->addReplace(1, 1);
248                 }
249             }
250             if (destIndex < 0) {
251                 errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
252                 return 0;
253             }
254             prev = srcIndex;
255         }
256         if (srcIndex >= srcLimit) {
257             break;
258         }
259         // slow path
260         int32_t cpStart = srcIndex++;
261         UChar trail;
262         UChar32 c;
263         if (U16_IS_LEAD(lead) && srcIndex < srcLimit && U16_IS_TRAIL(trail = src[srcIndex])) {
264             c = U16_GET_SUPPLEMENTARY(lead, trail);
265             ++srcIndex;
266         } else {
267             c = lead;
268         }
269         const UChar *s;
270         if (caseLocale >= 0) {
271             csc->cpStart = cpStart;
272             csc->cpLimit = srcIndex;
273             c = ucase_toFullLower(c, utf16_caseContextIterator, csc, &s, caseLocale);
274         } else {
275             c = ucase_toFullFolding(c, &s, options);
276         }
277         if (c >= 0) {
278             destIndex = appendUnchanged(dest, destIndex, destCapacity,
279                                         src + prev, cpStart - prev, options, edits);
280             if (destIndex >= 0) {
281                 destIndex = appendResult(dest, destIndex, destCapacity, c, s,
282                                          srcIndex - cpStart, options, edits);
283             }
284             if (destIndex < 0) {
285                 errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
286                 return 0;
287             }
288             prev = srcIndex;
289         }
290     }
291     destIndex = appendUnchanged(dest, destIndex, destCapacity,
292                                 src + prev, srcIndex - prev, options, edits);
293     if (destIndex < 0) {
294         errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
295         return 0;
296     }
297     return destIndex;
298 }
299 
toUpper(int32_t caseLocale,uint32_t options,UChar * dest,int32_t destCapacity,const UChar * src,UCaseContext * csc,int32_t srcLength,icu::Edits * edits,UErrorCode & errorCode)300 int32_t toUpper(int32_t caseLocale, uint32_t options,
301                 UChar *dest, int32_t destCapacity,
302                 const UChar *src, UCaseContext *csc, int32_t srcLength,
303                 icu::Edits *edits, UErrorCode &errorCode) {
304     const int8_t *latinToUpper;
305     if (caseLocale == UCASE_LOC_TURKISH) {
306         latinToUpper = LatinCase::TO_UPPER_TR;
307     } else {
308         latinToUpper = LatinCase::TO_UPPER_NORMAL;
309     }
310     const UTrie2 *trie = ucase_getTrie();
311     int32_t destIndex = 0;
312     int32_t prev = 0;
313     int32_t srcIndex = 0;
314     for (;;) {
315         // fast path for simple cases
316         UChar lead = 0;
317         while (srcIndex < srcLength) {
318             lead = src[srcIndex];
319             int32_t delta;
320             if (lead < LatinCase::LONG_S) {
321                 int8_t d = latinToUpper[lead];
322                 if (d == LatinCase::EXC) { break; }
323                 ++srcIndex;
324                 if (d == 0) { continue; }
325                 delta = d;
326             } else if (lead >= 0xd800) {
327                 break;  // surrogate or higher
328             } else {
329                 uint16_t props = UTRIE2_GET16_FROM_U16_SINGLE_LEAD(trie, lead);
330                 if (UCASE_HAS_EXCEPTION(props)) { break; }
331                 ++srcIndex;
332                 if (UCASE_GET_TYPE(props) != UCASE_LOWER || (delta = UCASE_GET_DELTA(props)) == 0) {
333                     continue;
334                 }
335             }
336             lead += static_cast<UChar>(delta);
337             destIndex = appendUnchanged(dest, destIndex, destCapacity,
338                                         src + prev, srcIndex - 1 - prev, options, edits);
339             if (destIndex >= 0) {
340                 destIndex = appendUChar(dest, destIndex, destCapacity, lead);
341                 if (edits != nullptr) {
342                     edits->addReplace(1, 1);
343                 }
344             }
345             if (destIndex < 0) {
346                 errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
347                 return 0;
348             }
349             prev = srcIndex;
350         }
351         if (srcIndex >= srcLength) {
352             break;
353         }
354         // slow path
355         int32_t cpStart;
356         csc->cpStart = cpStart = srcIndex++;
357         UChar trail;
358         UChar32 c;
359         if (U16_IS_LEAD(lead) && srcIndex < srcLength && U16_IS_TRAIL(trail = src[srcIndex])) {
360             c = U16_GET_SUPPLEMENTARY(lead, trail);
361             ++srcIndex;
362         } else {
363             c = lead;
364         }
365         csc->cpLimit = srcIndex;
366         const UChar *s;
367         c = ucase_toFullUpper(c, utf16_caseContextIterator, csc, &s, caseLocale);
368         if (c >= 0) {
369             destIndex = appendUnchanged(dest, destIndex, destCapacity,
370                                         src + prev, cpStart - prev, options, edits);
371             if (destIndex >= 0) {
372                 destIndex = appendResult(dest, destIndex, destCapacity, c, s,
373                                          srcIndex - cpStart, options, edits);
374             }
375             if (destIndex < 0) {
376                 errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
377                 return 0;
378             }
379             prev = srcIndex;
380         }
381     }
382     destIndex = appendUnchanged(dest, destIndex, destCapacity,
383                                 src + prev, srcIndex - prev, options, edits);
384     if (destIndex < 0) {
385         errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
386         return 0;
387     }
388     return destIndex;
389 }
390 
391 }  // namespace
392 
393 U_NAMESPACE_END
394 
395 U_NAMESPACE_USE
396 
397 #if !UCONFIG_NO_BREAK_ITERATION
398 
399 U_CFUNC int32_t U_CALLCONV
ustrcase_internalToTitle(int32_t caseLocale,uint32_t options,BreakIterator * iter,UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,icu::Edits * edits,UErrorCode & errorCode)400 ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *iter,
401                          UChar *dest, int32_t destCapacity,
402                          const UChar *src, int32_t srcLength,
403                          icu::Edits *edits,
404                          UErrorCode &errorCode) {
405     if (!ustrcase_checkTitleAdjustmentOptions(options, errorCode)) {
406         return 0;
407     }
408 
409     /* set up local variables */
410     UCaseContext csc=UCASECONTEXT_INITIALIZER;
411     csc.p=(void *)src;
412     csc.limit=srcLength;
413     int32_t destIndex=0;
414     int32_t prev=0;
415     UBool isFirstIndex=TRUE;
416 
417     /* titlecasing loop */
418     while(prev<srcLength) {
419         /* find next index where to titlecase */
420         int32_t index;
421         if(isFirstIndex) {
422             isFirstIndex=FALSE;
423             index=iter->first();
424         } else {
425             index=iter->next();
426         }
427         if(index==UBRK_DONE || index>srcLength) {
428             index=srcLength;
429         }
430 
431         /*
432          * Segment [prev..index[ into 3 parts:
433          * a) skipped characters (copy as-is) [prev..titleStart[
434          * b) first letter (titlecase)              [titleStart..titleLimit[
435          * c) subsequent characters (lowercase)                 [titleLimit..index[
436          */
437         if(prev<index) {
438             // Find and copy skipped characters [prev..titleStart[
439             int32_t titleStart=prev;
440             int32_t titleLimit=prev;
441             UChar32 c;
442             U16_NEXT(src, titleLimit, index, c);
443             if ((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0) {
444                 // Adjust the titlecasing index to the next cased character,
445                 // or to the next letter/number/symbol/private use.
446                 // Stop with titleStart<titleLimit<=index
447                 // if there is a character to be titlecased,
448                 // or else stop with titleStart==titleLimit==index.
449                 UBool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0;
450                 while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) {
451                     titleStart=titleLimit;
452                     if(titleLimit==index) {
453                         break;
454                     }
455                     U16_NEXT(src, titleLimit, index, c);
456                 }
457                 if (prev < titleStart) {
458                     destIndex=appendUnchanged(dest, destIndex, destCapacity,
459                                               src+prev, titleStart-prev, options, edits);
460                     if(destIndex<0) {
461                         errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
462                         return 0;
463                     }
464                 }
465             }
466 
467             if(titleStart<titleLimit) {
468                 /* titlecase c which is from [titleStart..titleLimit[ */
469                 csc.cpStart=titleStart;
470                 csc.cpLimit=titleLimit;
471                 const UChar *s;
472                 c=ucase_toFullTitle(c, utf16_caseContextIterator, &csc, &s, caseLocale);
473                 destIndex=appendResult(dest, destIndex, destCapacity, c, s,
474                                        titleLimit-titleStart, options, edits);
475                 if(destIndex<0) {
476                     errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
477                     return 0;
478                 }
479 
480                 /* Special case Dutch IJ titlecasing */
481                 if (titleStart+1 < index &&
482                         caseLocale == UCASE_LOC_DUTCH &&
483                         (src[titleStart] == 0x0049 || src[titleStart] == 0x0069)) {
484                     if (src[titleStart+1] == 0x006A) {
485                         destIndex=appendUChar(dest, destIndex, destCapacity, 0x004A);
486                         if(destIndex<0) {
487                             errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
488                             return 0;
489                         }
490                         if(edits!=NULL) {
491                             edits->addReplace(1, 1);
492                         }
493                         titleLimit++;
494                     } else if (src[titleStart+1] == 0x004A) {
495                         // Keep the capital J from getting lowercased.
496                         destIndex=appendUnchanged(dest, destIndex, destCapacity,
497                                                   src+titleStart+1, 1, options, edits);
498                         if(destIndex<0) {
499                             errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
500                             return 0;
501                         }
502                         titleLimit++;
503                     }
504                 }
505 
506                 /* lowercase [titleLimit..index[ */
507                 if(titleLimit<index) {
508                     if((options&U_TITLECASE_NO_LOWERCASE)==0) {
509                         /* Normal operation: Lowercase the rest of the word. */
510                         destIndex+=
511                             toLower(
512                                 caseLocale, options,
513                                 dest+destIndex, destCapacity-destIndex,
514                                 src, &csc, titleLimit, index,
515                                 edits, errorCode);
516                         if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
517                             errorCode=U_ZERO_ERROR;
518                         }
519                         if(U_FAILURE(errorCode)) {
520                             return destIndex;
521                         }
522                     } else {
523                         /* Optionally just copy the rest of the word unchanged. */
524                         destIndex=appendUnchanged(dest, destIndex, destCapacity,
525                                                   src+titleLimit, index-titleLimit, options, edits);
526                         if(destIndex<0) {
527                             errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
528                             return 0;
529                         }
530                     }
531                 }
532             }
533         }
534 
535         prev=index;
536     }
537 
538     return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
539 }
540 
541 #endif  // !UCONFIG_NO_BREAK_ITERATION
542 
543 U_NAMESPACE_BEGIN
544 namespace GreekUpper {
545 
546 // Data generated by prototype code, see
547 // http://site.icu-project.org/design/case/greek-upper
548 // TODO: Move this data into ucase.icu.
// Table for the code points U+0370..03FF, in code point order:
// The entry at index i belongs to U+0370+i, for a total of 144 entries.
// Each entry is either 0 or a Greek-block code point value combined with
// HAS_... flag bits (HAS_VOWEL, HAS_ACCENT, HAS_DIALYTIKA), which are defined
// outside of this table.
// NOTE(review): The code point part looks like the uppercase base letter,
// and 0 looks like "no data for this code point" -- confirm with the code
// that reads this table.
static const uint16_t data0370[] = {
    // U+0370..03FF
    0x0370,
    0x0370,
    0x0372,
    0x0372,
    0,
    0,
    0x0376,
    0x0376,
    0,
    0,
    0x037A,
    0x03FD,
    0x03FE,
    0x03FF,
    0,
    0x037F,
    // U+0380
    0,
    0,
    0,
    0,
    0,
    0,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    // U+0390
    0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    0x0391 | HAS_VOWEL,
    0x0392,
    0x0393,
    0x0394,
    0x0395 | HAS_VOWEL,
    0x0396,
    0x0397 | HAS_VOWEL,
    0x0398,
    0x0399 | HAS_VOWEL,
    0x039A,
    0x039B,
    0x039C,
    0x039D,
    0x039E,
    0x039F | HAS_VOWEL,
    // U+03A0
    0x03A0,
    0x03A1,
    0,
    0x03A3,
    0x03A4,
    0x03A5 | HAS_VOWEL,
    0x03A6,
    0x03A7,
    0x03A8,
    0x03A9 | HAS_VOWEL,
    0x0399 | HAS_VOWEL | HAS_DIALYTIKA,
    0x03A5 | HAS_VOWEL | HAS_DIALYTIKA,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    // U+03B0
    0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    0x0391 | HAS_VOWEL,
    0x0392,
    0x0393,
    0x0394,
    0x0395 | HAS_VOWEL,
    0x0396,
    0x0397 | HAS_VOWEL,
    0x0398,
    0x0399 | HAS_VOWEL,
    0x039A,
    0x039B,
    0x039C,
    0x039D,
    0x039E,
    0x039F | HAS_VOWEL,
    // U+03C0
    0x03A0,
    0x03A1,
    0x03A3,
    0x03A3,
    0x03A4,
    0x03A5 | HAS_VOWEL,
    0x03A6,
    0x03A7,
    0x03A8,
    0x03A9 | HAS_VOWEL,
    0x0399 | HAS_VOWEL | HAS_DIALYTIKA,
    0x03A5 | HAS_VOWEL | HAS_DIALYTIKA,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03CF,
    // U+03D0
    0x0392,
    0x0398,
    0x03D2,
    0x03D2 | HAS_ACCENT,
    0x03D2 | HAS_DIALYTIKA,
    0x03A6,
    0x03A0,
    0x03CF,
    0x03D8,
    0x03D8,
    0x03DA,
    0x03DA,
    0x03DC,
    0x03DC,
    0x03DE,
    0x03DE,
    // U+03E0
    0x03E0,
    0x03E0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    // U+03F0
    0x039A,
    0x03A1,
    0x03F9,
    0x037F,
    0x03F4,
    0x0395 | HAS_VOWEL,
    0,
    0x03F7,
    0x03F7,
    0x03F9,
    0x03FA,
    0x03FA,
    0x03FC,
    0x03FD,
    0x03FE,
    0x03FF,
};
696 
697 static const uint16_t data1F00[] = {
698     // U+1F00..1FFF
699     0x0391 | HAS_VOWEL,
700     0x0391 | HAS_VOWEL,
701     0x0391 | HAS_VOWEL | HAS_ACCENT,
702     0x0391 | HAS_VOWEL | HAS_ACCENT,
703     0x0391 | HAS_VOWEL | HAS_ACCENT,
704     0x0391 | HAS_VOWEL | HAS_ACCENT,
705     0x0391 | HAS_VOWEL | HAS_ACCENT,
706     0x0391 | HAS_VOWEL | HAS_ACCENT,
707     0x0391 | HAS_VOWEL,
708     0x0391 | HAS_VOWEL,
709     0x0391 | HAS_VOWEL | HAS_ACCENT,
710     0x0391 | HAS_VOWEL | HAS_ACCENT,
711     0x0391 | HAS_VOWEL | HAS_ACCENT,
712     0x0391 | HAS_VOWEL | HAS_ACCENT,
713     0x0391 | HAS_VOWEL | HAS_ACCENT,
714     0x0391 | HAS_VOWEL | HAS_ACCENT,
715     0x0395 | HAS_VOWEL,
716     0x0395 | HAS_VOWEL,
717     0x0395 | HAS_VOWEL | HAS_ACCENT,
718     0x0395 | HAS_VOWEL | HAS_ACCENT,
719     0x0395 | HAS_VOWEL | HAS_ACCENT,
720     0x0395 | HAS_VOWEL | HAS_ACCENT,
721     0,
722     0,
723     0x0395 | HAS_VOWEL,
724     0x0395 | HAS_VOWEL,
725     0x0395 | HAS_VOWEL | HAS_ACCENT,
726     0x0395 | HAS_VOWEL | HAS_ACCENT,
727     0x0395 | HAS_VOWEL | HAS_ACCENT,
728     0x0395 | HAS_VOWEL | HAS_ACCENT,
729     0,
730     0,
731     0x0397 | HAS_VOWEL,
732     0x0397 | HAS_VOWEL,
733     0x0397 | HAS_VOWEL | HAS_ACCENT,
734     0x0397 | HAS_VOWEL | HAS_ACCENT,
735     0x0397 | HAS_VOWEL | HAS_ACCENT,
736     0x0397 | HAS_VOWEL | HAS_ACCENT,
737     0x0397 | HAS_VOWEL | HAS_ACCENT,
738     0x0397 | HAS_VOWEL | HAS_ACCENT,
739     0x0397 | HAS_VOWEL,
740     0x0397 | HAS_VOWEL,
741     0x0397 | HAS_VOWEL | HAS_ACCENT,
742     0x0397 | HAS_VOWEL | HAS_ACCENT,
743     0x0397 | HAS_VOWEL | HAS_ACCENT,
744     0x0397 | HAS_VOWEL | HAS_ACCENT,
745     0x0397 | HAS_VOWEL | HAS_ACCENT,
746     0x0397 | HAS_VOWEL | HAS_ACCENT,
747     0x0399 | HAS_VOWEL,
748     0x0399 | HAS_VOWEL,
749     0x0399 | HAS_VOWEL | HAS_ACCENT,
750     0x0399 | HAS_VOWEL | HAS_ACCENT,
751     0x0399 | HAS_VOWEL | HAS_ACCENT,
752     0x0399 | HAS_VOWEL | HAS_ACCENT,
753     0x0399 | HAS_VOWEL | HAS_ACCENT,
754     0x0399 | HAS_VOWEL | HAS_ACCENT,
755     0x0399 | HAS_VOWEL,
756     0x0399 | HAS_VOWEL,
757     0x0399 | HAS_VOWEL | HAS_ACCENT,
758     0x0399 | HAS_VOWEL | HAS_ACCENT,
759     0x0399 | HAS_VOWEL | HAS_ACCENT,
760     0x0399 | HAS_VOWEL | HAS_ACCENT,
761     0x0399 | HAS_VOWEL | HAS_ACCENT,
762     0x0399 | HAS_VOWEL | HAS_ACCENT,
763     0x039F | HAS_VOWEL,
764     0x039F | HAS_VOWEL,
765     0x039F | HAS_VOWEL | HAS_ACCENT,
766     0x039F | HAS_VOWEL | HAS_ACCENT,
767     0x039F | HAS_VOWEL | HAS_ACCENT,
768     0x039F | HAS_VOWEL | HAS_ACCENT,
769     0,
770     0,
771     0x039F | HAS_VOWEL,
772     0x039F | HAS_VOWEL,
773     0x039F | HAS_VOWEL | HAS_ACCENT,
774     0x039F | HAS_VOWEL | HAS_ACCENT,
775     0x039F | HAS_VOWEL | HAS_ACCENT,
776     0x039F | HAS_VOWEL | HAS_ACCENT,
777     0,
778     0,
779     0x03A5 | HAS_VOWEL,
780     0x03A5 | HAS_VOWEL,
781     0x03A5 | HAS_VOWEL | HAS_ACCENT,
782     0x03A5 | HAS_VOWEL | HAS_ACCENT,
783     0x03A5 | HAS_VOWEL | HAS_ACCENT,
784     0x03A5 | HAS_VOWEL | HAS_ACCENT,
785     0x03A5 | HAS_VOWEL | HAS_ACCENT,
786     0x03A5 | HAS_VOWEL | HAS_ACCENT,
787     0,
788     0x03A5 | HAS_VOWEL,
789     0,
790     0x03A5 | HAS_VOWEL | HAS_ACCENT,
791     0,
792     0x03A5 | HAS_VOWEL | HAS_ACCENT,
793     0,
794     0x03A5 | HAS_VOWEL | HAS_ACCENT,
795     0x03A9 | HAS_VOWEL,
796     0x03A9 | HAS_VOWEL,
797     0x03A9 | HAS_VOWEL | HAS_ACCENT,
798     0x03A9 | HAS_VOWEL | HAS_ACCENT,
799     0x03A9 | HAS_VOWEL | HAS_ACCENT,
800     0x03A9 | HAS_VOWEL | HAS_ACCENT,
801     0x03A9 | HAS_VOWEL | HAS_ACCENT,
802     0x03A9 | HAS_VOWEL | HAS_ACCENT,
803     0x03A9 | HAS_VOWEL,
804     0x03A9 | HAS_VOWEL,
805     0x03A9 | HAS_VOWEL | HAS_ACCENT,
806     0x03A9 | HAS_VOWEL | HAS_ACCENT,
807     0x03A9 | HAS_VOWEL | HAS_ACCENT,
808     0x03A9 | HAS_VOWEL | HAS_ACCENT,
809     0x03A9 | HAS_VOWEL | HAS_ACCENT,
810     0x03A9 | HAS_VOWEL | HAS_ACCENT,
811     0x0391 | HAS_VOWEL | HAS_ACCENT,
812     0x0391 | HAS_VOWEL | HAS_ACCENT,
813     0x0395 | HAS_VOWEL | HAS_ACCENT,
814     0x0395 | HAS_VOWEL | HAS_ACCENT,
815     0x0397 | HAS_VOWEL | HAS_ACCENT,
816     0x0397 | HAS_VOWEL | HAS_ACCENT,
817     0x0399 | HAS_VOWEL | HAS_ACCENT,
818     0x0399 | HAS_VOWEL | HAS_ACCENT,
819     0x039F | HAS_VOWEL | HAS_ACCENT,
820     0x039F | HAS_VOWEL | HAS_ACCENT,
821     0x03A5 | HAS_VOWEL | HAS_ACCENT,
822     0x03A5 | HAS_VOWEL | HAS_ACCENT,
823     0x03A9 | HAS_VOWEL | HAS_ACCENT,
824     0x03A9 | HAS_VOWEL | HAS_ACCENT,
825     0,
826     0,
827     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
828     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
829     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
830     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
831     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
832     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
833     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
834     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
835     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
836     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
837     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
838     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
839     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
840     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
841     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
842     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
843     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
844     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
845     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
846     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
847     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
848     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
849     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
850     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
851     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
852     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
853     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
854     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
855     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
856     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
857     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
858     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
859     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
860     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
861     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
862     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
863     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
864     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
865     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
866     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
867     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
868     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
869     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
870     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
871     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
872     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
873     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
874     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
875     0x0391 | HAS_VOWEL,
876     0x0391 | HAS_VOWEL,
877     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
878     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
879     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
880     0,
881     0x0391 | HAS_VOWEL | HAS_ACCENT,
882     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
883     0x0391 | HAS_VOWEL,
884     0x0391 | HAS_VOWEL,
885     0x0391 | HAS_VOWEL | HAS_ACCENT,
886     0x0391 | HAS_VOWEL | HAS_ACCENT,
887     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
888     0,
889     0x0399 | HAS_VOWEL,
890     0,
891     0,
892     0,
893     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
894     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
895     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
896     0,
897     0x0397 | HAS_VOWEL | HAS_ACCENT,
898     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
899     0x0395 | HAS_VOWEL | HAS_ACCENT,
900     0x0395 | HAS_VOWEL | HAS_ACCENT,
901     0x0397 | HAS_VOWEL | HAS_ACCENT,
902     0x0397 | HAS_VOWEL | HAS_ACCENT,
903     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
904     0,
905     0,
906     0,
907     0x0399 | HAS_VOWEL,
908     0x0399 | HAS_VOWEL,
909     0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
910     0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
911     0,
912     0,
913     0x0399 | HAS_VOWEL | HAS_ACCENT,
914     0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
915     0x0399 | HAS_VOWEL,
916     0x0399 | HAS_VOWEL,
917     0x0399 | HAS_VOWEL | HAS_ACCENT,
918     0x0399 | HAS_VOWEL | HAS_ACCENT,
919     0,
920     0,
921     0,
922     0,
923     0x03A5 | HAS_VOWEL,
924     0x03A5 | HAS_VOWEL,
925     0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
926     0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
927     0x03A1,
928     0x03A1,
929     0x03A5 | HAS_VOWEL | HAS_ACCENT,
930     0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
931     0x03A5 | HAS_VOWEL,
932     0x03A5 | HAS_VOWEL,
933     0x03A5 | HAS_VOWEL | HAS_ACCENT,
934     0x03A5 | HAS_VOWEL | HAS_ACCENT,
935     0x03A1,
936     0,
937     0,
938     0,
939     0,
940     0,
941     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
942     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
943     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
944     0,
945     0x03A9 | HAS_VOWEL | HAS_ACCENT,
946     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
947     0x039F | HAS_VOWEL | HAS_ACCENT,
948     0x039F | HAS_VOWEL | HAS_ACCENT,
949     0x03A9 | HAS_VOWEL | HAS_ACCENT,
950     0x03A9 | HAS_VOWEL | HAS_ACCENT,
951     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
952     0,
953     0,
954     0,
955 };
956 
957 // U+2126 Ohm sign
958 static const uint16_t data2126 = 0x03A9 | HAS_VOWEL;
959 
getLetterData(UChar32 c)960 uint32_t getLetterData(UChar32 c) {
961     if (c < 0x370 || 0x2126 < c || (0x3ff < c && c < 0x1f00)) {
962         return 0;
963     } else if (c <= 0x3ff) {
964         return data0370[c - 0x370];
965     } else if (c <= 0x1fff) {
966         return data1F00[c - 0x1f00];
967     } else if (c == 0x2126) {
968         return data2126;
969     } else {
970         return 0;
971     }
972 }
973 
getDiacriticData(UChar32 c)974 uint32_t getDiacriticData(UChar32 c) {
975     switch (c) {
976     case 0x0300:  // varia
977     case 0x0301:  // tonos = oxia
978     case 0x0342:  // perispomeni
979     case 0x0302:  // circumflex can look like perispomeni
980     case 0x0303:  // tilde can look like perispomeni
981     case 0x0311:  // inverted breve can look like perispomeni
982         return HAS_ACCENT;
983     case 0x0308:  // dialytika = diaeresis
984         return HAS_COMBINING_DIALYTIKA;
985     case 0x0344:  // dialytika tonos
986         return HAS_COMBINING_DIALYTIKA | HAS_ACCENT;
987     case 0x0345:  // ypogegrammeni = iota subscript
988         return HAS_YPOGEGRAMMENI;
989     case 0x0304:  // macron
990     case 0x0306:  // breve
991     case 0x0313:  // comma above
992     case 0x0314:  // reversed comma above
993     case 0x0343:  // koronis
994         return HAS_OTHER_GREEK_DIACRITIC;
995     default:
996         return 0;
997     }
998 }
999 
isFollowedByCasedLetter(const UChar * s,int32_t i,int32_t length)1000 UBool isFollowedByCasedLetter(const UChar *s, int32_t i, int32_t length) {
1001     while (i < length) {
1002         UChar32 c;
1003         U16_NEXT(s, i, length, c);
1004         int32_t type = ucase_getTypeOrIgnorable(c);
1005         if ((type & UCASE_IGNORABLE) != 0) {
1006             // Case-ignorable, continue with the loop.
1007         } else if (type != UCASE_NONE) {
1008             return TRUE;  // Followed by cased letter.
1009         } else {
1010             return FALSE;  // Uncased and not case-ignorable.
1011         }
1012     }
1013     return FALSE;  // Not followed by cased letter.
1014 }
1015 
1016 /**
1017  * Greek string uppercasing with a state machine.
1018  * Probably simpler than a stateless function that has to figure out complex context-before
1019  * for each character.
1020  * TODO: Try to re-consolidate one way or another with the non-Greek function.
1021  */
toUpper(uint32_t options,UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,Edits * edits,UErrorCode & errorCode)1022 int32_t toUpper(uint32_t options,
1023                 UChar *dest, int32_t destCapacity,
1024                 const UChar *src, int32_t srcLength,
1025                 Edits *edits,
1026                 UErrorCode &errorCode) {
1027     int32_t destIndex=0;
1028     uint32_t state = 0;
1029     for (int32_t i = 0; i < srcLength;) {
1030         int32_t nextIndex = i;
1031         UChar32 c;
1032         U16_NEXT(src, nextIndex, srcLength, c);
1033         uint32_t nextState = 0;
1034         int32_t type = ucase_getTypeOrIgnorable(c);
1035         if ((type & UCASE_IGNORABLE) != 0) {
1036             // c is case-ignorable
1037             nextState |= (state & AFTER_CASED);
1038         } else if (type != UCASE_NONE) {
1039             // c is cased
1040             nextState |= AFTER_CASED;
1041         }
1042         uint32_t data = getLetterData(c);
1043         if (data > 0) {
1044             uint32_t upper = data & UPPER_MASK;
1045             // Add a dialytika to this iota or ypsilon vowel
1046             // if we removed a tonos from the previous vowel,
1047             // and that previous vowel did not also have (or gain) a dialytika.
1048             // Adding one only to the final vowel in a longer sequence
1049             // (which does not occur in normal writing) would require lookahead.
1050             // Set the same flag as for preserving an existing dialytika.
1051             if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 &&
1052                     (upper == 0x399 || upper == 0x3A5)) {
1053                 data |= HAS_DIALYTIKA;
1054             }
1055             int32_t numYpogegrammeni = 0;  // Map each one to a trailing, spacing, capital iota.
1056             if ((data & HAS_YPOGEGRAMMENI) != 0) {
1057                 numYpogegrammeni = 1;
1058             }
1059             // Skip combining diacritics after this Greek letter.
1060             while (nextIndex < srcLength) {
1061                 uint32_t diacriticData = getDiacriticData(src[nextIndex]);
1062                 if (diacriticData != 0) {
1063                     data |= diacriticData;
1064                     if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {
1065                         ++numYpogegrammeni;
1066                     }
1067                     ++nextIndex;
1068                 } else {
1069                     break;  // not a Greek diacritic
1070                 }
1071             }
1072             if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
1073                 nextState |= AFTER_VOWEL_WITH_ACCENT;
1074             }
1075             // Map according to Greek rules.
1076             UBool addTonos = FALSE;
1077             if (upper == 0x397 &&
1078                     (data & HAS_ACCENT) != 0 &&
1079                     numYpogegrammeni == 0 &&
1080                     (state & AFTER_CASED) == 0 &&
1081                     !isFollowedByCasedLetter(src, nextIndex, srcLength)) {
1082                 // Keep disjunctive "or" with (only) a tonos.
1083                 // We use the same "word boundary" conditions as for the Final_Sigma test.
1084                 if (i == nextIndex) {
1085                     upper = 0x389;  // Preserve the precomposed form.
1086                 } else {
1087                     addTonos = TRUE;
1088                 }
1089             } else if ((data & HAS_DIALYTIKA) != 0) {
1090                 // Preserve a vowel with dialytika in precomposed form if it exists.
1091                 if (upper == 0x399) {
1092                     upper = 0x3AA;
1093                     data &= ~HAS_EITHER_DIALYTIKA;
1094                 } else if (upper == 0x3A5) {
1095                     upper = 0x3AB;
1096                     data &= ~HAS_EITHER_DIALYTIKA;
1097                 }
1098             }
1099 
1100             UBool change;
1101             if (edits == nullptr && (options & U_OMIT_UNCHANGED_TEXT) == 0) {
1102                 change = TRUE;  // common, simple usage
1103             } else {
1104                 // Find out first whether we are changing the text.
1105                 change = src[i] != upper || numYpogegrammeni > 0;
1106                 int32_t i2 = i + 1;
1107                 if ((data & HAS_EITHER_DIALYTIKA) != 0) {
1108                     change |= i2 >= nextIndex || src[i2] != 0x308;
1109                     ++i2;
1110                 }
1111                 if (addTonos) {
1112                     change |= i2 >= nextIndex || src[i2] != 0x301;
1113                     ++i2;
1114                 }
1115                 int32_t oldLength = nextIndex - i;
1116                 int32_t newLength = (i2 - i) + numYpogegrammeni;
1117                 change |= oldLength != newLength;
1118                 if (change) {
1119                     if (edits != NULL) {
1120                         edits->addReplace(oldLength, newLength);
1121                     }
1122                 } else {
1123                     if (edits != NULL) {
1124                         edits->addUnchanged(oldLength);
1125                     }
1126                     // Write unchanged text?
1127                     change = (options & U_OMIT_UNCHANGED_TEXT) == 0;
1128                 }
1129             }
1130 
1131             if (change) {
1132                 destIndex=appendUChar(dest, destIndex, destCapacity, (UChar)upper);
1133                 if (destIndex >= 0 && (data & HAS_EITHER_DIALYTIKA) != 0) {
1134                     destIndex=appendUChar(dest, destIndex, destCapacity, 0x308);  // restore or add a dialytika
1135                 }
1136                 if (destIndex >= 0 && addTonos) {
1137                     destIndex=appendUChar(dest, destIndex, destCapacity, 0x301);
1138                 }
1139                 while (destIndex >= 0 && numYpogegrammeni > 0) {
1140                     destIndex=appendUChar(dest, destIndex, destCapacity, 0x399);
1141                     --numYpogegrammeni;
1142                 }
1143                 if(destIndex<0) {
1144                     errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1145                     return 0;
1146                 }
1147             }
1148         } else {
1149             const UChar *s;
1150             c=ucase_toFullUpper(c, NULL, NULL, &s, UCASE_LOC_GREEK);
1151             destIndex = appendResult(dest, destIndex, destCapacity, c, s,
1152                                      nextIndex - i, options, edits);
1153             if (destIndex < 0) {
1154                 errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
1155                 return 0;
1156             }
1157         }
1158         i = nextIndex;
1159         state = nextState;
1160     }
1161 
1162     return destIndex;
1163 }
1164 
1165 }  // namespace GreekUpper
1166 U_NAMESPACE_END
1167 
1168 /* functions available in the common library (for unistr_case.cpp) */
1169 
1170 U_CFUNC int32_t U_CALLCONV
ustrcase_internalToLower(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_UNUSED UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,icu::Edits * edits,UErrorCode & errorCode)1171 ustrcase_internalToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
1172                          UChar *dest, int32_t destCapacity,
1173                          const UChar *src, int32_t srcLength,
1174                          icu::Edits *edits,
1175                          UErrorCode &errorCode) {
1176     UCaseContext csc=UCASECONTEXT_INITIALIZER;
1177     csc.p=(void *)src;
1178     csc.limit=srcLength;
1179     int32_t destIndex = toLower(
1180         caseLocale, options,
1181         dest, destCapacity,
1182         src, &csc, 0, srcLength,
1183         edits, errorCode);
1184     return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
1185 }
1186 
1187 U_CFUNC int32_t U_CALLCONV
ustrcase_internalToUpper(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_UNUSED UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,icu::Edits * edits,UErrorCode & errorCode)1188 ustrcase_internalToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
1189                          UChar *dest, int32_t destCapacity,
1190                          const UChar *src, int32_t srcLength,
1191                          icu::Edits *edits,
1192                          UErrorCode &errorCode) {
1193     int32_t destIndex;
1194     if (caseLocale == UCASE_LOC_GREEK) {
1195         destIndex = GreekUpper::toUpper(options, dest, destCapacity,
1196                                         src, srcLength, edits, errorCode);
1197     } else {
1198         UCaseContext csc=UCASECONTEXT_INITIALIZER;
1199         csc.p=(void *)src;
1200         csc.limit=srcLength;
1201         destIndex = toUpper(
1202             caseLocale, options,
1203             dest, destCapacity,
1204             src, &csc, srcLength,
1205             edits, errorCode);
1206     }
1207     return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
1208 }
1209 
1210 U_CFUNC int32_t U_CALLCONV
ustrcase_internalFold(int32_t,uint32_t options,UCASEMAP_BREAK_ITERATOR_UNUSED UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,icu::Edits * edits,UErrorCode & errorCode)1211 ustrcase_internalFold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
1212                       UChar *dest, int32_t destCapacity,
1213                       const UChar *src, int32_t srcLength,
1214                       icu::Edits *edits,
1215                       UErrorCode &errorCode) {
1216     int32_t destIndex = toLower(
1217         -1, options,
1218         dest, destCapacity,
1219         src, nullptr, 0, srcLength,
1220         edits, errorCode);
1221     return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
1222 }
1223 
1224 U_CFUNC int32_t
ustrcase_map(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_PARAM UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,UStringCaseMapper * stringCaseMapper,icu::Edits * edits,UErrorCode & errorCode)1225 ustrcase_map(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
1226              UChar *dest, int32_t destCapacity,
1227              const UChar *src, int32_t srcLength,
1228              UStringCaseMapper *stringCaseMapper,
1229              icu::Edits *edits,
1230              UErrorCode &errorCode) {
1231     int32_t destLength;
1232 
1233     /* check argument values */
1234     if(U_FAILURE(errorCode)) {
1235         return 0;
1236     }
1237     if( destCapacity<0 ||
1238         (dest==NULL && destCapacity>0) ||
1239         src==NULL ||
1240         srcLength<-1
1241     ) {
1242         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
1243         return 0;
1244     }
1245 
1246     /* get the string length */
1247     if(srcLength==-1) {
1248         srcLength=u_strlen(src);
1249     }
1250 
1251     /* check for overlapping source and destination */
1252     if( dest!=NULL &&
1253         ((src>=dest && src<(dest+destCapacity)) ||
1254          (dest>=src && dest<(src+srcLength)))
1255     ) {
1256         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
1257         return 0;
1258     }
1259 
1260     if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
1261         edits->reset();
1262     }
1263     destLength=stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
1264                                 dest, destCapacity, src, srcLength, edits, errorCode);
1265     return u_terminateUChars(dest, destCapacity, destLength, &errorCode);
1266 }
1267 
1268 U_CFUNC int32_t
ustrcase_mapWithOverlap(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_PARAM UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,UStringCaseMapper * stringCaseMapper,UErrorCode & errorCode)1269 ustrcase_mapWithOverlap(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
1270                         UChar *dest, int32_t destCapacity,
1271                         const UChar *src, int32_t srcLength,
1272                         UStringCaseMapper *stringCaseMapper,
1273                         UErrorCode &errorCode) {
1274     UChar buffer[300];
1275     UChar *temp;
1276 
1277     int32_t destLength;
1278 
1279     /* check argument values */
1280     if(U_FAILURE(errorCode)) {
1281         return 0;
1282     }
1283     if( destCapacity<0 ||
1284         (dest==NULL && destCapacity>0) ||
1285         src==NULL ||
1286         srcLength<-1
1287     ) {
1288         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
1289         return 0;
1290     }
1291 
1292     /* get the string length */
1293     if(srcLength==-1) {
1294         srcLength=u_strlen(src);
1295     }
1296 
1297     /* check for overlapping source and destination */
1298     if( dest!=NULL &&
1299         ((src>=dest && src<(dest+destCapacity)) ||
1300          (dest>=src && dest<(src+srcLength)))
1301     ) {
1302         /* overlap: provide a temporary destination buffer and later copy the result */
1303         if(destCapacity<=UPRV_LENGTHOF(buffer)) {
1304             /* the stack buffer is large enough */
1305             temp=buffer;
1306         } else {
1307             /* allocate a buffer */
1308             temp=(UChar *)uprv_malloc(destCapacity*U_SIZEOF_UCHAR);
1309             if(temp==NULL) {
1310                 errorCode=U_MEMORY_ALLOCATION_ERROR;
1311                 return 0;
1312             }
1313         }
1314     } else {
1315         temp=dest;
1316     }
1317 
1318     destLength=stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
1319                                 temp, destCapacity, src, srcLength, NULL, errorCode);
1320     if(temp!=dest) {
1321         /* copy the result string to the destination buffer */
1322         if (U_SUCCESS(errorCode) && 0 < destLength && destLength <= destCapacity) {
1323             u_memmove(dest, temp, destLength);
1324         }
1325         if(temp!=buffer) {
1326             uprv_free(temp);
1327         }
1328     }
1329 
1330     return u_terminateUChars(dest, destCapacity, destLength, &errorCode);
1331 }
1332 
1333 /* public API functions */
1334 
1335 U_CAPI int32_t U_EXPORT2
u_strFoldCase(UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,uint32_t options,UErrorCode * pErrorCode)1336 u_strFoldCase(UChar *dest, int32_t destCapacity,
1337               const UChar *src, int32_t srcLength,
1338               uint32_t options,
1339               UErrorCode *pErrorCode) {
1340     return ustrcase_mapWithOverlap(
1341         UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
1342         dest, destCapacity,
1343         src, srcLength,
1344         ustrcase_internalFold, *pErrorCode);
1345 }
1346 
1347 U_NAMESPACE_BEGIN
1348 
fold(uint32_t options,const UChar * src,int32_t srcLength,UChar * dest,int32_t destCapacity,Edits * edits,UErrorCode & errorCode)1349 int32_t CaseMap::fold(
1350         uint32_t options,
1351         const UChar *src, int32_t srcLength,
1352         UChar *dest, int32_t destCapacity, Edits *edits,
1353         UErrorCode &errorCode) {
1354     return ustrcase_map(
1355         UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
1356         dest, destCapacity,
1357         src, srcLength,
1358         ustrcase_internalFold, edits, errorCode);
1359 }
1360 
1361 U_NAMESPACE_END
1362 
1363 /* case-insensitive string comparisons -------------------------------------- */
1364 
1365 /*
1366  * This function is a copy of unorm_cmpEquivFold() minus the parts for
1367  * canonical equivalence.
1368  * Keep the functions in sync, and see there for how this works.
1369  * The duplication is for modularization:
1370  * It makes caseless (but not canonical caseless) matches independent of
1371  * the normalization code.
1372  */
1373 
1374 /* stack element for previous-level source/decomposition pointers */
1375 struct CmpEquivLevel {
1376     const UChar *start, *s, *limit;
1377 };
1378 typedef struct CmpEquivLevel CmpEquivLevel;
1379 
/**
 * Internal implementation for comparing two strings under case folding.
 * This function is called from u_strcmpFold() and u_caseInsensitivePrefixMatch().
 *
 * @param s1            input string 1
 * @param length1       length of string 1, or -1 (NUL-terminated)
 * @param s2            input string 2
 * @param length2       length of string 2, or -1 (NUL-terminated)
 * @param options       compare options
 * @param matchLen1     (output) length of partial prefix match in s1
 * @param matchLen2     (output) length of partial prefix match in s2
 * @param pErrorCode    receives error status
 * @return The result of the comparison
 */
_cmpFold(const UChar * s1,int32_t length1,const UChar * s2,int32_t length2,uint32_t options,int32_t * matchLen1,int32_t * matchLen2,UErrorCode * pErrorCode)1394 static int32_t _cmpFold(
1395             const UChar *s1, int32_t length1,
1396             const UChar *s2, int32_t length2,
1397             uint32_t options,
1398             int32_t *matchLen1, int32_t *matchLen2,
1399             UErrorCode *pErrorCode) {
1400     int32_t cmpRes = 0;
1401 
1402     /* current-level start/limit - s1/s2 as current */
1403     const UChar *start1, *start2, *limit1, *limit2;
1404 
1405     /* points to the original start address */
1406     const UChar *org1, *org2;
1407 
1408     /* points to the end of match + 1 */
1409     const UChar *m1, *m2;
1410 
1411     /* case folding variables */
1412     const UChar *p;
1413     int32_t length;
1414 
1415     /* stacks of previous-level start/current/limit */
1416     CmpEquivLevel stack1[2], stack2[2];
1417 
1418     /* case folding buffers, only use current-level start/limit */
1419     UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1];
1420 
1421     /* track which is the current level per string */
1422     int32_t level1, level2;
1423 
1424     /* current code units, and code points for lookups */
1425     UChar32 c1, c2, cp1, cp2;
1426 
1427     /* no argument error checking because this itself is not an API */
1428 
1429     /*
1430      * assume that at least the option U_COMPARE_IGNORE_CASE is set
1431      * otherwise this function would have to behave exactly as uprv_strCompare()
1432      */
1433     if(U_FAILURE(*pErrorCode)) {
1434         return 0;
1435     }
1436 
1437     /* initialize */
1438     if(matchLen1) {
1439         U_ASSERT(matchLen2 !=NULL);
1440         *matchLen1=0;
1441         *matchLen2=0;
1442     }
1443 
1444     start1=m1=org1=s1;
1445     if(length1==-1) {
1446         limit1=NULL;
1447     } else {
1448         limit1=s1+length1;
1449     }
1450 
1451     start2=m2=org2=s2;
1452     if(length2==-1) {
1453         limit2=NULL;
1454     } else {
1455         limit2=s2+length2;
1456     }
1457 
1458     level1=level2=0;
1459     c1=c2=-1;
1460 
1461     /* comparison loop */
1462     for(;;) {
1463         /*
1464          * here a code unit value of -1 means "get another code unit"
1465          * below it will mean "this source is finished"
1466          */
1467 
1468         if(c1<0) {
1469             /* get next code unit from string 1, post-increment */
1470             for(;;) {
1471                 if(s1==limit1 || ((c1=*s1)==0 && (limit1==NULL || (options&_STRNCMP_STYLE)))) {
1472                     if(level1==0) {
1473                         c1=-1;
1474                         break;
1475                     }
1476                 } else {
1477                     ++s1;
1478                     break;
1479                 }
1480 
1481                 /* reached end of level buffer, pop one level */
1482                 do {
1483                     --level1;
1484                     start1=stack1[level1].start;    /*Not uninitialized*/
1485                 } while(start1==NULL);
1486                 s1=stack1[level1].s;                /*Not uninitialized*/
1487                 limit1=stack1[level1].limit;        /*Not uninitialized*/
1488             }
1489         }
1490 
1491         if(c2<0) {
1492             /* get next code unit from string 2, post-increment */
1493             for(;;) {
1494                 if(s2==limit2 || ((c2=*s2)==0 && (limit2==NULL || (options&_STRNCMP_STYLE)))) {
1495                     if(level2==0) {
1496                         c2=-1;
1497                         break;
1498                     }
1499                 } else {
1500                     ++s2;
1501                     break;
1502                 }
1503 
1504                 /* reached end of level buffer, pop one level */
1505                 do {
1506                     --level2;
1507                     start2=stack2[level2].start;    /*Not uninitialized*/
1508                 } while(start2==NULL);
1509                 s2=stack2[level2].s;                /*Not uninitialized*/
1510                 limit2=stack2[level2].limit;        /*Not uninitialized*/
1511             }
1512         }
1513 
1514         /*
1515          * compare c1 and c2
1516          * either variable c1, c2 is -1 only if the corresponding string is finished
1517          */
1518         if(c1==c2) {
1519             const UChar *next1, *next2;
1520 
1521             if(c1<0) {
1522                 cmpRes=0;   /* c1==c2==-1 indicating end of strings */
1523                 break;
1524             }
1525 
1526             /*
1527              * Note: Move the match positions in both strings at the same time
1528              *      only when corresponding code point(s) in the original strings
1529              *      are fully consumed. For example, when comparing s1="Fust" and
1530              *      s2="Fu\u00dfball", s2[2] is folded into "ss", and s1[2] matches
1531              *      the first code point in the case-folded data. But the second "s"
1532              *      has no matching code point in s1, so this implementation returns
1533              *      2 as the prefix match length ("Fu").
1534              */
1535             next1=next2=NULL;
1536             if(level1==0) {
1537                 next1=s1;
1538             } else if(s1==limit1) {
1539                 /* Note: This implementation only use a single level of stack.
1540                  *      If this code needs to be changed to use multiple levels
1541                  *      of stacks, the code above should check if the current
1542                  *      code is at the end of all stacks.
1543                  */
1544                 U_ASSERT(level1==1);
1545 
1546                 /* is s1 at the end of the current stack? */
1547                 next1=stack1[0].s;
1548             }
1549 
1550             if (next1!=NULL) {
1551                 if(level2==0) {
1552                     next2=s2;
1553                 } else if(s2==limit2) {
1554                     U_ASSERT(level2==1);
1555 
1556                     /* is s2 at the end of the current stack? */
1557                     next2=stack2[0].s;
1558                 }
1559                 if(next2!=NULL) {
1560                     m1=next1;
1561                     m2=next2;
1562                 }
1563             }
1564             c1=c2=-1;       /* make us fetch new code units */
1565             continue;
1566         } else if(c1<0) {
1567             cmpRes=-1;      /* string 1 ends before string 2 */
1568             break;
1569         } else if(c2<0) {
1570             cmpRes=1;       /* string 2 ends before string 1 */
1571             break;
1572         }
1573         /* c1!=c2 && c1>=0 && c2>=0 */
1574 
1575         /* get complete code points for c1, c2 for lookups if either is a surrogate */
1576         cp1=c1;
1577         if(U_IS_SURROGATE(c1)) {
1578             UChar c;
1579 
1580             if(U_IS_SURROGATE_LEAD(c1)) {
1581                 if(s1!=limit1 && U16_IS_TRAIL(c=*s1)) {
1582                     /* advance ++s1; only below if cp1 decomposes/case-folds */
1583                     cp1=U16_GET_SUPPLEMENTARY(c1, c);
1584                 }
1585             } else /* isTrail(c1) */ {
1586                 if(start1<=(s1-2) && U16_IS_LEAD(c=*(s1-2))) {
1587                     cp1=U16_GET_SUPPLEMENTARY(c, c1);
1588                 }
1589             }
1590         }
1591 
1592         cp2=c2;
1593         if(U_IS_SURROGATE(c2)) {
1594             UChar c;
1595 
1596             if(U_IS_SURROGATE_LEAD(c2)) {
1597                 if(s2!=limit2 && U16_IS_TRAIL(c=*s2)) {
1598                     /* advance ++s2; only below if cp2 decomposes/case-folds */
1599                     cp2=U16_GET_SUPPLEMENTARY(c2, c);
1600                 }
1601             } else /* isTrail(c2) */ {
1602                 if(start2<=(s2-2) && U16_IS_LEAD(c=*(s2-2))) {
1603                     cp2=U16_GET_SUPPLEMENTARY(c, c2);
1604                 }
1605             }
1606         }
1607 
1608         /*
1609          * go down one level for each string
1610          * continue with the main loop as soon as there is a real change
1611          */
1612 
1613         if( level1==0 &&
1614             (length=ucase_toFullFolding((UChar32)cp1, &p, options))>=0
1615         ) {
1616             /* cp1 case-folds to the code point "length" or to p[length] */
1617             if(U_IS_SURROGATE(c1)) {
1618                 if(U_IS_SURROGATE_LEAD(c1)) {
1619                     /* advance beyond source surrogate pair if it case-folds */
1620                     ++s1;
1621                 } else /* isTrail(c1) */ {
1622                     /*
1623                      * we got a supplementary code point when hitting its trail surrogate,
1624                      * therefore the lead surrogate must have been the same as in the other string;
1625                      * compare this decomposition with the lead surrogate in the other string
1626                      * remember that this simulates bulk text replacement:
1627                      * the decomposition would replace the entire code point
1628                      */
1629                     --s2;
1630                     --m2;
1631                     c2=*(s2-1);
1632                 }
1633             }
1634 
1635             /* push current level pointers */
1636             stack1[0].start=start1;
1637             stack1[0].s=s1;
1638             stack1[0].limit=limit1;
1639             ++level1;
1640 
1641             /* copy the folding result to fold1[] */
1642             if(length<=UCASE_MAX_STRING_LENGTH) {
1643                 u_memcpy(fold1, p, length);
1644             } else {
1645                 int32_t i=0;
1646                 U16_APPEND_UNSAFE(fold1, i, length);
1647                 length=i;
1648             }
1649 
1650             /* set next level pointers to case folding */
1651             start1=s1=fold1;
1652             limit1=fold1+length;
1653 
1654             /* get ready to read from decomposition, continue with loop */
1655             c1=-1;
1656             continue;
1657         }
1658 
1659         if( level2==0 &&
1660             (length=ucase_toFullFolding((UChar32)cp2, &p, options))>=0
1661         ) {
1662             /* cp2 case-folds to the code point "length" or to p[length] */
1663             if(U_IS_SURROGATE(c2)) {
1664                 if(U_IS_SURROGATE_LEAD(c2)) {
1665                     /* advance beyond source surrogate pair if it case-folds */
1666                     ++s2;
1667                 } else /* isTrail(c2) */ {
1668                     /*
1669                      * we got a supplementary code point when hitting its trail surrogate,
1670                      * therefore the lead surrogate must have been the same as in the other string;
1671                      * compare this decomposition with the lead surrogate in the other string
1672                      * remember that this simulates bulk text replacement:
1673                      * the decomposition would replace the entire code point
1674                      */
1675                     --s1;
1676                     --m2;
1677                     c1=*(s1-1);
1678                 }
1679             }
1680 
1681             /* push current level pointers */
1682             stack2[0].start=start2;
1683             stack2[0].s=s2;
1684             stack2[0].limit=limit2;
1685             ++level2;
1686 
1687             /* copy the folding result to fold2[] */
1688             if(length<=UCASE_MAX_STRING_LENGTH) {
1689                 u_memcpy(fold2, p, length);
1690             } else {
1691                 int32_t i=0;
1692                 U16_APPEND_UNSAFE(fold2, i, length);
1693                 length=i;
1694             }
1695 
1696             /* set next level pointers to case folding */
1697             start2=s2=fold2;
1698             limit2=fold2+length;
1699 
1700             /* get ready to read from decomposition, continue with loop */
1701             c2=-1;
1702             continue;
1703         }
1704 
1705         /*
1706          * no decomposition/case folding, max level for both sides:
1707          * return difference result
1708          *
1709          * code point order comparison must not just return cp1-cp2
1710          * because when single surrogates are present then the surrogate pairs
1711          * that formed cp1 and cp2 may be from different string indexes
1712          *
1713          * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
1714          * c1=d800 cp1=10001 c2=dc00 cp2=10000
1715          * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
1716          *
1717          * therefore, use same fix-up as in ustring.c/uprv_strCompare()
1718          * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++
1719          * so we have slightly different pointer/start/limit comparisons here
1720          */
1721 
1722         if(c1>=0xd800 && c2>=0xd800 && (options&U_COMPARE_CODE_POINT_ORDER)) {
1723             /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
1724             if(
1725                 (c1<=0xdbff && s1!=limit1 && U16_IS_TRAIL(*s1)) ||
1726                 (U16_IS_TRAIL(c1) && start1!=(s1-1) && U16_IS_LEAD(*(s1-2)))
1727             ) {
1728                 /* part of a surrogate pair, leave >=d800 */
1729             } else {
1730                 /* BMP code point - may be surrogate code point - make <d800 */
1731                 c1-=0x2800;
1732             }
1733 
1734             if(
1735                 (c2<=0xdbff && s2!=limit2 && U16_IS_TRAIL(*s2)) ||
1736                 (U16_IS_TRAIL(c2) && start2!=(s2-1) && U16_IS_LEAD(*(s2-2)))
1737             ) {
1738                 /* part of a surrogate pair, leave >=d800 */
1739             } else {
1740                 /* BMP code point - may be surrogate code point - make <d800 */
1741                 c2-=0x2800;
1742             }
1743         }
1744 
1745         cmpRes=c1-c2;
1746         break;
1747     }
1748 
1749     if(matchLen1) {
1750         *matchLen1=static_cast<int32_t>(m1-org1);
1751         *matchLen2=static_cast<int32_t>(m2-org2);
1752     }
1753     return cmpRes;
1754 }
1755 
1756 /* internal function */
1757 U_CFUNC int32_t
u_strcmpFold(const UChar * s1,int32_t length1,const UChar * s2,int32_t length2,uint32_t options,UErrorCode * pErrorCode)1758 u_strcmpFold(const UChar *s1, int32_t length1,
1759              const UChar *s2, int32_t length2,
1760              uint32_t options,
1761              UErrorCode *pErrorCode) {
1762     return _cmpFold(s1, length1, s2, length2, options, NULL, NULL, pErrorCode);
1763 }
1764 
1765 /* public API functions */
1766 
1767 U_CAPI int32_t U_EXPORT2
u_strCaseCompare(const UChar * s1,int32_t length1,const UChar * s2,int32_t length2,uint32_t options,UErrorCode * pErrorCode)1768 u_strCaseCompare(const UChar *s1, int32_t length1,
1769                  const UChar *s2, int32_t length2,
1770                  uint32_t options,
1771                  UErrorCode *pErrorCode) {
1772     /* argument checking */
1773     if(pErrorCode==0 || U_FAILURE(*pErrorCode)) {
1774         return 0;
1775     }
1776     if(s1==NULL || length1<-1 || s2==NULL || length2<-1) {
1777         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1778         return 0;
1779     }
1780     return u_strcmpFold(s1, length1, s2, length2,
1781                         options|U_COMPARE_IGNORE_CASE,
1782                         pErrorCode);
1783 }
1784 
1785 U_CAPI int32_t U_EXPORT2
u_strcasecmp(const UChar * s1,const UChar * s2,uint32_t options)1786 u_strcasecmp(const UChar *s1, const UChar *s2, uint32_t options) {
1787     UErrorCode errorCode=U_ZERO_ERROR;
1788     return u_strcmpFold(s1, -1, s2, -1,
1789                         options|U_COMPARE_IGNORE_CASE,
1790                         &errorCode);
1791 }
1792 
1793 U_CAPI int32_t U_EXPORT2
u_memcasecmp(const UChar * s1,const UChar * s2,int32_t length,uint32_t options)1794 u_memcasecmp(const UChar *s1, const UChar *s2, int32_t length, uint32_t options) {
1795     UErrorCode errorCode=U_ZERO_ERROR;
1796     return u_strcmpFold(s1, length, s2, length,
1797                         options|U_COMPARE_IGNORE_CASE,
1798                         &errorCode);
1799 }
1800 
1801 U_CAPI int32_t U_EXPORT2
u_strncasecmp(const UChar * s1,const UChar * s2,int32_t n,uint32_t options)1802 u_strncasecmp(const UChar *s1, const UChar *s2, int32_t n, uint32_t options) {
1803     UErrorCode errorCode=U_ZERO_ERROR;
1804     return u_strcmpFold(s1, n, s2, n,
1805                         options|(U_COMPARE_IGNORE_CASE|_STRNCMP_STYLE),
1806                         &errorCode);
1807 }
1808 
1809 /* internal API - detect length of shared prefix */
1810 U_CAPI void
u_caseInsensitivePrefixMatch(const UChar * s1,int32_t length1,const UChar * s2,int32_t length2,uint32_t options,int32_t * matchLen1,int32_t * matchLen2,UErrorCode * pErrorCode)1811 u_caseInsensitivePrefixMatch(const UChar *s1, int32_t length1,
1812                              const UChar *s2, int32_t length2,
1813                              uint32_t options,
1814                              int32_t *matchLen1, int32_t *matchLen2,
1815                              UErrorCode *pErrorCode) {
1816     _cmpFold(s1, length1, s2, length2, options,
1817         matchLen1, matchLen2, pErrorCode);
1818 }
1819