1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 2001-2015, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: ustrcase.cpp
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2002feb20
16 * created by: Markus W. Scherer
17 *
18 * Implementation file for string casing C API functions.
19 * Uses functions from uchar.c for basic functionality that requires access
20 * to the Unicode Character Database (uprops.dat).
21 */
22
23 #include "unicode/utypes.h"
24 #include "unicode/brkiter.h"
25 #include "unicode/casemap.h"
26 #include "unicode/edits.h"
27 #include "unicode/stringoptions.h"
28 #include "unicode/ustring.h"
29 #include "unicode/ucasemap.h"
30 #include "unicode/ubrk.h"
31 #include "unicode/utf.h"
32 #include "unicode/utf16.h"
33 #include "cmemory.h"
34 #include "ucase.h"
35 #include "ucasemap_imp.h"
36 #include "ustr_imp.h"
37 #include "uassert.h"
38
39 U_NAMESPACE_BEGIN
40
41 namespace {
42
checkOverflowAndEditsError(int32_t destIndex,int32_t destCapacity,Edits * edits,UErrorCode & errorCode)43 int32_t checkOverflowAndEditsError(int32_t destIndex, int32_t destCapacity,
44 Edits *edits, UErrorCode &errorCode) {
45 if (U_SUCCESS(errorCode)) {
46 if (destIndex > destCapacity) {
47 errorCode = U_BUFFER_OVERFLOW_ERROR;
48 } else if (edits != NULL) {
49 edits->copyErrorTo(errorCode);
50 }
51 }
52 return destIndex;
53 }
54
55 } // namespace
56
57 U_NAMESPACE_END
58
59 U_NAMESPACE_USE
60
61 /* string casing ------------------------------------------------------------ */
62
63 /* Appends a full case mapping result, see UCASE_MAX_STRING_LENGTH. */
64 static inline int32_t
appendResult(UChar * dest,int32_t destIndex,int32_t destCapacity,int32_t result,const UChar * s,int32_t cpLength,uint32_t options,icu::Edits * edits)65 appendResult(UChar *dest, int32_t destIndex, int32_t destCapacity,
66 int32_t result, const UChar *s,
67 int32_t cpLength, uint32_t options, icu::Edits *edits) {
68 UChar32 c;
69 int32_t length;
70
71 /* decode the result */
72 if(result<0) {
73 /* (not) original code point */
74 if(edits!=NULL) {
75 edits->addUnchanged(cpLength);
76 }
77 if(options & U_OMIT_UNCHANGED_TEXT) {
78 return destIndex;
79 }
80 c=~result;
81 if(destIndex<destCapacity && c<=0xffff) { // BMP slightly-fastpath
82 dest[destIndex++]=(UChar)c;
83 return destIndex;
84 }
85 length=cpLength;
86 } else {
87 if(result<=UCASE_MAX_STRING_LENGTH) {
88 c=U_SENTINEL;
89 length=result;
90 } else if(destIndex<destCapacity && result<=0xffff) { // BMP slightly-fastpath
91 dest[destIndex++]=(UChar)result;
92 if(edits!=NULL) {
93 edits->addReplace(cpLength, 1);
94 }
95 return destIndex;
96 } else {
97 c=result;
98 length=U16_LENGTH(c);
99 }
100 if(edits!=NULL) {
101 edits->addReplace(cpLength, length);
102 }
103 }
104 if(length>(INT32_MAX-destIndex)) {
105 return -1; // integer overflow
106 }
107
108 if(destIndex<destCapacity) {
109 /* append the result */
110 if(c>=0) {
111 /* code point */
112 UBool isError=FALSE;
113 U16_APPEND(dest, destIndex, destCapacity, c, isError);
114 if(isError) {
115 /* overflow, nothing written */
116 destIndex+=length;
117 }
118 } else {
119 /* string */
120 if((destIndex+length)<=destCapacity) {
121 while(length>0) {
122 dest[destIndex++]=*s++;
123 --length;
124 }
125 } else {
126 /* overflow */
127 destIndex+=length;
128 }
129 }
130 } else {
131 /* preflight */
132 destIndex+=length;
133 }
134 return destIndex;
135 }
136
137 static inline int32_t
appendUChar(UChar * dest,int32_t destIndex,int32_t destCapacity,UChar c)138 appendUChar(UChar *dest, int32_t destIndex, int32_t destCapacity, UChar c) {
139 if(destIndex<destCapacity) {
140 dest[destIndex]=c;
141 } else if(destIndex==INT32_MAX) {
142 return -1; // integer overflow
143 }
144 return destIndex+1;
145 }
146
147 static inline int32_t
appendUnchanged(UChar * dest,int32_t destIndex,int32_t destCapacity,const UChar * s,int32_t length,uint32_t options,icu::Edits * edits)148 appendUnchanged(UChar *dest, int32_t destIndex, int32_t destCapacity,
149 const UChar *s, int32_t length, uint32_t options, icu::Edits *edits) {
150 if(length>0) {
151 if(edits!=NULL) {
152 edits->addUnchanged(length);
153 }
154 if(options & U_OMIT_UNCHANGED_TEXT) {
155 return destIndex;
156 }
157 if(length>(INT32_MAX-destIndex)) {
158 return -1; // integer overflow
159 }
160 if((destIndex+length)<=destCapacity) {
161 u_memcpy(dest+destIndex, s, length);
162 }
163 destIndex+=length;
164 }
165 return destIndex;
166 }
167
168 static UChar32 U_CALLCONV
utf16_caseContextIterator(void * context,int8_t dir)169 utf16_caseContextIterator(void *context, int8_t dir) {
170 UCaseContext *csc=(UCaseContext *)context;
171 UChar32 c;
172
173 if(dir<0) {
174 /* reset for backward iteration */
175 csc->index=csc->cpStart;
176 csc->dir=dir;
177 } else if(dir>0) {
178 /* reset for forward iteration */
179 csc->index=csc->cpLimit;
180 csc->dir=dir;
181 } else {
182 /* continue current iteration direction */
183 dir=csc->dir;
184 }
185
186 if(dir<0) {
187 if(csc->start<csc->index) {
188 U16_PREV((const UChar *)csc->p, csc->start, csc->index, c);
189 return c;
190 }
191 } else {
192 if(csc->index<csc->limit) {
193 U16_NEXT((const UChar *)csc->p, csc->index, csc->limit, c);
194 return c;
195 }
196 }
197 return U_SENTINEL;
198 }
199
200 /*
201 * Case-maps [srcStart..srcLimit[ but takes
202 * context [0..srcLength[ into account.
203 */
204 static int32_t
_caseMap(int32_t caseLocale,uint32_t options,UCaseMapFull * map,UChar * dest,int32_t destCapacity,const UChar * src,UCaseContext * csc,int32_t srcStart,int32_t srcLimit,icu::Edits * edits,UErrorCode & errorCode)205 _caseMap(int32_t caseLocale, uint32_t options, UCaseMapFull *map,
206 UChar *dest, int32_t destCapacity,
207 const UChar *src, UCaseContext *csc,
208 int32_t srcStart, int32_t srcLimit,
209 icu::Edits *edits,
210 UErrorCode &errorCode) {
211 /* case mapping loop */
212 int32_t srcIndex=srcStart;
213 int32_t destIndex=0;
214 while(srcIndex<srcLimit) {
215 int32_t cpStart;
216 csc->cpStart=cpStart=srcIndex;
217 UChar32 c;
218 U16_NEXT(src, srcIndex, srcLimit, c);
219 csc->cpLimit=srcIndex;
220 const UChar *s;
221 c=map(c, utf16_caseContextIterator, csc, &s, caseLocale);
222 destIndex = appendResult(dest, destIndex, destCapacity, c, s,
223 srcIndex - cpStart, options, edits);
224 if (destIndex < 0) {
225 errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
226 return 0;
227 }
228 }
229
230 return destIndex;
231 }
232
233 #if !UCONFIG_NO_BREAK_ITERATION
234
235 U_CFUNC int32_t U_CALLCONV
ustrcase_internalToTitle(int32_t caseLocale,uint32_t options,BreakIterator * iter,UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,icu::Edits * edits,UErrorCode & errorCode)236 ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *iter,
237 UChar *dest, int32_t destCapacity,
238 const UChar *src, int32_t srcLength,
239 icu::Edits *edits,
240 UErrorCode &errorCode) {
241 if (!ustrcase_checkTitleAdjustmentOptions(options, errorCode)) {
242 return 0;
243 }
244
245 /* set up local variables */
246 UCaseContext csc=UCASECONTEXT_INITIALIZER;
247 csc.p=(void *)src;
248 csc.limit=srcLength;
249 int32_t destIndex=0;
250 int32_t prev=0;
251 UBool isFirstIndex=TRUE;
252
253 /* titlecasing loop */
254 while(prev<srcLength) {
255 /* find next index where to titlecase */
256 int32_t index;
257 if(isFirstIndex) {
258 isFirstIndex=FALSE;
259 index=iter->first();
260 } else {
261 index=iter->next();
262 }
263 if(index==UBRK_DONE || index>srcLength) {
264 index=srcLength;
265 }
266
267 /*
268 * Segment [prev..index[ into 3 parts:
269 * a) skipped characters (copy as-is) [prev..titleStart[
270 * b) first letter (titlecase) [titleStart..titleLimit[
271 * c) subsequent characters (lowercase) [titleLimit..index[
272 */
273 if(prev<index) {
274 // Find and copy skipped characters [prev..titleStart[
275 int32_t titleStart=prev;
276 int32_t titleLimit=prev;
277 UChar32 c;
278 U16_NEXT(src, titleLimit, index, c);
279 if ((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0) {
280 // Adjust the titlecasing index to the next cased character,
281 // or to the next letter/number/symbol/private use.
282 // Stop with titleStart<titleLimit<=index
283 // if there is a character to be titlecased,
284 // or else stop with titleStart==titleLimit==index.
285 UBool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0;
286 while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) {
287 titleStart=titleLimit;
288 if(titleLimit==index) {
289 break;
290 }
291 U16_NEXT(src, titleLimit, index, c);
292 }
293 if (prev < titleStart) {
294 destIndex=appendUnchanged(dest, destIndex, destCapacity,
295 src+prev, titleStart-prev, options, edits);
296 if(destIndex<0) {
297 errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
298 return 0;
299 }
300 }
301 }
302
303 if(titleStart<titleLimit) {
304 /* titlecase c which is from [titleStart..titleLimit[ */
305 csc.cpStart=titleStart;
306 csc.cpLimit=titleLimit;
307 const UChar *s;
308 c=ucase_toFullTitle(c, utf16_caseContextIterator, &csc, &s, caseLocale);
309 destIndex=appendResult(dest, destIndex, destCapacity, c, s,
310 titleLimit-titleStart, options, edits);
311 if(destIndex<0) {
312 errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
313 return 0;
314 }
315
316 /* Special case Dutch IJ titlecasing */
317 if (titleStart+1 < index &&
318 caseLocale == UCASE_LOC_DUTCH &&
319 (src[titleStart] == 0x0049 || src[titleStart] == 0x0069)) {
320 if (src[titleStart+1] == 0x006A) {
321 destIndex=appendUChar(dest, destIndex, destCapacity, 0x004A);
322 if(destIndex<0) {
323 errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
324 return 0;
325 }
326 if(edits!=NULL) {
327 edits->addReplace(1, 1);
328 }
329 titleLimit++;
330 } else if (src[titleStart+1] == 0x004A) {
331 // Keep the capital J from getting lowercased.
332 destIndex=appendUnchanged(dest, destIndex, destCapacity,
333 src+titleStart+1, 1, options, edits);
334 if(destIndex<0) {
335 errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
336 return 0;
337 }
338 titleLimit++;
339 }
340 }
341
342 /* lowercase [titleLimit..index[ */
343 if(titleLimit<index) {
344 if((options&U_TITLECASE_NO_LOWERCASE)==0) {
345 /* Normal operation: Lowercase the rest of the word. */
346 destIndex+=
347 _caseMap(
348 caseLocale, options, ucase_toFullLower,
349 dest+destIndex, destCapacity-destIndex,
350 src, &csc,
351 titleLimit, index,
352 edits, errorCode);
353 if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
354 errorCode=U_ZERO_ERROR;
355 }
356 if(U_FAILURE(errorCode)) {
357 return destIndex;
358 }
359 } else {
360 /* Optionally just copy the rest of the word unchanged. */
361 destIndex=appendUnchanged(dest, destIndex, destCapacity,
362 src+titleLimit, index-titleLimit, options, edits);
363 if(destIndex<0) {
364 errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
365 return 0;
366 }
367 }
368 }
369 }
370 }
371
372 prev=index;
373 }
374
375 return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
376 }
377
378 #endif // !UCONFIG_NO_BREAK_ITERATION
379
380 U_NAMESPACE_BEGIN
381 namespace GreekUpper {
382
// Data generated by prototype code, see
// http://site.icu-project.org/design/case/greek-upper
// TODO: Move this data into ucase.icu.
//
// Per-code-point data for the Greek block; getLetterData() reads
// data0370[c - 0x370]. A non-zero entry combines a base uppercase code point
// (extracted by toUpper() with UPPER_MASK) with HAS_* flag bits.
// A 0 entry means there is no Greek-specific data for that code point.
static const uint16_t data0370[] = {
    // U+0370..03FF
    // U+0370..037F
    0x0370,
    0x0370,
    0x0372,
    0x0372,
    0,
    0,
    0x0376,
    0x0376,
    0,
    0,
    0x037A,
    0x03FD,
    0x03FE,
    0x03FF,
    0,
    0x037F,
    // U+0380..038F
    0,
    0,
    0,
    0,
    0,
    0,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    // U+0390..039F
    0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    0x0391 | HAS_VOWEL,
    0x0392,
    0x0393,
    0x0394,
    0x0395 | HAS_VOWEL,
    0x0396,
    0x0397 | HAS_VOWEL,
    0x0398,
    0x0399 | HAS_VOWEL,
    0x039A,
    0x039B,
    0x039C,
    0x039D,
    0x039E,
    0x039F | HAS_VOWEL,
    // U+03A0..03AF
    0x03A0,
    0x03A1,
    0,
    0x03A3,
    0x03A4,
    0x03A5 | HAS_VOWEL,
    0x03A6,
    0x03A7,
    0x03A8,
    0x03A9 | HAS_VOWEL,
    0x0399 | HAS_VOWEL | HAS_DIALYTIKA,
    0x03A5 | HAS_VOWEL | HAS_DIALYTIKA,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    // U+03B0..03BF
    0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    0x0391 | HAS_VOWEL,
    0x0392,
    0x0393,
    0x0394,
    0x0395 | HAS_VOWEL,
    0x0396,
    0x0397 | HAS_VOWEL,
    0x0398,
    0x0399 | HAS_VOWEL,
    0x039A,
    0x039B,
    0x039C,
    0x039D,
    0x039E,
    0x039F | HAS_VOWEL,
    // U+03C0..03CF
    0x03A0,
    0x03A1,
    0x03A3,
    0x03A3,
    0x03A4,
    0x03A5 | HAS_VOWEL,
    0x03A6,
    0x03A7,
    0x03A8,
    0x03A9 | HAS_VOWEL,
    0x0399 | HAS_VOWEL | HAS_DIALYTIKA,
    0x03A5 | HAS_VOWEL | HAS_DIALYTIKA,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03CF,
    // U+03D0..03DF
    0x0392,
    0x0398,
    0x03D2,
    0x03D2 | HAS_ACCENT,
    0x03D2 | HAS_DIALYTIKA,
    0x03A6,
    0x03A0,
    0x03CF,
    0x03D8,
    0x03D8,
    0x03DA,
    0x03DA,
    0x03DC,
    0x03DC,
    0x03DE,
    0x03DE,
    // U+03E0..03EF
    0x03E0,
    0x03E0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    // U+03F0..03FF
    0x039A,
    0x03A1,
    0x03F9,
    0x037F,
    0x03F4,
    0x0395 | HAS_VOWEL,
    0,
    0x03F7,
    0x03F7,
    0x03F9,
    0x03FA,
    0x03FA,
    0x03FC,
    0x03FD,
    0x03FE,
    0x03FF,
};
533
// Per-code-point data for U+1F00..1FFF; getLetterData() reads
// data1F00[c - 0x1f00]. Entries have the same layout as in data0370:
// a base uppercase code point combined with HAS_* flag bits, or 0 for no data.
static const uint16_t data1F00[] = {
    // U+1F00..1FFF
    // U+1F00..1F0F
    0x0391 | HAS_VOWEL,
    0x0391 | HAS_VOWEL,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL,
    0x0391 | HAS_VOWEL,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    // U+1F10..1F1F
    0x0395 | HAS_VOWEL,
    0x0395 | HAS_VOWEL,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0,
    0,
    0x0395 | HAS_VOWEL,
    0x0395 | HAS_VOWEL,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0,
    0,
    // U+1F20..1F2F
    0x0397 | HAS_VOWEL,
    0x0397 | HAS_VOWEL,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL,
    0x0397 | HAS_VOWEL,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    // U+1F30..1F3F
    0x0399 | HAS_VOWEL,
    0x0399 | HAS_VOWEL,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL,
    0x0399 | HAS_VOWEL,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    // U+1F40..1F4F
    0x039F | HAS_VOWEL,
    0x039F | HAS_VOWEL,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0,
    0,
    0x039F | HAS_VOWEL,
    0x039F | HAS_VOWEL,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0,
    0,
    // U+1F50..1F5F
    0x03A5 | HAS_VOWEL,
    0x03A5 | HAS_VOWEL,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0,
    0x03A5 | HAS_VOWEL,
    0,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    // U+1F60..1F6F
    0x03A9 | HAS_VOWEL,
    0x03A9 | HAS_VOWEL,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL,
    0x03A9 | HAS_VOWEL,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    // U+1F70..1F7F
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0,
    0,
    // U+1F80..1F8F
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    // U+1F90..1F9F
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    // U+1FA0..1FAF
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    // U+1FB0..1FBF
    0x0391 | HAS_VOWEL,
    0x0391 | HAS_VOWEL,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL,
    0x0391 | HAS_VOWEL,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0,
    0x0399 | HAS_VOWEL,
    0,
    // U+1FC0..1FCF
    0,
    0,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0,
    0,
    0,
    // U+1FD0..1FDF
    0x0399 | HAS_VOWEL,
    0x0399 | HAS_VOWEL,
    0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    0,
    0,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    0x0399 | HAS_VOWEL,
    0x0399 | HAS_VOWEL,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0,
    0,
    0,
    0,
    // U+1FE0..1FEF
    0x03A5 | HAS_VOWEL,
    0x03A5 | HAS_VOWEL,
    0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    0x03A1,
    0x03A1,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    0x03A5 | HAS_VOWEL,
    0x03A5 | HAS_VOWEL,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A1,
    0,
    0,
    0,
    // U+1FF0..1FFF
    0,
    0,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0,
    0,
    0,
};
793
// U+2126 Ohm sign
// Single entry returned by getLetterData() for c == 0x2126:
// base code point U+03A9 with the HAS_VOWEL flag.
static const uint16_t data2126 = 0x03A9 | HAS_VOWEL;
796
getLetterData(UChar32 c)797 uint32_t getLetterData(UChar32 c) {
798 if (c < 0x370 || 0x2126 < c || (0x3ff < c && c < 0x1f00)) {
799 return 0;
800 } else if (c <= 0x3ff) {
801 return data0370[c - 0x370];
802 } else if (c <= 0x1fff) {
803 return data1F00[c - 0x1f00];
804 } else if (c == 0x2126) {
805 return data2126;
806 } else {
807 return 0;
808 }
809 }
810
getDiacriticData(UChar32 c)811 uint32_t getDiacriticData(UChar32 c) {
812 switch (c) {
813 case 0x0300: // varia
814 case 0x0301: // tonos = oxia
815 case 0x0342: // perispomeni
816 case 0x0302: // circumflex can look like perispomeni
817 case 0x0303: // tilde can look like perispomeni
818 case 0x0311: // inverted breve can look like perispomeni
819 return HAS_ACCENT;
820 case 0x0308: // dialytika = diaeresis
821 return HAS_COMBINING_DIALYTIKA;
822 case 0x0344: // dialytika tonos
823 return HAS_COMBINING_DIALYTIKA | HAS_ACCENT;
824 case 0x0345: // ypogegrammeni = iota subscript
825 return HAS_YPOGEGRAMMENI;
826 case 0x0304: // macron
827 case 0x0306: // breve
828 case 0x0313: // comma above
829 case 0x0314: // reversed comma above
830 case 0x0343: // koronis
831 return HAS_OTHER_GREEK_DIACRITIC;
832 default:
833 return 0;
834 }
835 }
836
isFollowedByCasedLetter(const UChar * s,int32_t i,int32_t length)837 UBool isFollowedByCasedLetter(const UChar *s, int32_t i, int32_t length) {
838 while (i < length) {
839 UChar32 c;
840 U16_NEXT(s, i, length, c);
841 int32_t type = ucase_getTypeOrIgnorable(c);
842 if ((type & UCASE_IGNORABLE) != 0) {
843 // Case-ignorable, continue with the loop.
844 } else if (type != UCASE_NONE) {
845 return TRUE; // Followed by cased letter.
846 } else {
847 return FALSE; // Uncased and not case-ignorable.
848 }
849 }
850 return FALSE; // Not followed by cased letter.
851 }
852
/**
 * Greek string uppercasing with a state machine.
 * Probably simpler than a stateless function that has to figure out complex context-before
 * for each character.
 * TODO: Try to re-consolidate one way or another with the non-Greek function.
 *
 * Code points with Greek letter data (see getLetterData()) are handled together with
 * the Greek combining diacritics that directly follow them; every other code point is
 * mapped with ucase_toFullUpper() and appended via appendResult().
 *
 * @param options      Option bits; U_OMIT_UNCHANGED_TEXT is honored.
 * @param dest         Destination buffer.
 * @param destCapacity Capacity of dest; output beyond it is counted but not written.
 * @param src          Source text.
 * @param srcLength    Length of src in code units.
 * @param edits        Optional Edits recorder, may be NULL.
 * @param errorCode    Set to U_INDEX_OUTOFBOUNDS_ERROR if the output index
 *                     would overflow int32_t.
 * @return The output length (possibly greater than destCapacity; this function does
 *         not set a buffer overflow error itself), or 0 on integer overflow.
 */
int32_t toUpper(uint32_t options,
                UChar *dest, int32_t destCapacity,
                const UChar *src, int32_t srcLength,
                Edits *edits,
                UErrorCode &errorCode) {
    int32_t destIndex=0;
    // State carried over from the previous character:
    // AFTER_CASED and/or AFTER_VOWEL_WITH_ACCENT.
    uint32_t state = 0;
    for (int32_t i = 0; i < srcLength;) {
        // [i..nextIndex[ is the span consumed in this iteration.
        int32_t nextIndex = i;
        UChar32 c;
        U16_NEXT(src, nextIndex, srcLength, c);
        uint32_t nextState = 0;
        int32_t type = ucase_getTypeOrIgnorable(c);
        if ((type & UCASE_IGNORABLE) != 0) {
            // c is case-ignorable
            nextState |= (state & AFTER_CASED);
        } else if (type != UCASE_NONE) {
            // c is cased
            nextState |= AFTER_CASED;
        }
        uint32_t data = getLetterData(c);
        if (data > 0) {
            uint32_t upper = data & UPPER_MASK;
            // Add a dialytika to this iota or ypsilon vowel
            // if we removed a tonos from the previous vowel,
            // and that previous vowel did not also have (or gain) a dialytika.
            // Adding one only to the final vowel in a longer sequence
            // (which does not occur in normal writing) would require lookahead.
            // Set the same flag as for preserving an existing dialytika.
            if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 &&
                    (upper == 0x399 || upper == 0x3A5)) {
                data |= HAS_DIALYTIKA;
            }
            int32_t numYpogegrammeni = 0;  // Map each one to a trailing, spacing, capital iota.
            if ((data & HAS_YPOGEGRAMMENI) != 0) {
                numYpogegrammeni = 1;
            }
            // Skip combining diacritics after this Greek letter.
            // Their flags are merged into data; nextIndex moves past them.
            while (nextIndex < srcLength) {
                uint32_t diacriticData = getDiacriticData(src[nextIndex]);
                if (diacriticData != 0) {
                    data |= diacriticData;
                    if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {
                        ++numYpogegrammeni;
                    }
                    ++nextIndex;
                } else {
                    break;  // not a Greek diacritic
                }
            }
            // Remember an accented vowel without dialytika for the next letter.
            if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
                nextState |= AFTER_VOWEL_WITH_ACCENT;
            }
            // Map according to Greek rules.
            UBool addTonos = FALSE;
            if (upper == 0x397 &&
                    (data & HAS_ACCENT) != 0 &&
                    numYpogegrammeni == 0 &&
                    (state & AFTER_CASED) == 0 &&
                    !isFollowedByCasedLetter(src, nextIndex, srcLength)) {
                // Keep disjunctive "or" with (only) a tonos.
                // We use the same "word boundary" conditions as for the Final_Sigma test.
                // NOTE(review): nextIndex starts at i and has already been advanced
                // past c by U16_NEXT above, so i == nextIndex looks like it can never
                // be true here and addTonos is always chosen — confirm the intended
                // condition for preserving the precomposed form.
                if (i == nextIndex) {
                    upper = 0x389;  // Preserve the precomposed form.
                } else {
                    addTonos = TRUE;
                }
            } else if ((data & HAS_DIALYTIKA) != 0) {
                // Preserve a vowel with dialytika in precomposed form if it exists.
                if (upper == 0x399) {
                    upper = 0x3AA;
                    data &= ~HAS_EITHER_DIALYTIKA;
                } else if (upper == 0x3A5) {
                    upper = 0x3AB;
                    data &= ~HAS_EITHER_DIALYTIKA;
                }
            }

            // From here on, change means "write the mapped text to dest".
            UBool change;
            if (edits == nullptr && (options & U_OMIT_UNCHANGED_TEXT) == 0) {
                change = TRUE;  // common, simple usage
            } else {
                // Find out first whether we are changing the text.
                // i2 walks over the source positions where the dialytika and tonos
                // that will be written would have to be present already.
                change = src[i] != upper || numYpogegrammeni > 0;
                int32_t i2 = i + 1;
                if ((data & HAS_EITHER_DIALYTIKA) != 0) {
                    change |= i2 >= nextIndex || src[i2] != 0x308;
                    ++i2;
                }
                if (addTonos) {
                    change |= i2 >= nextIndex || src[i2] != 0x301;
                    ++i2;
                }
                int32_t oldLength = nextIndex - i;
                int32_t newLength = (i2 - i) + numYpogegrammeni;
                change |= oldLength != newLength;
                if (change) {
                    if (edits != NULL) {
                        edits->addReplace(oldLength, newLength);
                    }
                } else {
                    if (edits != NULL) {
                        edits->addUnchanged(oldLength);
                    }
                    // Write unchanged text?
                    change = (options & U_OMIT_UNCHANGED_TEXT) == 0;
                }
            }

            if (change) {
                // Output order: base letter, dialytika, tonos, then one capital iota
                // per ypogegrammeni. A negative destIndex (integer overflow)
                // skips the remaining appends.
                destIndex=appendUChar(dest, destIndex, destCapacity, (UChar)upper);
                if (destIndex >= 0 && (data & HAS_EITHER_DIALYTIKA) != 0) {
                    destIndex=appendUChar(dest, destIndex, destCapacity, 0x308);  // restore or add a dialytika
                }
                if (destIndex >= 0 && addTonos) {
                    destIndex=appendUChar(dest, destIndex, destCapacity, 0x301);
                }
                while (destIndex >= 0 && numYpogegrammeni > 0) {
                    destIndex=appendUChar(dest, destIndex, destCapacity, 0x399);
                    --numYpogegrammeni;
                }
                if(destIndex<0) {
                    errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
                    return 0;
                }
            }
        } else {
            // No Greek letter data: regular full uppercase mapping,
            // called without a context iterator.
            const UChar *s;
            c=ucase_toFullUpper(c, NULL, NULL, &s, UCASE_LOC_GREEK);
            destIndex = appendResult(dest, destIndex, destCapacity, c, s,
                                     nextIndex - i, options, edits);
            if (destIndex < 0) {
                errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
                return 0;
            }
        }
        i = nextIndex;
        state = nextState;
    }

    return destIndex;
}
1001
1002 } // namespace GreekUpper
1003 U_NAMESPACE_END
1004
1005 /* functions available in the common library (for unistr_case.cpp) */
1006
1007 U_CFUNC int32_t U_CALLCONV
ustrcase_internalToLower(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_UNUSED UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,icu::Edits * edits,UErrorCode & errorCode)1008 ustrcase_internalToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
1009 UChar *dest, int32_t destCapacity,
1010 const UChar *src, int32_t srcLength,
1011 icu::Edits *edits,
1012 UErrorCode &errorCode) {
1013 UCaseContext csc=UCASECONTEXT_INITIALIZER;
1014 csc.p=(void *)src;
1015 csc.limit=srcLength;
1016 int32_t destIndex = _caseMap(
1017 caseLocale, options, ucase_toFullLower,
1018 dest, destCapacity,
1019 src, &csc, 0, srcLength,
1020 edits, errorCode);
1021 return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
1022 }
1023
1024 U_CFUNC int32_t U_CALLCONV
ustrcase_internalToUpper(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_UNUSED UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,icu::Edits * edits,UErrorCode & errorCode)1025 ustrcase_internalToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
1026 UChar *dest, int32_t destCapacity,
1027 const UChar *src, int32_t srcLength,
1028 icu::Edits *edits,
1029 UErrorCode &errorCode) {
1030 int32_t destIndex;
1031 if (caseLocale == UCASE_LOC_GREEK) {
1032 destIndex = GreekUpper::toUpper(options, dest, destCapacity,
1033 src, srcLength, edits, errorCode);
1034 } else {
1035 UCaseContext csc=UCASECONTEXT_INITIALIZER;
1036 csc.p=(void *)src;
1037 csc.limit=srcLength;
1038 destIndex = _caseMap(
1039 caseLocale, options, ucase_toFullUpper,
1040 dest, destCapacity,
1041 src, &csc, 0, srcLength,
1042 edits, errorCode);
1043 }
1044 return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
1045 }
1046
1047 U_CFUNC int32_t U_CALLCONV
ustrcase_internalFold(int32_t,uint32_t options,UCASEMAP_BREAK_ITERATOR_UNUSED UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,icu::Edits * edits,UErrorCode & errorCode)1048 ustrcase_internalFold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
1049 UChar *dest, int32_t destCapacity,
1050 const UChar *src, int32_t srcLength,
1051 icu::Edits *edits,
1052 UErrorCode &errorCode) {
1053 /* case mapping loop */
1054 int32_t srcIndex = 0;
1055 int32_t destIndex = 0;
1056 while (srcIndex < srcLength) {
1057 int32_t cpStart = srcIndex;
1058 UChar32 c;
1059 U16_NEXT(src, srcIndex, srcLength, c);
1060 const UChar *s;
1061 c = ucase_toFullFolding(c, &s, options);
1062 destIndex = appendResult(dest, destIndex, destCapacity, c, s,
1063 srcIndex - cpStart, options, edits);
1064 if (destIndex < 0) {
1065 errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
1066 return 0;
1067 }
1068 }
1069
1070 return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
1071 }
1072
1073 U_CFUNC int32_t
ustrcase_map(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_PARAM UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,UStringCaseMapper * stringCaseMapper,icu::Edits * edits,UErrorCode & errorCode)1074 ustrcase_map(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
1075 UChar *dest, int32_t destCapacity,
1076 const UChar *src, int32_t srcLength,
1077 UStringCaseMapper *stringCaseMapper,
1078 icu::Edits *edits,
1079 UErrorCode &errorCode) {
1080 int32_t destLength;
1081
1082 /* check argument values */
1083 if(U_FAILURE(errorCode)) {
1084 return 0;
1085 }
1086 if( destCapacity<0 ||
1087 (dest==NULL && destCapacity>0) ||
1088 src==NULL ||
1089 srcLength<-1
1090 ) {
1091 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
1092 return 0;
1093 }
1094
1095 /* get the string length */
1096 if(srcLength==-1) {
1097 srcLength=u_strlen(src);
1098 }
1099
1100 /* check for overlapping source and destination */
1101 if( dest!=NULL &&
1102 ((src>=dest && src<(dest+destCapacity)) ||
1103 (dest>=src && dest<(src+srcLength)))
1104 ) {
1105 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
1106 return 0;
1107 }
1108
1109 if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
1110 edits->reset();
1111 }
1112 destLength=stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
1113 dest, destCapacity, src, srcLength, edits, errorCode);
1114 return u_terminateUChars(dest, destCapacity, destLength, &errorCode);
1115 }
1116
1117 U_CFUNC int32_t
ustrcase_mapWithOverlap(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_PARAM UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,UStringCaseMapper * stringCaseMapper,UErrorCode & errorCode)1118 ustrcase_mapWithOverlap(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
1119 UChar *dest, int32_t destCapacity,
1120 const UChar *src, int32_t srcLength,
1121 UStringCaseMapper *stringCaseMapper,
1122 UErrorCode &errorCode) {
1123 UChar buffer[300];
1124 UChar *temp;
1125
1126 int32_t destLength;
1127
1128 /* check argument values */
1129 if(U_FAILURE(errorCode)) {
1130 return 0;
1131 }
1132 if( destCapacity<0 ||
1133 (dest==NULL && destCapacity>0) ||
1134 src==NULL ||
1135 srcLength<-1
1136 ) {
1137 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
1138 return 0;
1139 }
1140
1141 /* get the string length */
1142 if(srcLength==-1) {
1143 srcLength=u_strlen(src);
1144 }
1145
1146 /* check for overlapping source and destination */
1147 if( dest!=NULL &&
1148 ((src>=dest && src<(dest+destCapacity)) ||
1149 (dest>=src && dest<(src+srcLength)))
1150 ) {
1151 /* overlap: provide a temporary destination buffer and later copy the result */
1152 if(destCapacity<=UPRV_LENGTHOF(buffer)) {
1153 /* the stack buffer is large enough */
1154 temp=buffer;
1155 } else {
1156 /* allocate a buffer */
1157 temp=(UChar *)uprv_malloc(destCapacity*U_SIZEOF_UCHAR);
1158 if(temp==NULL) {
1159 errorCode=U_MEMORY_ALLOCATION_ERROR;
1160 return 0;
1161 }
1162 }
1163 } else {
1164 temp=dest;
1165 }
1166
1167 destLength=stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
1168 temp, destCapacity, src, srcLength, NULL, errorCode);
1169 if(temp!=dest) {
1170 /* copy the result string to the destination buffer */
1171 if (U_SUCCESS(errorCode) && 0 < destLength && destLength <= destCapacity) {
1172 u_memmove(dest, temp, destLength);
1173 }
1174 if(temp!=buffer) {
1175 uprv_free(temp);
1176 }
1177 }
1178
1179 return u_terminateUChars(dest, destCapacity, destLength, &errorCode);
1180 }
1181
1182 /* public API functions */
1183
1184 U_CAPI int32_t U_EXPORT2
u_strFoldCase(UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,uint32_t options,UErrorCode * pErrorCode)1185 u_strFoldCase(UChar *dest, int32_t destCapacity,
1186 const UChar *src, int32_t srcLength,
1187 uint32_t options,
1188 UErrorCode *pErrorCode) {
1189 return ustrcase_mapWithOverlap(
1190 UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
1191 dest, destCapacity,
1192 src, srcLength,
1193 ustrcase_internalFold, *pErrorCode);
1194 }
1195
1196 U_NAMESPACE_BEGIN
1197
fold(uint32_t options,const UChar * src,int32_t srcLength,UChar * dest,int32_t destCapacity,Edits * edits,UErrorCode & errorCode)1198 int32_t CaseMap::fold(
1199 uint32_t options,
1200 const UChar *src, int32_t srcLength,
1201 UChar *dest, int32_t destCapacity, Edits *edits,
1202 UErrorCode &errorCode) {
1203 return ustrcase_map(
1204 UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
1205 dest, destCapacity,
1206 src, srcLength,
1207 ustrcase_internalFold, edits, errorCode);
1208 }
1209
1210 U_NAMESPACE_END
1211
1212 /* case-insensitive string comparisons -------------------------------------- */
1213
1214 /*
1215 * This function is a copy of unorm_cmpEquivFold() minus the parts for
1216 * canonical equivalence.
1217 * Keep the functions in sync, and see there for how this works.
1218 * The duplication is for modularization:
1219 * It makes caseless (but not canonical caseless) matches independent of
1220 * the normalization code.
1221 */
1222
1223 /* stack element for previous-level source/decomposition pointers */
1224 struct CmpEquivLevel {
1225 const UChar *start, *s, *limit;
1226 };
1227 typedef struct CmpEquivLevel CmpEquivLevel;
1228
1229 /**
1230 * Internal implementation code comparing string with case fold.
1231 * This function is called from u_strcmpFold() and u_caseInsensitivePrefixMatch().
1232 *
1233 * @param s1 input string 1
1234 * @param length1 length of string 1, or -1 (NULL terminated)
1235 * @param s2 input string 2
1236 * @param length2 length of string 2, or -1 (NULL terminated)
1237 * @param options compare options
1238 * @param matchLen1 (output) length of partial prefix match in s1
1239 * @param matchLen2 (output) length of partial prefix match in s2
1240 * @param pErrorCode receives error status
1241 * @return The result of comparison
1242 */
_cmpFold(const UChar * s1,int32_t length1,const UChar * s2,int32_t length2,uint32_t options,int32_t * matchLen1,int32_t * matchLen2,UErrorCode * pErrorCode)1243 static int32_t _cmpFold(
1244 const UChar *s1, int32_t length1,
1245 const UChar *s2, int32_t length2,
1246 uint32_t options,
1247 int32_t *matchLen1, int32_t *matchLen2,
1248 UErrorCode *pErrorCode) {
1249 int32_t cmpRes = 0;
1250
1251 /* current-level start/limit - s1/s2 as current */
1252 const UChar *start1, *start2, *limit1, *limit2;
1253
1254 /* points to the original start address */
1255 const UChar *org1, *org2;
1256
1257 /* points to the end of match + 1 */
1258 const UChar *m1, *m2;
1259
1260 /* case folding variables */
1261 const UChar *p;
1262 int32_t length;
1263
1264 /* stacks of previous-level start/current/limit */
1265 CmpEquivLevel stack1[2], stack2[2];
1266
1267 /* case folding buffers, only use current-level start/limit */
1268 UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1];
1269
1270 /* track which is the current level per string */
1271 int32_t level1, level2;
1272
1273 /* current code units, and code points for lookups */
1274 UChar32 c1, c2, cp1, cp2;
1275
1276 /* no argument error checking because this itself is not an API */
1277
1278 /*
1279 * assume that at least the option U_COMPARE_IGNORE_CASE is set
1280 * otherwise this function would have to behave exactly as uprv_strCompare()
1281 */
1282 if(U_FAILURE(*pErrorCode)) {
1283 return 0;
1284 }
1285
1286 /* initialize */
1287 if(matchLen1) {
1288 U_ASSERT(matchLen2 !=NULL);
1289 *matchLen1=0;
1290 *matchLen2=0;
1291 }
1292
1293 start1=m1=org1=s1;
1294 if(length1==-1) {
1295 limit1=NULL;
1296 } else {
1297 limit1=s1+length1;
1298 }
1299
1300 start2=m2=org2=s2;
1301 if(length2==-1) {
1302 limit2=NULL;
1303 } else {
1304 limit2=s2+length2;
1305 }
1306
1307 level1=level2=0;
1308 c1=c2=-1;
1309
1310 /* comparison loop */
1311 for(;;) {
1312 /*
1313 * here a code unit value of -1 means "get another code unit"
1314 * below it will mean "this source is finished"
1315 */
1316
1317 if(c1<0) {
1318 /* get next code unit from string 1, post-increment */
1319 for(;;) {
1320 if(s1==limit1 || ((c1=*s1)==0 && (limit1==NULL || (options&_STRNCMP_STYLE)))) {
1321 if(level1==0) {
1322 c1=-1;
1323 break;
1324 }
1325 } else {
1326 ++s1;
1327 break;
1328 }
1329
1330 /* reached end of level buffer, pop one level */
1331 do {
1332 --level1;
1333 start1=stack1[level1].start; /*Not uninitialized*/
1334 } while(start1==NULL);
1335 s1=stack1[level1].s; /*Not uninitialized*/
1336 limit1=stack1[level1].limit; /*Not uninitialized*/
1337 }
1338 }
1339
1340 if(c2<0) {
1341 /* get next code unit from string 2, post-increment */
1342 for(;;) {
1343 if(s2==limit2 || ((c2=*s2)==0 && (limit2==NULL || (options&_STRNCMP_STYLE)))) {
1344 if(level2==0) {
1345 c2=-1;
1346 break;
1347 }
1348 } else {
1349 ++s2;
1350 break;
1351 }
1352
1353 /* reached end of level buffer, pop one level */
1354 do {
1355 --level2;
1356 start2=stack2[level2].start; /*Not uninitialized*/
1357 } while(start2==NULL);
1358 s2=stack2[level2].s; /*Not uninitialized*/
1359 limit2=stack2[level2].limit; /*Not uninitialized*/
1360 }
1361 }
1362
1363 /*
1364 * compare c1 and c2
1365 * either variable c1, c2 is -1 only if the corresponding string is finished
1366 */
1367 if(c1==c2) {
1368 const UChar *next1, *next2;
1369
1370 if(c1<0) {
1371 cmpRes=0; /* c1==c2==-1 indicating end of strings */
1372 break;
1373 }
1374
1375 /*
1376 * Note: Move the match positions in both strings at the same time
1377 * only when corresponding code point(s) in the original strings
1378 * are fully consumed. For example, when comparing s1="Fust" and
1379 * s2="Fu\u00dfball", s2[2] is folded into "ss", and s1[2] matches
1380 * the first code point in the case-folded data. But the second "s"
1381 * has no matching code point in s1, so this implementation returns
1382 * 2 as the prefix match length ("Fu").
1383 */
1384 next1=next2=NULL;
1385 if(level1==0) {
1386 next1=s1;
1387 } else if(s1==limit1) {
1388 /* Note: This implementation only use a single level of stack.
1389 * If this code needs to be changed to use multiple levels
1390 * of stacks, the code above should check if the current
1391 * code is at the end of all stacks.
1392 */
1393 U_ASSERT(level1==1);
1394
1395 /* is s1 at the end of the current stack? */
1396 next1=stack1[0].s;
1397 }
1398
1399 if (next1!=NULL) {
1400 if(level2==0) {
1401 next2=s2;
1402 } else if(s2==limit2) {
1403 U_ASSERT(level2==1);
1404
1405 /* is s2 at the end of the current stack? */
1406 next2=stack2[0].s;
1407 }
1408 if(next2!=NULL) {
1409 m1=next1;
1410 m2=next2;
1411 }
1412 }
1413 c1=c2=-1; /* make us fetch new code units */
1414 continue;
1415 } else if(c1<0) {
1416 cmpRes=-1; /* string 1 ends before string 2 */
1417 break;
1418 } else if(c2<0) {
1419 cmpRes=1; /* string 2 ends before string 1 */
1420 break;
1421 }
1422 /* c1!=c2 && c1>=0 && c2>=0 */
1423
1424 /* get complete code points for c1, c2 for lookups if either is a surrogate */
1425 cp1=c1;
1426 if(U_IS_SURROGATE(c1)) {
1427 UChar c;
1428
1429 if(U_IS_SURROGATE_LEAD(c1)) {
1430 if(s1!=limit1 && U16_IS_TRAIL(c=*s1)) {
1431 /* advance ++s1; only below if cp1 decomposes/case-folds */
1432 cp1=U16_GET_SUPPLEMENTARY(c1, c);
1433 }
1434 } else /* isTrail(c1) */ {
1435 if(start1<=(s1-2) && U16_IS_LEAD(c=*(s1-2))) {
1436 cp1=U16_GET_SUPPLEMENTARY(c, c1);
1437 }
1438 }
1439 }
1440
1441 cp2=c2;
1442 if(U_IS_SURROGATE(c2)) {
1443 UChar c;
1444
1445 if(U_IS_SURROGATE_LEAD(c2)) {
1446 if(s2!=limit2 && U16_IS_TRAIL(c=*s2)) {
1447 /* advance ++s2; only below if cp2 decomposes/case-folds */
1448 cp2=U16_GET_SUPPLEMENTARY(c2, c);
1449 }
1450 } else /* isTrail(c2) */ {
1451 if(start2<=(s2-2) && U16_IS_LEAD(c=*(s2-2))) {
1452 cp2=U16_GET_SUPPLEMENTARY(c, c2);
1453 }
1454 }
1455 }
1456
1457 /*
1458 * go down one level for each string
1459 * continue with the main loop as soon as there is a real change
1460 */
1461
1462 if( level1==0 &&
1463 (length=ucase_toFullFolding((UChar32)cp1, &p, options))>=0
1464 ) {
1465 /* cp1 case-folds to the code point "length" or to p[length] */
1466 if(U_IS_SURROGATE(c1)) {
1467 if(U_IS_SURROGATE_LEAD(c1)) {
1468 /* advance beyond source surrogate pair if it case-folds */
1469 ++s1;
1470 } else /* isTrail(c1) */ {
1471 /*
1472 * we got a supplementary code point when hitting its trail surrogate,
1473 * therefore the lead surrogate must have been the same as in the other string;
1474 * compare this decomposition with the lead surrogate in the other string
1475 * remember that this simulates bulk text replacement:
1476 * the decomposition would replace the entire code point
1477 */
1478 --s2;
1479 --m2;
1480 c2=*(s2-1);
1481 }
1482 }
1483
1484 /* push current level pointers */
1485 stack1[0].start=start1;
1486 stack1[0].s=s1;
1487 stack1[0].limit=limit1;
1488 ++level1;
1489
1490 /* copy the folding result to fold1[] */
1491 if(length<=UCASE_MAX_STRING_LENGTH) {
1492 u_memcpy(fold1, p, length);
1493 } else {
1494 int32_t i=0;
1495 U16_APPEND_UNSAFE(fold1, i, length);
1496 length=i;
1497 }
1498
1499 /* set next level pointers to case folding */
1500 start1=s1=fold1;
1501 limit1=fold1+length;
1502
1503 /* get ready to read from decomposition, continue with loop */
1504 c1=-1;
1505 continue;
1506 }
1507
1508 if( level2==0 &&
1509 (length=ucase_toFullFolding((UChar32)cp2, &p, options))>=0
1510 ) {
1511 /* cp2 case-folds to the code point "length" or to p[length] */
1512 if(U_IS_SURROGATE(c2)) {
1513 if(U_IS_SURROGATE_LEAD(c2)) {
1514 /* advance beyond source surrogate pair if it case-folds */
1515 ++s2;
1516 } else /* isTrail(c2) */ {
1517 /*
1518 * we got a supplementary code point when hitting its trail surrogate,
1519 * therefore the lead surrogate must have been the same as in the other string;
1520 * compare this decomposition with the lead surrogate in the other string
1521 * remember that this simulates bulk text replacement:
1522 * the decomposition would replace the entire code point
1523 */
1524 --s1;
1525 --m2;
1526 c1=*(s1-1);
1527 }
1528 }
1529
1530 /* push current level pointers */
1531 stack2[0].start=start2;
1532 stack2[0].s=s2;
1533 stack2[0].limit=limit2;
1534 ++level2;
1535
1536 /* copy the folding result to fold2[] */
1537 if(length<=UCASE_MAX_STRING_LENGTH) {
1538 u_memcpy(fold2, p, length);
1539 } else {
1540 int32_t i=0;
1541 U16_APPEND_UNSAFE(fold2, i, length);
1542 length=i;
1543 }
1544
1545 /* set next level pointers to case folding */
1546 start2=s2=fold2;
1547 limit2=fold2+length;
1548
1549 /* get ready to read from decomposition, continue with loop */
1550 c2=-1;
1551 continue;
1552 }
1553
1554 /*
1555 * no decomposition/case folding, max level for both sides:
1556 * return difference result
1557 *
1558 * code point order comparison must not just return cp1-cp2
1559 * because when single surrogates are present then the surrogate pairs
1560 * that formed cp1 and cp2 may be from different string indexes
1561 *
1562 * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
1563 * c1=d800 cp1=10001 c2=dc00 cp2=10000
1564 * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
1565 *
1566 * therefore, use same fix-up as in ustring.c/uprv_strCompare()
1567 * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++
1568 * so we have slightly different pointer/start/limit comparisons here
1569 */
1570
1571 if(c1>=0xd800 && c2>=0xd800 && (options&U_COMPARE_CODE_POINT_ORDER)) {
1572 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
1573 if(
1574 (c1<=0xdbff && s1!=limit1 && U16_IS_TRAIL(*s1)) ||
1575 (U16_IS_TRAIL(c1) && start1!=(s1-1) && U16_IS_LEAD(*(s1-2)))
1576 ) {
1577 /* part of a surrogate pair, leave >=d800 */
1578 } else {
1579 /* BMP code point - may be surrogate code point - make <d800 */
1580 c1-=0x2800;
1581 }
1582
1583 if(
1584 (c2<=0xdbff && s2!=limit2 && U16_IS_TRAIL(*s2)) ||
1585 (U16_IS_TRAIL(c2) && start2!=(s2-1) && U16_IS_LEAD(*(s2-2)))
1586 ) {
1587 /* part of a surrogate pair, leave >=d800 */
1588 } else {
1589 /* BMP code point - may be surrogate code point - make <d800 */
1590 c2-=0x2800;
1591 }
1592 }
1593
1594 cmpRes=c1-c2;
1595 break;
1596 }
1597
1598 if(matchLen1) {
1599 *matchLen1=m1-org1;
1600 *matchLen2=m2-org2;
1601 }
1602 return cmpRes;
1603 }
1604
1605 /* internal function */
1606 U_CFUNC int32_t
u_strcmpFold(const UChar * s1,int32_t length1,const UChar * s2,int32_t length2,uint32_t options,UErrorCode * pErrorCode)1607 u_strcmpFold(const UChar *s1, int32_t length1,
1608 const UChar *s2, int32_t length2,
1609 uint32_t options,
1610 UErrorCode *pErrorCode) {
1611 return _cmpFold(s1, length1, s2, length2, options, NULL, NULL, pErrorCode);
1612 }
1613
1614 /* public API functions */
1615
1616 U_CAPI int32_t U_EXPORT2
u_strCaseCompare(const UChar * s1,int32_t length1,const UChar * s2,int32_t length2,uint32_t options,UErrorCode * pErrorCode)1617 u_strCaseCompare(const UChar *s1, int32_t length1,
1618 const UChar *s2, int32_t length2,
1619 uint32_t options,
1620 UErrorCode *pErrorCode) {
1621 /* argument checking */
1622 if(pErrorCode==0 || U_FAILURE(*pErrorCode)) {
1623 return 0;
1624 }
1625 if(s1==NULL || length1<-1 || s2==NULL || length2<-1) {
1626 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1627 return 0;
1628 }
1629 return u_strcmpFold(s1, length1, s2, length2,
1630 options|U_COMPARE_IGNORE_CASE,
1631 pErrorCode);
1632 }
1633
1634 U_CAPI int32_t U_EXPORT2
u_strcasecmp(const UChar * s1,const UChar * s2,uint32_t options)1635 u_strcasecmp(const UChar *s1, const UChar *s2, uint32_t options) {
1636 UErrorCode errorCode=U_ZERO_ERROR;
1637 return u_strcmpFold(s1, -1, s2, -1,
1638 options|U_COMPARE_IGNORE_CASE,
1639 &errorCode);
1640 }
1641
1642 U_CAPI int32_t U_EXPORT2
u_memcasecmp(const UChar * s1,const UChar * s2,int32_t length,uint32_t options)1643 u_memcasecmp(const UChar *s1, const UChar *s2, int32_t length, uint32_t options) {
1644 UErrorCode errorCode=U_ZERO_ERROR;
1645 return u_strcmpFold(s1, length, s2, length,
1646 options|U_COMPARE_IGNORE_CASE,
1647 &errorCode);
1648 }
1649
1650 U_CAPI int32_t U_EXPORT2
u_strncasecmp(const UChar * s1,const UChar * s2,int32_t n,uint32_t options)1651 u_strncasecmp(const UChar *s1, const UChar *s2, int32_t n, uint32_t options) {
1652 UErrorCode errorCode=U_ZERO_ERROR;
1653 return u_strcmpFold(s1, n, s2, n,
1654 options|(U_COMPARE_IGNORE_CASE|_STRNCMP_STYLE),
1655 &errorCode);
1656 }
1657
1658 /* internal API - detect length of shared prefix */
1659 U_CAPI void
u_caseInsensitivePrefixMatch(const UChar * s1,int32_t length1,const UChar * s2,int32_t length2,uint32_t options,int32_t * matchLen1,int32_t * matchLen2,UErrorCode * pErrorCode)1660 u_caseInsensitivePrefixMatch(const UChar *s1, int32_t length1,
1661 const UChar *s2, int32_t length2,
1662 uint32_t options,
1663 int32_t *matchLen1, int32_t *matchLen2,
1664 UErrorCode *pErrorCode) {
1665 _cmpFold(s1, length1, s2, length2, options,
1666 matchLen1, matchLen2, pErrorCode);
1667 }
1668