1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2004-2014, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: ucase.cpp 11 * encoding: US-ASCII 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2004aug30 16 * created by: Markus W. Scherer 17 * 18 * Low-level Unicode character/string case mapping code. 19 * Much code moved here (and modified) from uchar.c. 20 */ 21 22 #include "unicode/utypes.h" 23 #include "unicode/unistr.h" 24 #include "unicode/uset.h" 25 #include "unicode/udata.h" /* UDataInfo */ 26 #include "unicode/utf16.h" 27 #include "ucmndata.h" /* DataHeader */ 28 #include "udatamem.h" 29 #include "umutex.h" 30 #include "uassert.h" 31 #include "cmemory.h" 32 #include "utrie2.h" 33 #include "ucase.h" 34 35 struct UCaseProps { 36 UDataMemory *mem; 37 const int32_t *indexes; 38 const uint16_t *exceptions; 39 const uint16_t *unfold; 40 41 UTrie2 trie; 42 uint8_t formatVersion[4]; 43 }; 44 45 /* ucase_props_data.h is machine-generated by gencase --csource */ 46 #define INCLUDED_FROM_UCASE_CPP 47 #include "ucase_props_data.h" 48 49 /* UCaseProps singleton ----------------------------------------------------- */ 50 51 U_CAPI const UCaseProps * U_EXPORT2 52 ucase_getSingleton() { 53 return &ucase_props_singleton; 54 } 55 56 /* set of property starts for UnicodeSet ------------------------------------ */ 57 58 static UBool U_CALLCONV 59 _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) { 60 /* add the start code point to the USet */ 61 const USetAdder *sa=(const USetAdder *)context; 62 sa->add(sa->set, start); 63 return TRUE; 64 } 65 66 U_CFUNC void U_EXPORT2 67 
ucase_addPropertyStarts(const UCaseProps *csp, const USetAdder *sa, UErrorCode *pErrorCode) { 68 if(U_FAILURE(*pErrorCode)) { 69 return; 70 } 71 72 /* add the start code point of each same-value range of the trie */ 73 utrie2_enum(&csp->trie, NULL, _enumPropertyStartsRange, sa); 74 75 /* add code points with hardcoded properties, plus the ones following them */ 76 77 /* (none right now, see comment below) */ 78 79 /* 80 * Omit code points with hardcoded specialcasing properties 81 * because we do not build property UnicodeSets for them right now. 82 */ 83 } 84 85 /* data access primitives --------------------------------------------------- */ 86 87 #define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT)) 88 89 #define PROPS_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION) 90 91 /* number of bits in an 8-bit integer value */ 92 static const uint8_t flagsOffset[256]={ 93 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 94 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 95 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 96 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 97 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 98 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 99 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 100 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 101 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 102 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 103 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 104 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 105 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 106 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 107 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 108 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 109 }; 110 111 #define HAS_SLOT(flags, idx) ((flags)&(1<<(idx))) 112 #define SLOT_OFFSET(flags, idx) flagsOffset[(flags)&((1<<(idx))-1)] 113 114 /* 115 * Get the value of an optional-value slot where HAS_SLOT(excWord, idx). 
116 * 117 * @param excWord (in) initial exceptions word 118 * @param idx (in) desired slot index 119 * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++; 120 * moved to the last uint16_t of the value, use +1 for beginning of next slot 121 * @param value (out) int32_t or uint32_t output if hasSlot, otherwise not modified 122 */ 123 #define GET_SLOT_VALUE(excWord, idx, pExc16, value) \ 124 if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \ 125 (pExc16)+=SLOT_OFFSET(excWord, idx); \ 126 (value)=*pExc16; \ 127 } else { \ 128 (pExc16)+=2*SLOT_OFFSET(excWord, idx); \ 129 (value)=*pExc16++; \ 130 (value)=((value)<<16)|*pExc16; \ 131 } 132 133 /* simple case mappings ----------------------------------------------------- */ 134 135 U_CAPI UChar32 U_EXPORT2 136 ucase_tolower(const UCaseProps *csp, UChar32 c) { 137 uint16_t props=UTRIE2_GET16(&csp->trie, c); 138 if(!PROPS_HAS_EXCEPTION(props)) { 139 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) { 140 c+=UCASE_GET_DELTA(props); 141 } 142 } else { 143 const uint16_t *pe=GET_EXCEPTIONS(csp, props); 144 uint16_t excWord=*pe++; 145 if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) { 146 GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c); 147 } 148 } 149 return c; 150 } 151 152 U_CAPI UChar32 U_EXPORT2 153 ucase_toupper(const UCaseProps *csp, UChar32 c) { 154 uint16_t props=UTRIE2_GET16(&csp->trie, c); 155 if(!PROPS_HAS_EXCEPTION(props)) { 156 if(UCASE_GET_TYPE(props)==UCASE_LOWER) { 157 c+=UCASE_GET_DELTA(props); 158 } 159 } else { 160 const uint16_t *pe=GET_EXCEPTIONS(csp, props); 161 uint16_t excWord=*pe++; 162 if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) { 163 GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c); 164 } 165 } 166 return c; 167 } 168 169 U_CAPI UChar32 U_EXPORT2 170 ucase_totitle(const UCaseProps *csp, UChar32 c) { 171 uint16_t props=UTRIE2_GET16(&csp->trie, c); 172 if(!PROPS_HAS_EXCEPTION(props)) { 173 if(UCASE_GET_TYPE(props)==UCASE_LOWER) { 174 c+=UCASE_GET_DELTA(props); 175 } 176 } else { 177 const uint16_t *pe=GET_EXCEPTIONS(csp, 
props); 178 uint16_t excWord=*pe++; 179 int32_t idx; 180 if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) { 181 idx=UCASE_EXC_TITLE; 182 } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) { 183 idx=UCASE_EXC_UPPER; 184 } else { 185 return c; 186 } 187 GET_SLOT_VALUE(excWord, idx, pe, c); 188 } 189 return c; 190 } 191 192 static const UChar iDot[2] = { 0x69, 0x307 }; 193 static const UChar jDot[2] = { 0x6a, 0x307 }; 194 static const UChar iOgonekDot[3] = { 0x12f, 0x307 }; 195 static const UChar iDotGrave[3] = { 0x69, 0x307, 0x300 }; 196 static const UChar iDotAcute[3] = { 0x69, 0x307, 0x301 }; 197 static const UChar iDotTilde[3] = { 0x69, 0x307, 0x303 }; 198 199 200 U_CFUNC void U_EXPORT2 201 ucase_addCaseClosure(const UCaseProps *csp, UChar32 c, const USetAdder *sa) { 202 uint16_t props; 203 204 /* 205 * Hardcode the case closure of i and its relatives and ignore the 206 * data file data for these characters. 207 * The Turkic dotless i and dotted I with their case mapping conditions 208 * and case folding option make the related characters behave specially. 209 * This code matches their closure behavior to their case folding behavior. 
210 */ 211 212 switch(c) { 213 case 0x49: 214 /* regular i and I are in one equivalence class */ 215 sa->add(sa->set, 0x69); 216 return; 217 case 0x69: 218 sa->add(sa->set, 0x49); 219 return; 220 case 0x130: 221 /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */ 222 sa->addString(sa->set, iDot, 2); 223 return; 224 case 0x131: 225 /* dotless i is in a class by itself */ 226 return; 227 default: 228 /* otherwise use the data file data */ 229 break; 230 } 231 232 props=UTRIE2_GET16(&csp->trie, c); 233 if(!PROPS_HAS_EXCEPTION(props)) { 234 if(UCASE_GET_TYPE(props)!=UCASE_NONE) { 235 /* add the one simple case mapping, no matter what type it is */ 236 int32_t delta=UCASE_GET_DELTA(props); 237 if(delta!=0) { 238 sa->add(sa->set, c+delta); 239 } 240 } 241 } else { 242 /* 243 * c has exceptions, so there may be multiple simple and/or 244 * full case mappings. Add them all. 245 */ 246 const uint16_t *pe0, *pe=GET_EXCEPTIONS(csp, props); 247 const UChar *closure; 248 uint16_t excWord=*pe++; 249 int32_t idx, closureLength, fullLength, length; 250 251 pe0=pe; 252 253 /* add all simple case mappings */ 254 for(idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) { 255 if(HAS_SLOT(excWord, idx)) { 256 pe=pe0; 257 GET_SLOT_VALUE(excWord, idx, pe, c); 258 sa->add(sa->set, c); 259 } 260 } 261 262 /* get the closure string pointer & length */ 263 if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) { 264 pe=pe0; 265 GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength); 266 closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */ 267 closure=(const UChar *)pe+1; /* behind this slot, unless there are full case mappings */ 268 } else { 269 closureLength=0; 270 closure=NULL; 271 } 272 273 /* add the full case folding */ 274 if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) { 275 pe=pe0; 276 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength); 277 278 /* start of full case mapping strings */ 279 ++pe; 280 281 fullLength&=0xffff; 
/* bits 16 and higher are reserved */ 282 283 /* skip the lowercase result string */ 284 pe+=fullLength&UCASE_FULL_LOWER; 285 fullLength>>=4; 286 287 /* add the full case folding string */ 288 length=fullLength&0xf; 289 if(length!=0) { 290 sa->addString(sa->set, (const UChar *)pe, length); 291 pe+=length; 292 } 293 294 /* skip the uppercase and titlecase strings */ 295 fullLength>>=4; 296 pe+=fullLength&0xf; 297 fullLength>>=4; 298 pe+=fullLength; 299 300 closure=(const UChar *)pe; /* behind full case mappings */ 301 } 302 303 /* add each code point in the closure string */ 304 for(idx=0; idx<closureLength;) { 305 U16_NEXT_UNSAFE(closure, idx, c); 306 sa->add(sa->set, c); 307 } 308 } 309 } 310 311 /* 312 * compare s, which has a length, with t, which has a maximum length or is NUL-terminated 313 * must be length>0 and max>0 and length<=max 314 */ 315 static inline int32_t 316 strcmpMax(const UChar *s, int32_t length, const UChar *t, int32_t max) { 317 int32_t c1, c2; 318 319 max-=length; /* we require length<=max, so no need to decrement max in the loop */ 320 do { 321 c1=*s++; 322 c2=*t++; 323 if(c2==0) { 324 return 1; /* reached the end of t but not of s */ 325 } 326 c1-=c2; 327 if(c1!=0) { 328 return c1; /* return difference result */ 329 } 330 } while(--length>0); 331 /* ends with length==0 */ 332 333 if(max==0 || *t==0) { 334 return 0; /* equal to length of both strings */ 335 } else { 336 return -max; /* return lengh difference */ 337 } 338 } 339 340 U_CFUNC UBool U_EXPORT2 341 ucase_addStringCaseClosure(const UCaseProps *csp, const UChar *s, int32_t length, const USetAdder *sa) { 342 int32_t i, start, limit, result, unfoldRows, unfoldRowWidth, unfoldStringWidth; 343 344 if(csp->unfold==NULL || s==NULL) { 345 return FALSE; /* no reverse case folding data, or no string */ 346 } 347 if(length<=1) { 348 /* the string is too short to find any match */ 349 /* 350 * more precise would be: 351 * if(!u_strHasMoreChar32Than(s, length, 1)) 352 * but this does not make 
much practical difference because 353 * a single supplementary code point would just not be found 354 */ 355 return FALSE; 356 } 357 358 const uint16_t *unfold=csp->unfold; 359 unfoldRows=unfold[UCASE_UNFOLD_ROWS]; 360 unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH]; 361 unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH]; 362 unfold+=unfoldRowWidth; 363 364 if(length>unfoldStringWidth) { 365 /* the string is too long to find any match */ 366 return FALSE; 367 } 368 369 /* do a binary search for the string */ 370 start=0; 371 limit=unfoldRows; 372 while(start<limit) { 373 i=(start+limit)/2; 374 const UChar *p=reinterpret_cast<const UChar *>(unfold+(i*unfoldRowWidth)); 375 result=strcmpMax(s, length, p, unfoldStringWidth); 376 377 if(result==0) { 378 /* found the string: add each code point, and its case closure */ 379 UChar32 c; 380 381 for(i=unfoldStringWidth; i<unfoldRowWidth && p[i]!=0;) { 382 U16_NEXT_UNSAFE(p, i, c); 383 sa->add(sa->set, c); 384 ucase_addCaseClosure(csp, c, sa); 385 } 386 return TRUE; 387 } else if(result<0) { 388 limit=i; 389 } else /* result>0 */ { 390 start=i+1; 391 } 392 } 393 394 return FALSE; /* string not found */ 395 } 396 397 U_NAMESPACE_BEGIN 398 399 FullCaseFoldingIterator::FullCaseFoldingIterator() 400 : unfold(reinterpret_cast<const UChar *>(ucase_props_singleton.unfold)), 401 unfoldRows(unfold[UCASE_UNFOLD_ROWS]), 402 unfoldRowWidth(unfold[UCASE_UNFOLD_ROW_WIDTH]), 403 unfoldStringWidth(unfold[UCASE_UNFOLD_STRING_WIDTH]), 404 currentRow(0), 405 rowCpIndex(unfoldStringWidth) { 406 unfold+=unfoldRowWidth; 407 } 408 409 UChar32 410 FullCaseFoldingIterator::next(UnicodeString &full) { 411 // Advance past the last-delivered code point. 
412 const UChar *p=unfold+(currentRow*unfoldRowWidth); 413 if(rowCpIndex>=unfoldRowWidth || p[rowCpIndex]==0) { 414 ++currentRow; 415 p+=unfoldRowWidth; 416 rowCpIndex=unfoldStringWidth; 417 } 418 if(currentRow>=unfoldRows) { return U_SENTINEL; } 419 // Set "full" to the NUL-terminated string in the first unfold column. 420 int32_t length=unfoldStringWidth; 421 while(length>0 && p[length-1]==0) { --length; } 422 full.setTo(FALSE, p, length); 423 // Return the code point. 424 UChar32 c; 425 U16_NEXT_UNSAFE(p, rowCpIndex, c); 426 return c; 427 } 428 429 U_NAMESPACE_END 430 431 /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */ 432 U_CAPI int32_t U_EXPORT2 433 ucase_getType(const UCaseProps *csp, UChar32 c) { 434 uint16_t props=UTRIE2_GET16(&csp->trie, c); 435 return UCASE_GET_TYPE(props); 436 } 437 438 /** @return same as ucase_getType() and set bit 2 if c is case-ignorable */ 439 U_CAPI int32_t U_EXPORT2 440 ucase_getTypeOrIgnorable(const UCaseProps *csp, UChar32 c) { 441 uint16_t props=UTRIE2_GET16(&csp->trie, c); 442 return UCASE_GET_TYPE_AND_IGNORABLE(props); 443 } 444 445 /** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */ 446 static inline int32_t 447 getDotType(const UCaseProps *csp, UChar32 c) { 448 uint16_t props=UTRIE2_GET16(&csp->trie, c); 449 if(!PROPS_HAS_EXCEPTION(props)) { 450 return props&UCASE_DOT_MASK; 451 } else { 452 const uint16_t *pe=GET_EXCEPTIONS(csp, props); 453 return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK; 454 } 455 } 456 457 U_CAPI UBool U_EXPORT2 458 ucase_isSoftDotted(const UCaseProps *csp, UChar32 c) { 459 return (UBool)(getDotType(csp, c)==UCASE_SOFT_DOTTED); 460 } 461 462 U_CAPI UBool U_EXPORT2 463 ucase_isCaseSensitive(const UCaseProps *csp, UChar32 c) { 464 uint16_t props=UTRIE2_GET16(&csp->trie, c); 465 return (UBool)((props&UCASE_SENSITIVE)!=0); 466 } 467 468 /* string casing ------------------------------------------------------------ */ 469 470 /* 471 * These internal functions form 
the core of string case mappings. 472 * They map single code points to result code points or strings and take 473 * all necessary conditions (context, locale ID, options) into account. 474 * 475 * They do not iterate over the source or write to the destination 476 * so that the same functions are useful for non-standard string storage, 477 * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc. 478 * For the same reason, the "surrounding text" context is passed in as a 479 * UCaseContextIterator which does not make any assumptions about 480 * the underlying storage. 481 * 482 * This section contains helper functions that check for conditions 483 * in the input text surrounding the current code point 484 * according to SpecialCasing.txt. 485 * 486 * Each helper function gets the index 487 * - after the current code point if it looks at following text 488 * - before the current code point if it looks at preceding text 489 * 490 * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows: 491 * 492 * Final_Sigma 493 * C is preceded by a sequence consisting of 494 * a cased letter and a case-ignorable sequence, 495 * and C is not followed by a sequence consisting of 496 * an ignorable sequence and then a cased letter. 497 * 498 * More_Above 499 * C is followed by one or more characters of combining class 230 (ABOVE) 500 * in the combining character sequence. 501 * 502 * After_Soft_Dotted 503 * The last preceding character with combining class of zero before C 504 * was Soft_Dotted, 505 * and there is no intervening combining character class 230 (ABOVE). 506 * 507 * Before_Dot 508 * C is followed by combining dot above (U+0307). 509 * Any sequence of characters with a combining class that is neither 0 nor 230 510 * may intervene between the current character and the combining dot above. 
511 * 512 * The erratum from 2002-10-31 adds the condition 513 * 514 * After_I 515 * The last preceding base character was an uppercase I, and there is no 516 * intervening combining character class 230 (ABOVE). 517 * 518 * (See Jitterbug 2344 and the comments on After_I below.) 519 * 520 * Helper definitions in Unicode 3.2 UAX 21: 521 * 522 * D1. A character C is defined to be cased 523 * if it meets any of the following criteria: 524 * 525 * - The general category of C is Titlecase Letter (Lt) 526 * - In [CoreProps], C has one of the properties Uppercase, or Lowercase 527 * - Given D = NFD(C), then it is not the case that: 528 * D = UCD_lower(D) = UCD_upper(D) = UCD_title(D) 529 * (This third criterium does not add any characters to the list 530 * for Unicode 3.2. Ignored.) 531 * 532 * D2. A character C is defined to be case-ignorable 533 * if it meets either of the following criteria: 534 * 535 * - The general category of C is 536 * Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or 537 * Letter Modifier (Lm), or Symbol Modifier (Sk) 538 * - C is one of the following characters 539 * U+0027 APOSTROPHE 540 * U+00AD SOFT HYPHEN (SHY) 541 * U+2019 RIGHT SINGLE QUOTATION MARK 542 * (the preferred character for apostrophe) 543 * 544 * D3. A case-ignorable sequence is a sequence of 545 * zero or more case-ignorable characters. 546 */ 547 548 #define is_a(c) ((c)=='a' || (c)=='A') 549 #define is_d(c) ((c)=='d' || (c)=='D') 550 #define is_e(c) ((c)=='e' || (c)=='E') 551 #define is_i(c) ((c)=='i' || (c)=='I') 552 #define is_l(c) ((c)=='l' || (c)=='L') 553 #define is_n(c) ((c)=='n' || (c)=='N') 554 #define is_r(c) ((c)=='r' || (c)=='R') 555 #define is_t(c) ((c)=='t' || (c)=='T') 556 #define is_u(c) ((c)=='u' || (c)=='U') 557 #define is_z(c) ((c)=='z' || (c)=='Z') 558 559 /* separator? 
*/ 560 #define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0) 561 562 /** 563 * Requires non-NULL locale ID but otherwise does the equivalent of 564 * checking for language codes as if uloc_getLanguage() were called: 565 * Accepts both 2- and 3-letter codes and accepts case variants. 566 */ 567 U_CFUNC int32_t 568 ucase_getCaseLocale(const char *locale, int32_t *locCache) { 569 int32_t result; 570 char c; 571 572 if(locCache!=NULL && (result=*locCache)!=UCASE_LOC_UNKNOWN) { 573 return result; 574 } 575 576 result=UCASE_LOC_ROOT; 577 578 /* 579 * This function used to use uloc_getLanguage(), but the current code 580 * removes the dependency of this low-level code on uloc implementation code 581 * and is faster because not the whole locale ID has to be 582 * examined and copied/transformed. 583 * 584 * Because this code does not want to depend on uloc, the caller must 585 * pass in a non-NULL locale, i.e., may need to call uloc_getDefault(). 586 */ 587 c=*locale++; 588 if(is_t(c)) { 589 /* tr or tur? */ 590 c=*locale++; 591 if(is_u(c)) { 592 c=*locale++; 593 } 594 if(is_r(c)) { 595 c=*locale; 596 if(is_sep(c)) { 597 result=UCASE_LOC_TURKISH; 598 } 599 } 600 } else if(is_a(c)) { 601 /* az or aze? */ 602 c=*locale++; 603 if(is_z(c)) { 604 c=*locale++; 605 if(is_e(c)) { 606 c=*locale; 607 } 608 if(is_sep(c)) { 609 result=UCASE_LOC_TURKISH; 610 } 611 } 612 } else if(is_l(c)) { 613 /* lt or lit? */ 614 c=*locale++; 615 if(is_i(c)) { 616 c=*locale++; 617 } 618 if(is_t(c)) { 619 c=*locale; 620 if(is_sep(c)) { 621 result=UCASE_LOC_LITHUANIAN; 622 } 623 } 624 } else if(is_e(c)) { 625 /* el or ell? */ 626 c=*locale++; 627 if(is_l(c)) { 628 c=*locale++; 629 if(is_l(c)) { 630 c=*locale; 631 } 632 if(is_sep(c)) { 633 result=UCASE_LOC_GREEK; 634 } 635 } 636 } else if(is_n(c)) { 637 /* nl or nld? 
*/ 638 c=*locale++; 639 if(is_l(c)) { 640 c=*locale++; 641 if(is_d(c)) { 642 c=*locale; 643 } 644 if(is_sep(c)) { 645 result=UCASE_LOC_DUTCH; 646 } 647 } 648 } 649 650 if(locCache!=NULL) { 651 *locCache=result; 652 } 653 return result; 654 } 655 656 /* 657 * Is followed by 658 * {case-ignorable}* cased 659 * ? 660 * (dir determines looking forward/backward) 661 * If a character is case-ignorable, it is skipped regardless of whether 662 * it is also cased or not. 663 */ 664 static UBool 665 isFollowedByCasedLetter(const UCaseProps *csp, UCaseContextIterator *iter, void *context, int8_t dir) { 666 UChar32 c; 667 668 if(iter==NULL) { 669 return FALSE; 670 } 671 672 for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) { 673 int32_t type=ucase_getTypeOrIgnorable(csp, c); 674 if(type&4) { 675 /* case-ignorable, continue with the loop */ 676 } else if(type!=UCASE_NONE) { 677 return TRUE; /* followed by cased letter */ 678 } else { 679 return FALSE; /* uncased and not case-ignorable */ 680 } 681 } 682 683 return FALSE; /* not followed by cased letter */ 684 } 685 686 /* Is preceded by Soft_Dotted character with no intervening cc=230 ? 
*/
static UBool
isPrecededBySoftDotted(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
    if(iter==NULL) {
        return FALSE;
    }

    /* dir=-1 on the first call starts backward iteration, 0 continues it */
    int8_t dir=-1;
    UChar32 c;
    while((c=iter(context, dir))>=0) {
        int32_t dotType=getDotType(csp, c);
        if(dotType==UCASE_SOFT_DOTTED) {
            return TRUE; /* preceded by TYPE_i */
        }
        if(dotType!=UCASE_OTHER_ACCENT) {
            return FALSE; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
        }
        dir=0;
    }

    return FALSE; /* not preceded by TYPE_i */
}

/*
 * See Jitterbug 2344:
 * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
 * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
 * we made those releases compatible with Unicode 3.2 which had not fixed
 * a related bug in SpecialCasing.txt.
 *
 * From the Jitterbug 2344 text:
 * ... this bug is listed as a Unicode erratum
 * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
 * <quote>
 * There are two errors in SpecialCasing.txt.
 * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
 * 2. An incorrect context definition. Correct as follows:
 * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
 * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
 * ---
 * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
 * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
 * where the context After_I is defined as:
 * The last preceding base character was an uppercase I, and there is no
 * intervening combining character class 230 (ABOVE).
 * </quote>
 *
 * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
 *
 * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
736 * # This matches the behavior of the canonically equivalent I-dot_above 737 * 738 * See also the description in this place in older versions of uchar.c (revision 1.100). 739 * 740 * Markus W. Scherer 2003-feb-15 741 */ 742 743 /* Is preceded by base character 'I' with no intervening cc=230 ? */ 744 static UBool 745 isPrecededBy_I(const UCaseProps *csp, UCaseContextIterator *iter, void *context) { 746 UChar32 c; 747 int32_t dotType; 748 int8_t dir; 749 750 if(iter==NULL) { 751 return FALSE; 752 } 753 754 for(dir=-1; (c=iter(context, dir))>=0; dir=0) { 755 if(c==0x49) { 756 return TRUE; /* preceded by I */ 757 } 758 dotType=getDotType(csp, c); 759 if(dotType!=UCASE_OTHER_ACCENT) { 760 return FALSE; /* preceded by different base character (not I), or intervening cc==230 */ 761 } 762 } 763 764 return FALSE; /* not preceded by I */ 765 } 766 767 /* Is followed by one or more cc==230 ? */ 768 static UBool 769 isFollowedByMoreAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) { 770 UChar32 c; 771 int32_t dotType; 772 int8_t dir; 773 774 if(iter==NULL) { 775 return FALSE; 776 } 777 778 for(dir=1; (c=iter(context, dir))>=0; dir=0) { 779 dotType=getDotType(csp, c); 780 if(dotType==UCASE_ABOVE) { 781 return TRUE; /* at least one cc==230 following */ 782 } else if(dotType!=UCASE_OTHER_ACCENT) { 783 return FALSE; /* next base character, no more cc==230 following */ 784 } 785 } 786 787 return FALSE; /* no more cc==230 following */ 788 } 789 790 /* Is followed by a dot above (without cc==230 in between) ? 
*/ 791 static UBool 792 isFollowedByDotAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) { 793 UChar32 c; 794 int32_t dotType; 795 int8_t dir; 796 797 if(iter==NULL) { 798 return FALSE; 799 } 800 801 for(dir=1; (c=iter(context, dir))>=0; dir=0) { 802 if(c==0x307) { 803 return TRUE; 804 } 805 dotType=getDotType(csp, c); 806 if(dotType!=UCASE_OTHER_ACCENT) { 807 return FALSE; /* next base character or cc==230 in between */ 808 } 809 } 810 811 return FALSE; /* no dot above following */ 812 } 813 814 U_CAPI int32_t U_EXPORT2 815 ucase_toFullLower(const UCaseProps *csp, UChar32 c, 816 UCaseContextIterator *iter, void *context, 817 const UChar **pString, 818 const char *locale, int32_t *locCache) { 819 // The sign of the result has meaning, input must be non-negative so that it can be returned as is. 820 U_ASSERT(c >= 0); 821 UChar32 result=c; 822 uint16_t props=UTRIE2_GET16(&csp->trie, c); 823 if(!PROPS_HAS_EXCEPTION(props)) { 824 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) { 825 result=c+UCASE_GET_DELTA(props); 826 } 827 } else { 828 const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2; 829 uint16_t excWord=*pe++; 830 int32_t full; 831 832 pe2=pe; 833 834 if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) { 835 /* use hardcoded conditions and mappings */ 836 int32_t loc=ucase_getCaseLocale(locale, locCache); 837 838 /* 839 * Test for conditional mappings first 840 * (otherwise the unconditional default mappings are always taken), 841 * then test for characters that have unconditional mappings in SpecialCasing.txt, 842 * then get the UnicodeData.txt mappings. 843 */ 844 if( loc==UCASE_LOC_LITHUANIAN && 845 /* base characters, find accents above */ 846 (((c==0x49 || c==0x4a || c==0x12e) && 847 isFollowedByMoreAbove(csp, iter, context)) || 848 /* precomposed with accent above, no need to find one */ 849 (c==0xcc || c==0xcd || c==0x128)) 850 ) { 851 /* 852 # Lithuanian 853 854 # Lithuanian retains the dot in a lowercase i when followed by accents. 
855 856 # Introduce an explicit dot above when lowercasing capital I's and J's 857 # whenever there are more accents above. 858 # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek) 859 860 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I 861 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J 862 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK 863 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE 864 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE 865 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE 866 */ 867 switch(c) { 868 case 0x49: /* LATIN CAPITAL LETTER I */ 869 *pString=iDot; 870 return 2; 871 case 0x4a: /* LATIN CAPITAL LETTER J */ 872 *pString=jDot; 873 return 2; 874 case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */ 875 *pString=iOgonekDot; 876 return 2; 877 case 0xcc: /* LATIN CAPITAL LETTER I WITH GRAVE */ 878 *pString=iDotGrave; 879 return 3; 880 case 0xcd: /* LATIN CAPITAL LETTER I WITH ACUTE */ 881 *pString=iDotAcute; 882 return 3; 883 case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */ 884 *pString=iDotTilde; 885 return 3; 886 default: 887 return 0; /* will not occur */ 888 } 889 /* # Turkish and Azeri */ 890 } else if(loc==UCASE_LOC_TURKISH && c==0x130) { 891 /* 892 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri 893 # The following rules handle those cases. 894 895 0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE 896 0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE 897 */ 898 return 0x69; 899 } else if(loc==UCASE_LOC_TURKISH && c==0x307 && isPrecededBy_I(csp, iter, context)) { 900 /* 901 # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i. 
902 # This matches the behavior of the canonically equivalent I-dot_above 903 904 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE 905 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE 906 */ 907 return 0; /* remove the dot (continue without output) */ 908 } else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(csp, iter, context)) { 909 /* 910 # When lowercasing, unless an I is before a dot_above, it turns into a dotless i. 911 912 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I 913 0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I 914 */ 915 return 0x131; 916 } else if(c==0x130) { 917 /* 918 # Preserve canonical equivalence for I with dot. Turkic is handled below. 919 920 0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE 921 */ 922 *pString=iDot; 923 return 2; 924 } else if( c==0x3a3 && 925 !isFollowedByCasedLetter(csp, iter, context, 1) && 926 isFollowedByCasedLetter(csp, iter, context, -1) /* -1=preceded */ 927 ) { 928 /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */ 929 /* 930 # Special case for final form of sigma 931 932 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA 933 */ 934 return 0x3c2; /* greek small final sigma */ 935 } else { 936 /* no known conditional special case mapping, use a normal mapping */ 937 } 938 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) { 939 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full); 940 full&=UCASE_FULL_LOWER; 941 if(full!=0) { 942 /* set the output pointer to the lowercase mapping */ 943 *pString=reinterpret_cast<const UChar *>(pe+1); 944 945 /* return the string length */ 946 return full; 947 } 948 } 949 950 if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) { 951 GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result); 952 } 953 } 954 955 return (result==c) ? 
~result : result; 956 } 957 958 /* internal */ 959 static int32_t 960 toUpperOrTitle(const UCaseProps *csp, UChar32 c, 961 UCaseContextIterator *iter, void *context, 962 const UChar **pString, 963 const char *locale, int32_t *locCache, 964 UBool upperNotTitle) { 965 // The sign of the result has meaning, input must be non-negative so that it can be returned as is. 966 U_ASSERT(c >= 0); 967 UChar32 result=c; 968 uint16_t props=UTRIE2_GET16(&csp->trie, c); 969 if(!PROPS_HAS_EXCEPTION(props)) { 970 if(UCASE_GET_TYPE(props)==UCASE_LOWER) { 971 result=c+UCASE_GET_DELTA(props); 972 } 973 } else { 974 const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2; 975 uint16_t excWord=*pe++; 976 int32_t full, idx; 977 978 pe2=pe; 979 980 if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) { 981 /* use hardcoded conditions and mappings */ 982 int32_t loc=ucase_getCaseLocale(locale, locCache); 983 984 if(loc==UCASE_LOC_TURKISH && c==0x69) { 985 /* 986 # Turkish and Azeri 987 988 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri 989 # The following rules handle those cases. 990 991 # When uppercasing, i turns into a dotted capital I 992 993 0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I 994 0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I 995 */ 996 return 0x130; 997 } else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(csp, iter, context)) { 998 /* 999 # Lithuanian 1000 1001 # Lithuanian retains the dot in a lowercase i when followed by accents. 
1002 1003 # Remove DOT ABOVE after "i" with upper or titlecase 1004 1005 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE 1006 */ 1007 return 0; /* remove the dot (continue without output) */ 1008 } else { 1009 /* no known conditional special case mapping, use a normal mapping */ 1010 } 1011 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) { 1012 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full); 1013 1014 /* start of full case mapping strings */ 1015 ++pe; 1016 1017 /* skip the lowercase and case-folding result strings */ 1018 pe+=full&UCASE_FULL_LOWER; 1019 full>>=4; 1020 pe+=full&0xf; 1021 full>>=4; 1022 1023 if(upperNotTitle) { 1024 full&=0xf; 1025 } else { 1026 /* skip the uppercase result string */ 1027 pe+=full&0xf; 1028 full=(full>>4)&0xf; 1029 } 1030 1031 if(full!=0) { 1032 /* set the output pointer to the result string */ 1033 *pString=reinterpret_cast<const UChar *>(pe); 1034 1035 /* return the string length */ 1036 return full; 1037 } 1038 } 1039 1040 if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) { 1041 idx=UCASE_EXC_TITLE; 1042 } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) { 1043 /* here, titlecase is same as uppercase */ 1044 idx=UCASE_EXC_UPPER; 1045 } else { 1046 return ~c; 1047 } 1048 GET_SLOT_VALUE(excWord, idx, pe2, result); 1049 } 1050 1051 return (result==c) ? 
~result : result; 1052 } 1053 1054 U_CAPI int32_t U_EXPORT2 1055 ucase_toFullUpper(const UCaseProps *csp, UChar32 c, 1056 UCaseContextIterator *iter, void *context, 1057 const UChar **pString, 1058 const char *locale, int32_t *locCache) { 1059 return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, TRUE); 1060 } 1061 1062 U_CAPI int32_t U_EXPORT2 1063 ucase_toFullTitle(const UCaseProps *csp, UChar32 c, 1064 UCaseContextIterator *iter, void *context, 1065 const UChar **pString, 1066 const char *locale, int32_t *locCache) { 1067 return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, FALSE); 1068 } 1069 1070 /* case folding ------------------------------------------------------------- */ 1071 1072 /* 1073 * Case folding is similar to lowercasing. 1074 * The result may be a simple mapping, i.e., a single code point, or 1075 * a full mapping, i.e., a string. 1076 * If the case folding for a code point is the same as its simple (1:1) lowercase mapping, 1077 * then only the lowercase mapping is stored. 1078 * 1079 * Some special cases are hardcoded because their conditions cannot be 1080 * parsed and processed from CaseFolding.txt. 1081 * 1082 * Unicode 3.2 CaseFolding.txt specifies for its status field: 1083 1084 # C: common case folding, common mappings shared by both simple and full mappings. 1085 # F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces. 1086 # S: simple case folding, mappings to single characters where different from F. 1087 # T: special case for uppercase I and dotted uppercase I 1088 # - For non-Turkic languages, this mapping is normally not used. 1089 # - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters. 1090 # 1091 # Usage: 1092 # A. To do a simple case folding, use the mappings with status C + S. 1093 # B. To do a full case folding, use the mappings with status C + F. 
1094 # 1095 # The mappings with status T can be used or omitted depending on the desired case-folding 1096 # behavior. (The default option is to exclude them.) 1097 1098 * Unicode 3.2 has 'T' mappings as follows: 1099 1100 0049; T; 0131; # LATIN CAPITAL LETTER I 1101 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE 1102 1103 * while the default mappings for these code points are: 1104 1105 0049; C; 0069; # LATIN CAPITAL LETTER I 1106 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE 1107 1108 * U+0130 has no simple case folding (simple-case-folds to itself). 1109 */ 1110 1111 /* return the simple case folding mapping for c */ 1112 U_CAPI UChar32 U_EXPORT2 1113 ucase_fold(const UCaseProps *csp, UChar32 c, uint32_t options) { 1114 uint16_t props=UTRIE2_GET16(&csp->trie, c); 1115 if(!PROPS_HAS_EXCEPTION(props)) { 1116 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) { 1117 c+=UCASE_GET_DELTA(props); 1118 } 1119 } else { 1120 const uint16_t *pe=GET_EXCEPTIONS(csp, props); 1121 uint16_t excWord=*pe++; 1122 int32_t idx; 1123 if(excWord&UCASE_EXC_CONDITIONAL_FOLD) { 1124 /* special case folding mappings, hardcoded */ 1125 if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) { 1126 /* default mappings */ 1127 if(c==0x49) { 1128 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */ 1129 return 0x69; 1130 } else if(c==0x130) { 1131 /* no simple case folding for U+0130 */ 1132 return c; 1133 } 1134 } else { 1135 /* Turkic mappings */ 1136 if(c==0x49) { 1137 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */ 1138 return 0x131; 1139 } else if(c==0x130) { 1140 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */ 1141 return 0x69; 1142 } 1143 } 1144 } 1145 if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) { 1146 idx=UCASE_EXC_FOLD; 1147 } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) { 1148 idx=UCASE_EXC_LOWER; 1149 } else { 1150 return c; 1151 } 1152 GET_SLOT_VALUE(excWord, idx, pe, c); 1153 } 1154 return c; 1155 } 1156 1157 /* 1158 * Issue for canonical caseless match (UAX #21): 1159 
* Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve 1160 * canonical equivalence, unlike default-option casefolding. 1161 * For example, I-grave and I + grave fold to strings that are not canonically 1162 * equivalent. 1163 * For more details, see the comment in unorm_compare() in unorm.cpp 1164 * and the intermediate prototype changes for Jitterbug 2021. 1165 * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.) 1166 * 1167 * This did not get fixed because it appears that it is not possible to fix 1168 * it for uppercase and lowercase characters (I-grave vs. i-grave) 1169 * together in a way that they still fold to common result strings. 1170 */ 1171 1172 U_CAPI int32_t U_EXPORT2 1173 ucase_toFullFolding(const UCaseProps *csp, UChar32 c, 1174 const UChar **pString, 1175 uint32_t options) { 1176 // The sign of the result has meaning, input must be non-negative so that it can be returned as is. 1177 U_ASSERT(c >= 0); 1178 UChar32 result=c; 1179 uint16_t props=UTRIE2_GET16(&csp->trie, c); 1180 if(!PROPS_HAS_EXCEPTION(props)) { 1181 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) { 1182 result=c+UCASE_GET_DELTA(props); 1183 } 1184 } else { 1185 const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2; 1186 uint16_t excWord=*pe++; 1187 int32_t full, idx; 1188 1189 pe2=pe; 1190 1191 if(excWord&UCASE_EXC_CONDITIONAL_FOLD) { 1192 /* use hardcoded conditions and mappings */ 1193 if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) { 1194 /* default mappings */ 1195 if(c==0x49) { 1196 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */ 1197 return 0x69; 1198 } else if(c==0x130) { 1199 /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */ 1200 *pString=iDot; 1201 return 2; 1202 } 1203 } else { 1204 /* Turkic mappings */ 1205 if(c==0x49) { 1206 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */ 1207 return 0x131; 1208 } else if(c==0x130) { 1209 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */ 1210 return 0x69; 1211 } 1212 } 
1213 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) { 1214 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full); 1215 1216 /* start of full case mapping strings */ 1217 ++pe; 1218 1219 /* skip the lowercase result string */ 1220 pe+=full&UCASE_FULL_LOWER; 1221 full=(full>>4)&0xf; 1222 1223 if(full!=0) { 1224 /* set the output pointer to the result string */ 1225 *pString=reinterpret_cast<const UChar *>(pe); 1226 1227 /* return the string length */ 1228 return full; 1229 } 1230 } 1231 1232 if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) { 1233 idx=UCASE_EXC_FOLD; 1234 } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) { 1235 idx=UCASE_EXC_LOWER; 1236 } else { 1237 return ~c; 1238 } 1239 GET_SLOT_VALUE(excWord, idx, pe2, result); 1240 } 1241 1242 return (result==c) ? ~result : result; 1243 } 1244 1245 /* case mapping properties API ---------------------------------------------- */ 1246 1247 #define GET_CASE_PROPS() &ucase_props_singleton 1248 1249 /* public API (see uchar.h) */ 1250 1251 U_CAPI UBool U_EXPORT2 1252 u_isULowercase(UChar32 c) { 1253 return (UBool)(UCASE_LOWER==ucase_getType(GET_CASE_PROPS(), c)); 1254 } 1255 1256 U_CAPI UBool U_EXPORT2 1257 u_isUUppercase(UChar32 c) { 1258 return (UBool)(UCASE_UPPER==ucase_getType(GET_CASE_PROPS(), c)); 1259 } 1260 1261 /* Transforms the Unicode character to its lower case equivalent.*/ 1262 U_CAPI UChar32 U_EXPORT2 1263 u_tolower(UChar32 c) { 1264 return ucase_tolower(GET_CASE_PROPS(), c); 1265 } 1266 1267 /* Transforms the Unicode character to its upper case equivalent.*/ 1268 U_CAPI UChar32 U_EXPORT2 1269 u_toupper(UChar32 c) { 1270 return ucase_toupper(GET_CASE_PROPS(), c); 1271 } 1272 1273 /* Transforms the Unicode character to its title case equivalent.*/ 1274 U_CAPI UChar32 U_EXPORT2 1275 u_totitle(UChar32 c) { 1276 return ucase_totitle(GET_CASE_PROPS(), c); 1277 } 1278 1279 /* return the simple case folding mapping for c */ 1280 U_CAPI UChar32 U_EXPORT2 1281 u_foldCase(UChar32 c, uint32_t options) { 1282 return 
ucase_fold(GET_CASE_PROPS(), c, options); 1283 } 1284 1285 U_CFUNC int32_t U_EXPORT2 1286 ucase_hasBinaryProperty(UChar32 c, UProperty which) { 1287 /* case mapping properties */ 1288 const UChar *resultString; 1289 int32_t locCache; 1290 const UCaseProps *csp=GET_CASE_PROPS(); 1291 if(csp==NULL) { 1292 return FALSE; 1293 } 1294 switch(which) { 1295 case UCHAR_LOWERCASE: 1296 return (UBool)(UCASE_LOWER==ucase_getType(csp, c)); 1297 case UCHAR_UPPERCASE: 1298 return (UBool)(UCASE_UPPER==ucase_getType(csp, c)); 1299 case UCHAR_SOFT_DOTTED: 1300 return ucase_isSoftDotted(csp, c); 1301 case UCHAR_CASE_SENSITIVE: 1302 return ucase_isCaseSensitive(csp, c); 1303 case UCHAR_CASED: 1304 return (UBool)(UCASE_NONE!=ucase_getType(csp, c)); 1305 case UCHAR_CASE_IGNORABLE: 1306 return (UBool)(ucase_getTypeOrIgnorable(csp, c)>>2); 1307 /* 1308 * Note: The following Changes_When_Xyz are defined as testing whether 1309 * the NFD form of the input changes when Xyz-case-mapped. 1310 * However, this simpler implementation of these properties, 1311 * ignoring NFD, passes the tests. 1312 * The implementation needs to be changed if the tests start failing. 1313 * When that happens, optimizations should be used to work with the 1314 * per-single-code point ucase_toFullXyz() functions unless 1315 * the NFD form has more than one code point, 1316 * and the property starts set needs to be the union of the 1317 * start sets for normalization and case mappings. 
1318 */ 1319 case UCHAR_CHANGES_WHEN_LOWERCASED: 1320 locCache=UCASE_LOC_ROOT; 1321 return (UBool)(ucase_toFullLower(csp, c, NULL, NULL, &resultString, "", &locCache)>=0); 1322 case UCHAR_CHANGES_WHEN_UPPERCASED: 1323 locCache=UCASE_LOC_ROOT; 1324 return (UBool)(ucase_toFullUpper(csp, c, NULL, NULL, &resultString, "", &locCache)>=0); 1325 case UCHAR_CHANGES_WHEN_TITLECASED: 1326 locCache=UCASE_LOC_ROOT; 1327 return (UBool)(ucase_toFullTitle(csp, c, NULL, NULL, &resultString, "", &locCache)>=0); 1328 /* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */ 1329 case UCHAR_CHANGES_WHEN_CASEMAPPED: 1330 locCache=UCASE_LOC_ROOT; 1331 return (UBool)( 1332 ucase_toFullLower(csp, c, NULL, NULL, &resultString, "", &locCache)>=0 || 1333 ucase_toFullUpper(csp, c, NULL, NULL, &resultString, "", &locCache)>=0 || 1334 ucase_toFullTitle(csp, c, NULL, NULL, &resultString, "", &locCache)>=0); 1335 default: 1336 return FALSE; 1337 } 1338 } 1339