1 /* 2 * Copyright (c) 2009, 2018, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 /* 27 ******************************************************************************* 28 * Copyright (C) 2009-2014, International Business Machines 29 * Corporation and others. All Rights Reserved. 30 ******************************************************************************* 31 */ 32 package sun.text.normalizer; 33 34 import java.io.IOException; 35 import java.nio.ByteBuffer; 36 import java.text.Normalizer; 37 38 // Original filename in ICU4J: Normalizer2Impl.java 39 public final class NormalizerImpl { 40 public static final class Hangul { 41 /* Korean Hangul and Jamo constants */ 42 public static final int JAMO_L_BASE=0x1100; /* "lead" jamo */ 43 public static final int JAMO_V_BASE=0x1161; /* "vowel" jamo */ 44 public static final int JAMO_T_BASE=0x11a7; /* "trail" jamo */ 45 46 public static final int HANGUL_BASE=0xac00; 47 public static final int HANGUL_END=0xd7a3; 48 49 public static final int JAMO_L_COUNT=19; 50 public static final int JAMO_V_COUNT=21; 51 public static final int JAMO_T_COUNT=28; 52 53 public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT; 54 public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT; 55 isHangul(int c)56 public static boolean isHangul(int c) { 57 return HANGUL_BASE<=c && c<HANGUL_LIMIT; 58 } isHangulLV(int c)59 public static boolean isHangulLV(int c) { 60 c-=HANGUL_BASE; 61 return 0<=c && c<HANGUL_COUNT && c%JAMO_T_COUNT==0; 62 } 63 64 /** 65 * Decomposes c, which must be a Hangul syllable, into buffer 66 * and returns the length of the decomposition (2 or 3). 67 */ decompose(int c, Appendable buffer)68 public static int decompose(int c, Appendable buffer) { 69 try { 70 c-=HANGUL_BASE; 71 int c2=c%JAMO_T_COUNT; 72 c/=JAMO_T_COUNT; 73 buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT)); 74 buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT)); 75 if(c2==0) { 76 return 2; 77 } else { 78 buffer.append((char)(JAMO_T_BASE+c2)); 79 return 3; 80 } 81 } catch(IOException e) { 82 throw new InternalError(e); 83 } 84 } 85 } 86 87 /** 88 * Writable buffer that takes care of canonical ordering. 89 * Its Appendable methods behave like the C++ implementation's 90 * appendZeroCC() methods. 91 * <p> 92 * If dest is a StringBuilder, then the buffer writes directly to it. 93 * Otherwise, the buffer maintains a StringBuilder for intermediate text segments 94 * until no further changes are necessary and whole segments are appended. 95 * append() methods that take combining-class values always write to the StringBuilder. 96 * Other append() methods flush and append to the Appendable. 97 */ 98 public static final class ReorderingBuffer implements Appendable { ReorderingBuffer(NormalizerImpl ni, Appendable dest, int destCapacity)99 public ReorderingBuffer(NormalizerImpl ni, Appendable dest, int destCapacity) { 100 impl=ni; 101 app=dest; 102 if (app instanceof StringBuilder) { 103 appIsStringBuilder=true; 104 str=(StringBuilder)dest; 105 // In Java, the constructor subsumes public void init(int destCapacity) 106 str.ensureCapacity(destCapacity); 107 reorderStart=0; 108 if(str.length()==0) { 109 lastCC=0; 110 } else { 111 setIterator(); 112 lastCC=previousCC(); 113 // Set reorderStart after the last code point with cc<=1 if there is one. 114 if(lastCC>1) { 115 while(previousCC()>1) {} 116 } 117 reorderStart=codePointLimit; 118 } 119 } else { 120 appIsStringBuilder=false; 121 str=new StringBuilder(); 122 reorderStart=0; 123 lastCC=0; 124 } 125 } 126 isEmpty()127 public boolean isEmpty() { return str.length()==0; } length()128 public int length() { return str.length(); } getLastCC()129 public int getLastCC() { return lastCC; } 130 getStringBuilder()131 public StringBuilder getStringBuilder() { return str; } 132 equals(CharSequence s, int start, int limit)133 public boolean equals(CharSequence s, int start, int limit) { 134 return UTF16Plus.equal(str, 0, str.length(), s, start, limit); 135 } 136 append(int c, int cc)137 public void append(int c, int cc) { 138 if(lastCC<=cc || cc==0) { 139 str.appendCodePoint(c); 140 lastCC=cc; 141 if(cc<=1) { 142 reorderStart=str.length(); 143 } 144 } else { 145 insert(c, cc); 146 } 147 } 148 // s must be in NFD, otherwise change the implementation. append(CharSequence s, int start, int limit, int leadCC, int trailCC)149 public void append(CharSequence s, int start, int limit, 150 int leadCC, int trailCC) { 151 if(start==limit) { 152 return; 153 } 154 if(lastCC<=leadCC || leadCC==0) { 155 if(trailCC<=1) { 156 reorderStart=str.length()+(limit-start); 157 } else if(leadCC<=1) { 158 reorderStart=str.length()+1; // Ok if not a code point boundary. 159 } 160 str.append(s, start, limit); 161 lastCC=trailCC; 162 } else { 163 int c=Character.codePointAt(s, start); 164 start+=Character.charCount(c); 165 insert(c, leadCC); // insert first code point 166 while(start<limit) { 167 c=Character.codePointAt(s, start); 168 start+=Character.charCount(c); 169 if(start<limit) { 170 // s must be in NFD, otherwise we need to use getCC(). 171 leadCC=getCCFromYesOrMaybe(impl.getNorm16(c)); 172 } else { 173 leadCC=trailCC; 174 } 175 append(c, leadCC); 176 } 177 } 178 } 179 // The following append() methods work like C++ appendZeroCC(). 180 // They assume that the cc or trailCC of their input is 0. 181 // Most of them implement Appendable interface methods. 182 @Override append(char c)183 public ReorderingBuffer append(char c) { 184 str.append(c); 185 lastCC=0; 186 reorderStart=str.length(); 187 return this; 188 } appendZeroCC(int c)189 public void appendZeroCC(int c) { 190 str.appendCodePoint(c); 191 lastCC=0; 192 reorderStart=str.length(); 193 } 194 @Override append(CharSequence s)195 public ReorderingBuffer append(CharSequence s) { 196 if(s.length()!=0) { 197 str.append(s); 198 lastCC=0; 199 reorderStart=str.length(); 200 } 201 return this; 202 } 203 @Override append(CharSequence s, int start, int limit)204 public ReorderingBuffer append(CharSequence s, int start, int limit) { 205 if(start!=limit) { 206 str.append(s, start, limit); 207 lastCC=0; 208 reorderStart=str.length(); 209 } 210 return this; 211 } 212 /** 213 * Flushes from the intermediate StringBuilder to the Appendable, 214 * if they are different objects. 215 * Used after recomposition. 216 * Must be called at the end when writing to a non-StringBuilder Appendable. 217 */ flush()218 public void flush() { 219 if(appIsStringBuilder) { 220 reorderStart=str.length(); 221 } else { 222 try { 223 app.append(str); 224 str.setLength(0); 225 reorderStart=0; 226 } catch(IOException e) { 227 throw new InternalError(e); // Avoid declaring "throws IOException". 228 } 229 } 230 lastCC=0; 231 } 232 /** 233 * Flushes from the intermediate StringBuilder to the Appendable, 234 * if they are different objects. 235 * Then appends the new text to the Appendable or StringBuilder. 236 * Normally used after quick check loops find a non-empty sequence. 237 */ flushAndAppendZeroCC(CharSequence s, int start, int limit)238 public ReorderingBuffer flushAndAppendZeroCC(CharSequence s, int start, int limit) { 239 if(appIsStringBuilder) { 240 str.append(s, start, limit); 241 reorderStart=str.length(); 242 } else { 243 try { 244 app.append(str).append(s, start, limit); 245 str.setLength(0); 246 reorderStart=0; 247 } catch(IOException e) { 248 throw new InternalError(e); // Avoid declaring "throws IOException". 249 } 250 } 251 lastCC=0; 252 return this; 253 } remove()254 public void remove() { 255 str.setLength(0); 256 lastCC=0; 257 reorderStart=0; 258 } removeSuffix(int suffixLength)259 public void removeSuffix(int suffixLength) { 260 int oldLength=str.length(); 261 str.delete(oldLength-suffixLength, oldLength); 262 lastCC=0; 263 reorderStart=str.length(); 264 } 265 266 // Inserts c somewhere before the last character. 267 // Requires 0<cc<lastCC which implies reorderStart<limit. insert(int c, int cc)268 private void insert(int c, int cc) { 269 for(setIterator(), skipPrevious(); previousCC()>cc;) {} 270 // insert c at codePointLimit, after the character with prevCC<=cc 271 if(c<=0xffff) { 272 str.insert(codePointLimit, (char)c); 273 if(cc<=1) { 274 reorderStart=codePointLimit+1; 275 } 276 } else { 277 str.insert(codePointLimit, Character.toChars(c)); 278 if(cc<=1) { 279 reorderStart=codePointLimit+2; 280 } 281 } 282 } 283 284 private final NormalizerImpl impl; 285 private final Appendable app; 286 private final StringBuilder str; 287 private final boolean appIsStringBuilder; 288 private int reorderStart; 289 private int lastCC; 290 291 // private backward iterator setIterator()292 private void setIterator() { codePointStart=str.length(); } skipPrevious()293 private void skipPrevious() { // Requires 0<codePointStart. 294 codePointLimit=codePointStart; 295 codePointStart=str.offsetByCodePoints(codePointStart, -1); 296 } previousCC()297 private int previousCC() { // Returns 0 if there is no previous character. 298 codePointLimit=codePointStart; 299 if(reorderStart>=codePointStart) { 300 return 0; 301 } 302 int c=str.codePointBefore(codePointStart); 303 codePointStart-=Character.charCount(c); 304 return impl.getCCFromYesOrMaybeCP(c); 305 } 306 private int codePointStart, codePointLimit; 307 } 308 309 // TODO: Propose as public API on the UTF16 class. 310 // TODO: Propose widening UTF16 methods that take char to take int. 311 // TODO: Propose widening UTF16 methods that take String to take CharSequence. 312 public static final class UTF16Plus { 313 /** 314 * Assuming c is a surrogate code point (UTF16.isSurrogate(c)), 315 * is it a lead surrogate? 316 * @param c code unit or code point 317 * @return true or false 318 */ isSurrogateLead(int c)319 public static boolean isSurrogateLead(int c) { return (c&0x400)==0; } 320 321 /** 322 * Compares two CharSequence subsequences for binary equality. 323 * @param s1 first sequence 324 * @param start1 start offset in first sequence 325 * @param limit1 limit offset in first sequence 326 * @param s2 second sequence 327 * @param start2 start offset in second sequence 328 * @param limit2 limit offset in second sequence 329 * @return true if s1.subSequence(start1, limit1) contains the same text 330 * as s2.subSequence(start2, limit2) 331 */ equal(CharSequence s1, int start1, int limit1, CharSequence s2, int start2, int limit2)332 public static boolean equal(CharSequence s1, int start1, int limit1, 333 CharSequence s2, int start2, int limit2) { 334 if((limit1-start1)!=(limit2-start2)) { 335 return false; 336 } 337 if(s1==s2 && start1==start2) { 338 return true; 339 } 340 while(start1<limit1) { 341 if(s1.charAt(start1++)!=s2.charAt(start2++)) { 342 return false; 343 } 344 } 345 return true; 346 } 347 } 348 NormalizerImpl()349 public NormalizerImpl() {} 350 351 private static final class IsAcceptable implements ICUBinary.Authenticate { isDataVersionAcceptable(byte version[])352 public boolean isDataVersionAcceptable(byte version[]) { 353 return version[0]==3; 354 } 355 } 356 private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable(); 357 private static final int DATA_FORMAT = 0x4e726d32; // "Nrm2" 358 load(ByteBuffer bytes)359 public NormalizerImpl load(ByteBuffer bytes) { 360 try { 361 dataVersion=ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE); 362 int indexesLength=bytes.getInt()/4; // inIndexes[IX_NORM_TRIE_OFFSET]/4 363 if(indexesLength<=IX_MIN_LCCC_CP) { 364 throw new InternalError("Normalizer2 data: not enough indexes"); 365 } 366 int[] inIndexes=new int[indexesLength]; 367 inIndexes[0]=indexesLength*4; 368 for(int i=1; i<indexesLength; ++i) { 369 inIndexes[i]=bytes.getInt(); 370 } 371 372 minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP]; 373 minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP]; 374 minLcccCP=inIndexes[IX_MIN_LCCC_CP]; 375 376 minYesNo=inIndexes[IX_MIN_YES_NO]; 377 minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY]; 378 minNoNo=inIndexes[IX_MIN_NO_NO]; 379 minNoNoCompBoundaryBefore=inIndexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]; 380 minNoNoCompNoMaybeCC=inIndexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC]; 381 minNoNoEmpty=inIndexes[IX_MIN_NO_NO_EMPTY]; 382 limitNoNo=inIndexes[IX_LIMIT_NO_NO]; 383 minMaybeYes=inIndexes[IX_MIN_MAYBE_YES]; 384 assert((minMaybeYes&7)==0); // 8-aligned for noNoDelta bit fields 385 centerNoNoDelta=(minMaybeYes>>DELTA_SHIFT)-MAX_DELTA-1; 386 387 // Read the normTrie. 388 int offset=inIndexes[IX_NORM_TRIE_OFFSET]; 389 int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET]; 390 normTrie=Trie2_16.createFromSerialized(bytes); 391 int trieLength=normTrie.getSerializedLength(); 392 if(trieLength>(nextOffset-offset)) { 393 throw new InternalError("Normalizer2 data: not enough bytes for normTrie"); 394 } 395 ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength); // skip padding after trie bytes 396 397 // Read the composition and mapping data. 398 offset=nextOffset; 399 nextOffset=inIndexes[IX_SMALL_FCD_OFFSET]; 400 int numChars=(nextOffset-offset)/2; 401 char[] chars; 402 if(numChars!=0) { 403 chars=new char[numChars]; 404 for(int i=0; i<numChars; ++i) { 405 chars[i]=bytes.getChar(); 406 } 407 maybeYesCompositions=new String(chars); 408 extraData=maybeYesCompositions.substring((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT); 409 } 410 411 // smallFCD: new in formatVersion 2 412 offset=nextOffset; 413 smallFCD=new byte[0x100]; 414 bytes.get(smallFCD); 415 416 return this; 417 } catch(IOException e) { 418 throw new InternalError(e); 419 } 420 } load(String name)421 public NormalizerImpl load(String name) { 422 return load(ICUBinary.getRequiredData(name)); 423 } 424 425 getNorm16(int c)426 public int getNorm16(int c) { return normTrie.get(c); } isAlgorithmicNoNo(int norm16)427 public boolean isAlgorithmicNoNo(int norm16) { return limitNoNo<=norm16 && norm16<minMaybeYes; } isCompNo(int norm16)428 public boolean isCompNo(int norm16) { return minNoNo<=norm16 && norm16<minMaybeYes; } isDecompYes(int norm16)429 public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; } 430 getCC(int norm16)431 public int getCC(int norm16) { 432 if(norm16>=MIN_NORMAL_MAYBE_YES) { 433 return getCCFromNormalYesOrMaybe(norm16); 434 } 435 if(norm16<minNoNo || limitNoNo<=norm16) { 436 return 0; 437 } 438 return getCCFromNoNo(norm16); 439 } getCCFromNormalYesOrMaybe(int norm16)440 public static int getCCFromNormalYesOrMaybe(int norm16) { 441 return (norm16 >> OFFSET_SHIFT) & 0xff; 442 } getCCFromYesOrMaybe(int norm16)443 public static int getCCFromYesOrMaybe(int norm16) { 444 return norm16>=MIN_NORMAL_MAYBE_YES ? getCCFromNormalYesOrMaybe(norm16) : 0; 445 } getCCFromYesOrMaybeCP(int c)446 public int getCCFromYesOrMaybeCP(int c) { 447 if (c < minCompNoMaybeCP) { return 0; } 448 return getCCFromYesOrMaybe(getNorm16(c)); 449 } 450 451 /** 452 * Returns the FCD data for code point c. 453 * @param c A Unicode code point. 454 * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. 455 */ getFCD16(int c)456 public int getFCD16(int c) { 457 if(c<minDecompNoCP) { 458 return 0; 459 } else if(c<=0xffff) { 460 if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; } 461 } 462 return getFCD16FromNormData(c); 463 } 464 /** Returns true if the single-or-lead code unit c might have non-zero FCD data. */ singleLeadMightHaveNonZeroFCD16(int lead)465 public boolean singleLeadMightHaveNonZeroFCD16(int lead) { 466 // 0<=lead<=0xffff 467 byte bits=smallFCD[lead>>8]; 468 if(bits==0) { return false; } 469 return ((bits>>((lead>>5)&7))&1)!=0; 470 } 471 472 /** Gets the FCD value from the regular normalization data. */ getFCD16FromNormData(int c)473 public int getFCD16FromNormData(int c) { 474 int norm16=getNorm16(c); 475 if (norm16 >= limitNoNo) { 476 if(norm16>=MIN_NORMAL_MAYBE_YES) { 477 // combining mark 478 norm16=getCCFromNormalYesOrMaybe(norm16); 479 return norm16|(norm16<<8); 480 } else if(norm16>=minMaybeYes) { 481 return 0; 482 } else { // isDecompNoAlgorithmic(norm16) 483 int deltaTrailCC = norm16 & DELTA_TCCC_MASK; 484 if (deltaTrailCC <= DELTA_TCCC_1) { 485 return deltaTrailCC >> OFFSET_SHIFT; 486 } 487 // Maps to an isCompYesAndZeroCC. 488 c=mapAlgorithmic(c, norm16); 489 norm16=getNorm16(c); 490 } 491 } 492 if(norm16<=minYesNo || isHangulLVT(norm16)) { 493 // no decomposition or Hangul syllable, all zeros 494 return 0; 495 } 496 // c decomposes, get everything from the variable-length extra data 497 int mapping=norm16>>OFFSET_SHIFT; 498 int firstUnit=extraData.charAt(mapping); 499 int fcd16=firstUnit>>8; // tccc 500 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 501 fcd16|=extraData.charAt(mapping-1)&0xff00; // lccc 502 } 503 return fcd16; 504 } 505 506 /** 507 * Gets the decomposition for one code point. 508 * @param c code point 509 * @return c's decomposition, if it has one; returns null if it does not have a decomposition 510 */ getDecomposition(int c)511 public String getDecomposition(int c) { 512 int norm16; 513 if(c<minDecompNoCP || isMaybeOrNonZeroCC(norm16=getNorm16(c))) { 514 // c does not decompose 515 return null; 516 } 517 int decomp = -1; 518 if(isDecompNoAlgorithmic(norm16)) { 519 // Maps to an isCompYesAndZeroCC. 520 decomp=c=mapAlgorithmic(c, norm16); 521 // The mapping might decompose further. 522 norm16 = getNorm16(c); 523 } 524 if (norm16 < minYesNo) { 525 if(decomp<0) { 526 return null; 527 } else { 528 return UTF16.valueOf(decomp); 529 } 530 } else if(isHangulLV(norm16) || isHangulLVT(norm16)) { 531 // Hangul syllable: decompose algorithmically 532 StringBuilder buffer=new StringBuilder(); 533 Hangul.decompose(c, buffer); 534 return buffer.toString(); 535 } 536 // c decomposes, get everything from the variable-length extra data 537 int mapping=norm16>>OFFSET_SHIFT; 538 int length=extraData.charAt(mapping++)&MAPPING_LENGTH_MASK; 539 return extraData.substring(mapping, mapping+length); 540 } 541 542 // Fixed norm16 values. 543 public static final int MIN_YES_YES_WITH_CC=0xfe02; 544 public static final int JAMO_VT=0xfe00; 545 public static final int MIN_NORMAL_MAYBE_YES=0xfc00; 546 public static final int JAMO_L=2; // offset=1 hasCompBoundaryAfter=FALSE 547 public static final int INERT=1; // offset=0 hasCompBoundaryAfter=TRUE 548 549 // norm16 bit 0 is comp-boundary-after. 550 public static final int HAS_COMP_BOUNDARY_AFTER=1; 551 public static final int OFFSET_SHIFT=1; 552 553 // For algorithmic one-way mappings, norm16 bits 2..1 indicate the 554 // tccc (0, 1, >1) for quick FCC boundary-after tests. 555 public static final int DELTA_TCCC_0=0; 556 public static final int DELTA_TCCC_1=2; 557 public static final int DELTA_TCCC_GT_1=4; 558 public static final int DELTA_TCCC_MASK=6; 559 public static final int DELTA_SHIFT=3; 560 561 public static final int MAX_DELTA=0x40; 562 563 // Byte offsets from the start of the data, after the generic header. 564 public static final int IX_NORM_TRIE_OFFSET=0; 565 public static final int IX_EXTRA_DATA_OFFSET=1; 566 public static final int IX_SMALL_FCD_OFFSET=2; 567 public static final int IX_RESERVED3_OFFSET=3; 568 public static final int IX_TOTAL_SIZE=7; 569 public static final int MIN_CCC_LCCC_CP=0x300; 570 // Code point thresholds for quick check codes. 571 public static final int IX_MIN_DECOMP_NO_CP=8; 572 public static final int IX_MIN_COMP_NO_MAYBE_CP=9; 573 574 // Norm16 value thresholds for quick check combinations and types of extra data. 575 576 /** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */ 577 public static final int IX_MIN_YES_NO=10; 578 /** Mappings are comp-normalized. */ 579 public static final int IX_MIN_NO_NO=11; 580 public static final int IX_LIMIT_NO_NO=12; 581 public static final int IX_MIN_MAYBE_YES=13; 582 583 /** Mappings only in [minYesNoMappingsOnly..minNoNo[. */ 584 public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14; 585 /** Mappings are not comp-normalized but have a comp boundary before. */ 586 public static final int IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE=15; 587 /** Mappings do not have a comp boundary before. */ 588 public static final int IX_MIN_NO_NO_COMP_NO_MAYBE_CC=16; 589 /** Mappings to the empty string. */ 590 public static final int IX_MIN_NO_NO_EMPTY=17; 591 592 public static final int IX_MIN_LCCC_CP=18; 593 public static final int IX_COUNT=20; 594 595 public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80; 596 public static final int MAPPING_HAS_RAW_MAPPING=0x40; 597 // unused bit 0x20; 598 public static final int MAPPING_LENGTH_MASK=0x1f; 599 600 public static final int COMP_1_LAST_TUPLE=0x8000; 601 public static final int COMP_1_TRIPLE=1; 602 public static final int COMP_1_TRAIL_LIMIT=0x3400; 603 public static final int COMP_1_TRAIL_MASK=0x7ffe; 604 public static final int COMP_1_TRAIL_SHIFT=9; // 10-1 for the "triple" bit 605 public static final int COMP_2_TRAIL_SHIFT=6; 606 public static final int COMP_2_TRAIL_MASK=0xffc0; 607 608 // higher-level functionality ------------------------------------------ *** 609 610 /** 611 * Decomposes s[src, limit[ and writes the result to dest. 612 * limit can be NULL if src is NUL-terminated. 613 * destLengthEstimate is the initial dest buffer capacity and can be -1. 614 */ decompose(CharSequence s, int src, int limit, StringBuilder dest, int destLengthEstimate)615 public void decompose(CharSequence s, int src, int limit, StringBuilder dest, 616 int destLengthEstimate) { 617 if(destLengthEstimate<0) { 618 destLengthEstimate=limit-src; 619 } 620 dest.setLength(0); 621 ReorderingBuffer buffer=new ReorderingBuffer(this, dest, destLengthEstimate); 622 decompose(s, src, limit, buffer); 623 } 624 625 // Dual functionality: 626 // buffer!=NULL: normalize 627 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes decompose(CharSequence s, int src, int limit, ReorderingBuffer buffer)628 public int decompose(CharSequence s, int src, int limit, 629 ReorderingBuffer buffer) { 630 int minNoCP=minDecompNoCP; 631 632 int prevSrc; 633 int c=0; 634 int norm16=0; 635 636 // only for quick check 637 int prevBoundary=src; 638 int prevCC=0; 639 640 for(;;) { 641 // count code units below the minimum or with irrelevant data for the quick check 642 for(prevSrc=src; src!=limit;) { 643 if( (c=s.charAt(src))<minNoCP || 644 isMostDecompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c)) 645 ) { 646 ++src; 647 } else if(!UTF16.isSurrogate((char)c)) { 648 break; 649 } else { 650 char c2; 651 if(UTF16Plus.isSurrogateLead(c)) { 652 if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) { 653 c=Character.toCodePoint((char)c, c2); 654 } 655 } else /* trail surrogate */ { 656 if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) { 657 --src; 658 c=Character.toCodePoint(c2, (char)c); 659 } 660 } 661 if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) { 662 src+=Character.charCount(c); 663 } else { 664 break; 665 } 666 } 667 } 668 // copy these code units all at once 669 if(src!=prevSrc) { 670 if(buffer!=null) { 671 buffer.flushAndAppendZeroCC(s, prevSrc, src); 672 } else { 673 prevCC=0; 674 prevBoundary=src; 675 } 676 } 677 if(src==limit) { 678 break; 679 } 680 681 // Check one above-minimum, relevant code point. 682 src+=Character.charCount(c); 683 if(buffer!=null) { 684 decompose(c, norm16, buffer); 685 } else { 686 if(isDecompYes(norm16)) { 687 int cc=getCCFromYesOrMaybe(norm16); 688 if(prevCC<=cc || cc==0) { 689 prevCC=cc; 690 if(cc<=1) { 691 prevBoundary=src; 692 } 693 continue; 694 } 695 } 696 return prevBoundary; // "no" or cc out of order 697 } 698 } 699 return src; 700 } decomposeAndAppend(CharSequence s, boolean doDecompose, ReorderingBuffer buffer)701 public void decomposeAndAppend(CharSequence s, boolean doDecompose, ReorderingBuffer buffer) { 702 int limit=s.length(); 703 if(limit==0) { 704 return; 705 } 706 if(doDecompose) { 707 decompose(s, 0, limit, buffer); 708 return; 709 } 710 // Just merge the strings at the boundary. 711 int c=Character.codePointAt(s, 0); 712 int src=0; 713 int firstCC, prevCC, cc; 714 firstCC=prevCC=cc=getCC(getNorm16(c)); 715 while(cc!=0) { 716 prevCC=cc; 717 src+=Character.charCount(c); 718 if(src>=limit) { 719 break; 720 } 721 c=Character.codePointAt(s, src); 722 cc=getCC(getNorm16(c)); 723 }; 724 buffer.append(s, 0, src, firstCC, prevCC); 725 buffer.append(s, src, limit); 726 } 727 728 // Very similar to composeQuickCheck(): Make the same changes in both places if relevant. 729 // doCompose: normalize 730 // !doCompose: isNormalized (buffer must be empty and initialized) compose(CharSequence s, int src, int limit, boolean onlyContiguous, boolean doCompose, ReorderingBuffer buffer)731 public boolean compose(CharSequence s, int src, int limit, 732 boolean onlyContiguous, 733 boolean doCompose, 734 ReorderingBuffer buffer) { 735 int prevBoundary=src; 736 int minNoMaybeCP=minCompNoMaybeCP; 737 738 for (;;) { 739 // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point, 740 // or with (compYes && ccc==0) properties. 741 int prevSrc; 742 int c = 0; 743 int norm16 = 0; 744 for (;;) { 745 if (src == limit) { 746 if (prevBoundary != limit && doCompose) { 747 buffer.append(s, prevBoundary, limit); 748 } 749 return true; 750 } 751 if( (c=s.charAt(src))<minNoMaybeCP || 752 isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c)) 753 ) { 754 ++src; 755 } else { 756 prevSrc = src++; 757 if(!UTF16.isSurrogate((char)c)) { 758 break; 759 } else { 760 char c2; 761 if(UTF16Plus.isSurrogateLead(c)) { 762 if(src!=limit && Character.isLowSurrogate(c2=s.charAt(src))) { 763 ++src; 764 c=Character.toCodePoint((char)c, c2); 765 } 766 } else /* trail surrogate */ { 767 if(prevBoundary<prevSrc && Character.isHighSurrogate(c2=s.charAt(prevSrc-1))) { 768 --prevSrc; 769 c=Character.toCodePoint(c2, (char)c); 770 } 771 } 772 if(!isCompYesAndZeroCC(norm16=getNorm16(c))) { 773 break; 774 } 775 } 776 } 777 } 778 // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. 779 // The current character is either a "noNo" (has a mapping) 780 // or a "maybeYes" (combines backward) 781 // or a "yesYes" with ccc!=0. 782 // It is not a Hangul syllable or Jamo L because those have "yes" properties. 783 784 // Medium-fast path: Handle cases that do not require full decomposition and recomposition. 785 if (!isMaybeOrNonZeroCC(norm16)) { // minNoNo <= norm16 < minMaybeYes 786 if (!doCompose) { 787 return false; 788 } 789 // Fast path for mapping a character that is immediately surrounded by boundaries. 790 // In this case, we need not decompose around the current character. 791 if (isDecompNoAlgorithmic(norm16)) { 792 // Maps to a single isCompYesAndZeroCC character 793 // which also implies hasCompBoundaryBefore. 794 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) || 795 hasCompBoundaryBefore(s, src, limit)) { 796 if (prevBoundary != prevSrc) { 797 buffer.append(s, prevBoundary, prevSrc); 798 } 799 buffer.append(mapAlgorithmic(c, norm16), 0); 800 prevBoundary = src; 801 continue; 802 } 803 } else if (norm16 < minNoNoCompBoundaryBefore) { 804 // The mapping is comp-normalized which also implies hasCompBoundaryBefore. 805 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) || 806 hasCompBoundaryBefore(s, src, limit)) { 807 if (prevBoundary != prevSrc) { 808 buffer.append(s, prevBoundary, prevSrc); 809 } 810 int mapping = norm16 >> OFFSET_SHIFT; 811 int length = extraData.charAt(mapping++) & MAPPING_LENGTH_MASK; 812 buffer.append(extraData, mapping, mapping + length); 813 prevBoundary = src; 814 continue; 815 } 816 } else if (norm16 >= minNoNoEmpty) { 817 // The current character maps to nothing. 818 // Simply omit it from the output if there is a boundary before _or_ after it. 819 // The character itself implies no boundaries. 820 if (hasCompBoundaryBefore(s, src, limit) || 821 hasCompBoundaryAfter(s, prevBoundary, prevSrc, onlyContiguous)) { 822 if (prevBoundary != prevSrc) { 823 buffer.append(s, prevBoundary, prevSrc); 824 } 825 prevBoundary = src; 826 continue; 827 } 828 } 829 // Other "noNo" type, or need to examine more text around this character: 830 // Fall through to the slow path. 831 } else if (isJamoVT(norm16) && prevBoundary != prevSrc) { 832 char prev=s.charAt(prevSrc-1); 833 if(c<Hangul.JAMO_T_BASE) { 834 // The current character is a Jamo Vowel, 835 // compose with previous Jamo L and following Jamo T. 836 char l = (char)(prev-Hangul.JAMO_L_BASE); 837 if(l<Hangul.JAMO_L_COUNT) { 838 if (!doCompose) { 839 return false; 840 } 841 int t; 842 if (src != limit && 843 0 < (t = (s.charAt(src) - Hangul.JAMO_T_BASE)) && 844 t < Hangul.JAMO_T_COUNT) { 845 // The next character is a Jamo T. 846 ++src; 847 } else if (hasCompBoundaryBefore(s, src, limit)) { 848 // No Jamo T follows, not even via decomposition. 849 t = 0; 850 } else { 851 t = -1; 852 } 853 if (t >= 0) { 854 int syllable = Hangul.HANGUL_BASE + 855 (l*Hangul.JAMO_V_COUNT + (c-Hangul.JAMO_V_BASE)) * 856 Hangul.JAMO_T_COUNT + t; 857 --prevSrc; // Replace the Jamo L as well. 858 if (prevBoundary != prevSrc) { 859 buffer.append(s, prevBoundary, prevSrc); 860 } 861 buffer.append((char)syllable); 862 prevBoundary = src; 863 continue; 864 } 865 // If we see L+V+x where x!=T then we drop to the slow path, 866 // decompose and recompose. 867 // This is to deal with NFKC finding normal L and V but a 868 // compatibility variant of a T. 869 // We need to either fully compose that combination here 870 // (which would complicate the code and may not work with strange custom data) 871 // or use the slow path. 872 } 873 } else if (Hangul.isHangulLV(prev)) { 874 // The current character is a Jamo Trailing consonant, 875 // compose with previous Hangul LV that does not contain a Jamo T. 876 if (!doCompose) { 877 return false; 878 } 879 int syllable = prev + c - Hangul.JAMO_T_BASE; 880 --prevSrc; // Replace the Hangul LV as well. 881 if (prevBoundary != prevSrc) { 882 buffer.append(s, prevBoundary, prevSrc); 883 } 884 buffer.append((char)syllable); 885 prevBoundary = src; 886 continue; 887 } 888 // No matching context, or may need to decompose surrounding text first: 889 // Fall through to the slow path. 890 } else if (norm16 > JAMO_VT) { // norm16 >= MIN_YES_YES_WITH_CC 891 // One or more combining marks that do not combine-back: 892 // Check for canonical order, copy unchanged if ok and 893 // if followed by a character with a boundary-before. 894 int cc = getCCFromNormalYesOrMaybe(norm16); // cc!=0 895 if (onlyContiguous /* FCC */ && getPreviousTrailCC(s, prevBoundary, prevSrc) > cc) { 896 // Fails FCD test, need to decompose and contiguously recompose. 897 if (!doCompose) { 898 return false; 899 } 900 } else { 901 // If !onlyContiguous (not FCC), then we ignore the tccc of 902 // the previous character which passed the quick check "yes && ccc==0" test. 903 int n16; 904 for (;;) { 905 if (src == limit) { 906 if (doCompose) { 907 buffer.append(s, prevBoundary, limit); 908 } 909 return true; 910 } 911 int prevCC = cc; 912 c = Character.codePointAt(s, src); 913 n16 = normTrie.get(c); 914 if (n16 >= MIN_YES_YES_WITH_CC) { 915 cc = getCCFromNormalYesOrMaybe(n16); 916 if (prevCC > cc) { 917 if (!doCompose) { 918 return false; 919 } 920 break; 921 } 922 } else { 923 break; 924 } 925 src += Character.charCount(c); 926 } 927 // p is after the last in-order combining mark. 928 // If there is a boundary here, then we continue with no change. 929 if (norm16HasCompBoundaryBefore(n16)) { 930 if (isCompYesAndZeroCC(n16)) { 931 src += Character.charCount(c); 932 } 933 continue; 934 } 935 // Use the slow path. There is no boundary in [prevSrc, src[. 936 } 937 } 938 939 // Slow path: Find the nearest boundaries around the current character, 940 // decompose and recompose. 941 if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) { 942 c = Character.codePointBefore(s, prevSrc); 943 norm16 = normTrie.get(c); 944 if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { 945 prevSrc -= Character.charCount(c); 946 } 947 } 948 if (doCompose && prevBoundary != prevSrc) { 949 buffer.append(s, prevBoundary, prevSrc); 950 } 951 int recomposeStartIndex=buffer.length(); 952 // We know there is not a boundary here. 953 decomposeShort(s, prevSrc, src, false /* !stopAtCompBoundary */, onlyContiguous, 954 buffer); 955 // Decompose until the next boundary. 956 src = decomposeShort(s, src, limit, true /* stopAtCompBoundary */, onlyContiguous, 957 buffer); 958 recompose(buffer, recomposeStartIndex, onlyContiguous); 959 if(!doCompose) { 960 if(!buffer.equals(s, prevSrc, src)) { 961 return false; 962 } 963 buffer.remove(); 964 } 965 prevBoundary=src; 966 } 967 } 968 969 /** 970 * Very similar to compose(): Make the same changes in both places if relevant. 971 * doSpan: spanQuickCheckYes (ignore bit 0 of the return value) 972 * !doSpan: quickCheck 973 * @return bits 31..1: spanQuickCheckYes (==s.length() if "yes") and 974 * bit 0: set if "maybe"; otherwise, if the span length<s.length() 975 * then the quick check result is "no" 976 */ composeQuickCheck(CharSequence s, int src, int limit, boolean onlyContiguous, boolean doSpan)977 public int composeQuickCheck(CharSequence s, int src, int limit, 978 boolean onlyContiguous, boolean doSpan) { 979 int qcResult=0; 980 int prevBoundary=src; 981 int minNoMaybeCP=minCompNoMaybeCP; 982 983 for(;;) { 984 // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point, 985 // or with (compYes && ccc==0) properties. 986 int prevSrc; 987 int c = 0; 988 int norm16 = 0; 989 for (;;) { 990 if(src==limit) { 991 return (src<<1)|qcResult; // "yes" or "maybe" 992 } 993 if( (c=s.charAt(src))<minNoMaybeCP || 994 isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c)) 995 ) { 996 ++src; 997 } else { 998 prevSrc = src++; 999 if(!UTF16.isSurrogate((char)c)) { 1000 break; 1001 } else { 1002 char c2; 1003 if(UTF16Plus.isSurrogateLead(c)) { 1004 if(src!=limit && Character.isLowSurrogate(c2=s.charAt(src))) { 1005 ++src; 1006 c=Character.toCodePoint((char)c, c2); 1007 } 1008 } else /* trail surrogate */ { 1009 if(prevBoundary<prevSrc && Character.isHighSurrogate(c2=s.charAt(prevSrc-1))) { 1010 --prevSrc; 1011 c=Character.toCodePoint(c2, (char)c); 1012 } 1013 } 1014 if(!isCompYesAndZeroCC(norm16=getNorm16(c))) { 1015 break; 1016 } 1017 } 1018 } 1019 } 1020 // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. 1021 // The current character is either a "noNo" (has a mapping) 1022 // or a "maybeYes" (combines backward) 1023 // or a "yesYes" with ccc!=0. 1024 // It is not a Hangul syllable or Jamo L because those have "yes" properties. 1025 1026 int prevNorm16 = INERT; 1027 if (prevBoundary != prevSrc) { 1028 prevBoundary = prevSrc; 1029 if (!norm16HasCompBoundaryBefore(norm16)) { 1030 c = Character.codePointBefore(s, prevSrc); 1031 int n16 = getNorm16(c); 1032 if (!norm16HasCompBoundaryAfter(n16, onlyContiguous)) { 1033 prevBoundary -= Character.charCount(c); 1034 prevNorm16 = n16; 1035 } 1036 } 1037 } 1038 1039 if(isMaybeOrNonZeroCC(norm16)) { 1040 int cc=getCCFromYesOrMaybe(norm16); 1041 if (onlyContiguous /* FCC */ && cc != 0 && 1042 getTrailCCFromCompYesAndZeroCC(prevNorm16) > cc) { 1043 // The [prevBoundary..prevSrc[ character 1044 // passed the quick check "yes && ccc==0" test 1045 // but is out of canonical order with the current combining mark. 1046 } else { 1047 // If !onlyContiguous (not FCC), then we ignore the tccc of 1048 // the previous character which passed the quick check "yes && ccc==0" test. 1049 for (;;) { 1050 if (norm16 < MIN_YES_YES_WITH_CC) { 1051 if (!doSpan) { 1052 qcResult = 1; 1053 } else { 1054 return prevBoundary << 1; // spanYes does not care to know it's "maybe" 1055 } 1056 } 1057 if (src == limit) { 1058 return (src<<1) | qcResult; // "yes" or "maybe" 1059 } 1060 int prevCC = cc; 1061 c = Character.codePointAt(s, src); 1062 norm16 = getNorm16(c); 1063 if (isMaybeOrNonZeroCC(norm16)) { 1064 cc = getCCFromYesOrMaybe(norm16); 1065 if (!(prevCC <= cc || cc == 0)) { 1066 break; 1067 } 1068 } else { 1069 break; 1070 } 1071 src += Character.charCount(c); 1072 } 1073 // src is after the last in-order combining mark. 1074 if (isCompYesAndZeroCC(norm16)) { 1075 prevBoundary = src; 1076 src += Character.charCount(c); 1077 continue; 1078 } 1079 } 1080 } 1081 return prevBoundary<<1; // "no" 1082 } 1083 } composeAndAppend(CharSequence s, boolean doCompose, boolean onlyContiguous, ReorderingBuffer buffer)1084 public void composeAndAppend(CharSequence s, 1085 boolean doCompose, 1086 boolean onlyContiguous, 1087 ReorderingBuffer buffer) { 1088 int src=0, limit=s.length(); 1089 if(!buffer.isEmpty()) { 1090 int firstStarterInSrc=findNextCompBoundary(s, 0, limit, onlyContiguous); 1091 if(0!=firstStarterInSrc) { 1092 int lastStarterInDest=findPreviousCompBoundary(buffer.getStringBuilder(), 1093 buffer.length(), onlyContiguous); 1094 StringBuilder middle=new StringBuilder((buffer.length()-lastStarterInDest)+ 1095 firstStarterInSrc+16); 1096 middle.append(buffer.getStringBuilder(), lastStarterInDest, buffer.length()); 1097 buffer.removeSuffix(buffer.length()-lastStarterInDest); 1098 middle.append(s, 0, firstStarterInSrc); 1099 compose(middle, 0, middle.length(), onlyContiguous, true, buffer); 1100 src=firstStarterInSrc; 1101 } 1102 } 1103 if(doCompose) { 1104 compose(s, src, limit, onlyContiguous, true, buffer); 1105 } else { 1106 buffer.append(s, src, limit); 1107 } 1108 } 1109 // Dual functionality: 1110 // buffer!=NULL: normalize 1111 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer)1112 public int makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer) { 1113 // Note: In this function we use buffer->appendZeroCC() because we track 1114 // the lead and trail combining classes here, rather than leaving it to 1115 // the ReorderingBuffer. 1116 // The exception is the call to decomposeShort() which uses the buffer 1117 // in the normal way. 1118 1119 // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1. 1120 // Similar to the prevBoundary in the compose() implementation. 1121 int prevBoundary=src; 1122 int prevSrc; 1123 int c=0; 1124 int prevFCD16=0; 1125 int fcd16=0; 1126 1127 for(;;) { 1128 // count code units with lccc==0 1129 for(prevSrc=src; src!=limit;) { 1130 if((c=s.charAt(src))<minLcccCP) { 1131 prevFCD16=~c; 1132 ++src; 1133 } else if(!singleLeadMightHaveNonZeroFCD16(c)) { 1134 prevFCD16=0; 1135 ++src; 1136 } else { 1137 if(UTF16.isSurrogate((char)c)) { 1138 char c2; 1139 if(UTF16Plus.isSurrogateLead(c)) { 1140 if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) { 1141 c=Character.toCodePoint((char)c, c2); 1142 } 1143 } else /* trail surrogate */ { 1144 if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) { 1145 --src; 1146 c=Character.toCodePoint(c2, (char)c); 1147 } 1148 } 1149 } 1150 if((fcd16=getFCD16FromNormData(c))<=0xff) { 1151 prevFCD16=fcd16; 1152 src+=Character.charCount(c); 1153 } else { 1154 break; 1155 } 1156 } 1157 } 1158 // copy these code units all at once 1159 if(src!=prevSrc) { 1160 if(src==limit) { 1161 if(buffer!=null) { 1162 buffer.flushAndAppendZeroCC(s, prevSrc, src); 1163 } 1164 break; 1165 } 1166 prevBoundary=src; 1167 // We know that the previous character's lccc==0. 1168 if(prevFCD16<0) { 1169 // Fetching the fcd16 value was deferred for this below-minLcccCP code point. 1170 int prev=~prevFCD16; 1171 if(prev<minDecompNoCP) { 1172 prevFCD16=0; 1173 } else { 1174 prevFCD16=getFCD16FromNormData(prev); 1175 if(prevFCD16>1) { 1176 --prevBoundary; 1177 } 1178 } 1179 } else { 1180 int p=src-1; 1181 if( Character.isLowSurrogate(s.charAt(p)) && prevSrc<p && 1182 Character.isHighSurrogate(s.charAt(p-1)) 1183 ) { 1184 --p; 1185 // Need to fetch the previous character's FCD value because 1186 // prevFCD16 was just for the trail surrogate code point. 1187 prevFCD16=getFCD16FromNormData(Character.toCodePoint(s.charAt(p), s.charAt(p+1))); 1188 // Still known to have lccc==0 because its lead surrogate unit had lccc==0. 1189 } 1190 if(prevFCD16>1) { 1191 prevBoundary=p; 1192 } 1193 } 1194 if(buffer!=null) { 1195 // The last lccc==0 character is excluded from the 1196 // flush-and-append call in case it needs to be modified. 1197 buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary); 1198 buffer.append(s, prevBoundary, src); 1199 } 1200 // The start of the current character (c). 1201 prevSrc=src; 1202 } else if(src==limit) { 1203 break; 1204 } 1205 1206 src+=Character.charCount(c); 1207 // The current character (c) at [prevSrc..src[ has a non-zero lead combining class. 1208 // Check for proper order, and decompose locally if necessary. 1209 if((prevFCD16&0xff)<=(fcd16>>8)) { 1210 // proper order: prev tccc <= current lccc 1211 if((fcd16&0xff)<=1) { 1212 prevBoundary=src; 1213 } 1214 if(buffer!=null) { 1215 buffer.appendZeroCC(c); 1216 } 1217 prevFCD16=fcd16; 1218 continue; 1219 } else if(buffer==null) { 1220 return prevBoundary; // quick check "no" 1221 } else { 1222 /* 1223 * Back out the part of the source that we copied or appended 1224 * already but is now going to be decomposed. 1225 * prevSrc is set to after what was copied/appended. 1226 */ 1227 buffer.removeSuffix(prevSrc-prevBoundary); 1228 /* 1229 * Find the part of the source that needs to be decomposed, 1230 * up to the next safe boundary. 1231 */ 1232 src=findNextFCDBoundary(s, src, limit); 1233 /* 1234 * The source text does not fulfill the conditions for FCD. 1235 * Decompose and reorder a limited piece of the text. 1236 */ 1237 decomposeShort(s, prevBoundary, src, false, false, buffer); 1238 prevBoundary=src; 1239 prevFCD16=0; 1240 } 1241 } 1242 return src; 1243 } 1244 hasDecompBoundaryBefore(int c)1245 public boolean hasDecompBoundaryBefore(int c) { 1246 return c < minLcccCP || (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) || 1247 norm16HasDecompBoundaryBefore(getNorm16(c)); 1248 } norm16HasDecompBoundaryBefore(int norm16)1249 public boolean norm16HasDecompBoundaryBefore(int norm16) { 1250 if (norm16 < minNoNoCompNoMaybeCC) { 1251 return true; 1252 } 1253 if (norm16 >= limitNoNo) { 1254 return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT; 1255 } 1256 // c decomposes, get everything from the variable-length extra data 1257 int mapping=norm16>>OFFSET_SHIFT; 1258 int firstUnit=extraData.charAt(mapping); 1259 // true if leadCC==0 (hasFCDBoundaryBefore()) 1260 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0; 1261 } hasDecompBoundaryAfter(int c)1262 public boolean hasDecompBoundaryAfter(int c) { 1263 if (c < minDecompNoCP) { 1264 return true; 1265 } 1266 if (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) { 1267 return true; 1268 } 1269 return norm16HasDecompBoundaryAfter(getNorm16(c)); 1270 } norm16HasDecompBoundaryAfter(int norm16)1271 public boolean norm16HasDecompBoundaryAfter(int norm16) { 1272 if(norm16 <= minYesNo || isHangulLVT(norm16)) { 1273 return true; 1274 } 1275 if (norm16 >= limitNoNo) { 1276 if (isMaybeOrNonZeroCC(norm16)) { 1277 return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT; 1278 } 1279 // Maps to an isCompYesAndZeroCC. 1280 return (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1; 1281 } 1282 // c decomposes, get everything from the variable-length extra data 1283 int mapping=norm16>>OFFSET_SHIFT; 1284 int firstUnit=extraData.charAt(mapping); 1285 // decomp after-boundary: same as hasFCDBoundaryAfter(), 1286 // fcd16<=1 || trailCC==0 1287 if(firstUnit>0x1ff) { 1288 return false; // trailCC>1 1289 } 1290 if(firstUnit<=0xff) { 1291 return true; // trailCC==0 1292 } 1293 // if(trailCC==1) test leadCC==0, same as checking for before-boundary 1294 // true if leadCC==0 (hasFCDBoundaryBefore()) 1295 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0; 1296 } isDecompInert(int c)1297 public boolean isDecompInert(int c) { return isDecompYesAndZeroCC(getNorm16(c)); } 1298 hasCompBoundaryBefore(int c)1299 public boolean hasCompBoundaryBefore(int c) { 1300 return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(getNorm16(c)); 1301 } hasCompBoundaryAfter(int c, boolean onlyContiguous)1302 public boolean hasCompBoundaryAfter(int c, boolean onlyContiguous) { 1303 return norm16HasCompBoundaryAfter(getNorm16(c), onlyContiguous); 1304 } 1305 isMaybe(int norm16)1306 private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; } isMaybeOrNonZeroCC(int norm16)1307 private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; } isInert(int norm16)1308 private static boolean isInert(int norm16) { return norm16==INERT; } isJamoVT(int norm16)1309 private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; } hangulLVT()1310 private int hangulLVT() { return minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER; } isHangulLV(int norm16)1311 private boolean isHangulLV(int norm16) { return norm16==minYesNo; } isHangulLVT(int norm16)1312 private boolean isHangulLVT(int norm16) { 1313 return norm16==hangulLVT(); 1314 } isCompYesAndZeroCC(int norm16)1315 private boolean isCompYesAndZeroCC(int norm16) { return norm16<minNoNo; } 1316 // UBool isCompYes(uint16_t norm16) const { 1317 // return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo; 1318 // } 1319 // UBool isCompYesOrMaybe(uint16_t norm16) const { 1320 // return norm16<minNoNo || minMaybeYes<=norm16; 1321 // } 1322 // private boolean hasZeroCCFromDecompYes(int norm16) { 1323 // return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT; 1324 // } isDecompYesAndZeroCC(int norm16)1325 private boolean isDecompYesAndZeroCC(int norm16) { 1326 return norm16<minYesNo || 1327 norm16==JAMO_VT || 1328 (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES); 1329 } 1330 /** 1331 * A little faster and simpler than isDecompYesAndZeroCC() but does not include 1332 * the MaybeYes which combine-forward and have ccc=0. 1333 * (Standard Unicode 10 normalization does not have such characters.) 1334 */ isMostDecompYesAndZeroCC(int norm16)1335 private boolean isMostDecompYesAndZeroCC(int norm16) { 1336 return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT; 1337 } isDecompNoAlgorithmic(int norm16)1338 private boolean isDecompNoAlgorithmic(int norm16) { return norm16>=limitNoNo; } 1339 1340 // For use with isCompYes(). 1341 // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC. 1342 // static uint8_t getCCFromYes(uint16_t norm16) { 1343 // return norm16>=MIN_YES_YES_WITH_CC ? getCCFromNormalYesOrMaybe(norm16) : 0; 1344 // } getCCFromNoNo(int norm16)1345 private int getCCFromNoNo(int norm16) { 1346 int mapping=norm16>>OFFSET_SHIFT; 1347 if((extraData.charAt(mapping)&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 1348 return extraData.charAt(mapping-1)&0xff; 1349 } else { 1350 return 0; 1351 } 1352 } getTrailCCFromCompYesAndZeroCC(int norm16)1353 int getTrailCCFromCompYesAndZeroCC(int norm16) { 1354 if(norm16<=minYesNo) { 1355 return 0; // yesYes and Hangul LV have ccc=tccc=0 1356 } else { 1357 // For Hangul LVT we harmlessly fetch a firstUnit with tccc=0 here. 1358 return extraData.charAt(norm16>>OFFSET_SHIFT)>>8; // tccc from yesNo 1359 } 1360 } 1361 1362 // Requires algorithmic-NoNo. mapAlgorithmic(int c, int norm16)1363 private int mapAlgorithmic(int c, int norm16) { 1364 return c+(norm16>>DELTA_SHIFT)-centerNoNoDelta; 1365 } 1366 1367 // Requires minYesNo<norm16<limitNoNo. 1368 // private int getMapping(int norm16) { return extraData+(norm16>>OFFSET_SHIFT); } 1369 1370 /** 1371 * @return index into maybeYesCompositions, or -1 1372 */ getCompositionsListForDecompYes(int norm16)1373 private int getCompositionsListForDecompYes(int norm16) { 1374 if(norm16<JAMO_L || MIN_NORMAL_MAYBE_YES<=norm16) { 1375 return -1; 1376 } else { 1377 if((norm16-=minMaybeYes)<0) { 1378 // norm16<minMaybeYes: index into extraData which is a substring at 1379 // maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes] 1380 // same as (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16 1381 norm16+=MIN_NORMAL_MAYBE_YES; // for yesYes; if Jamo L: harmless empty list 1382 } 1383 return norm16>>OFFSET_SHIFT; 1384 } 1385 } 1386 /** 1387 * @return index into maybeYesCompositions 1388 */ getCompositionsListForComposite(int norm16)1389 private int getCompositionsListForComposite(int norm16) { 1390 // A composite has both mapping & compositions list. 1391 int list=((MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16)>>OFFSET_SHIFT; 1392 int firstUnit=maybeYesCompositions.charAt(list); 1393 return list+ // mapping in maybeYesCompositions 1394 1+ // +1 to skip the first unit with the mapping length 1395 (firstUnit&MAPPING_LENGTH_MASK); // + mapping length 1396 } 1397 1398 // Decompose a short piece of text which is likely to contain characters that 1399 // fail the quick check loop and/or where the quick check loop's overhead 1400 // is unlikely to be amortized. 1401 // Called by the compose() and makeFCD() implementations. 1402 // Public in Java for collation implementation code. decomposeShort( CharSequence s, int src, int limit, boolean stopAtCompBoundary, boolean onlyContiguous, ReorderingBuffer buffer)1403 private int decomposeShort( 1404 CharSequence s, int src, int limit, 1405 boolean stopAtCompBoundary, boolean onlyContiguous, 1406 ReorderingBuffer buffer) { 1407 while(src<limit) { 1408 int c=Character.codePointAt(s, src); 1409 if (stopAtCompBoundary && c < minCompNoMaybeCP) { 1410 return src; 1411 } 1412 int norm16 = getNorm16(c); 1413 if (stopAtCompBoundary && norm16HasCompBoundaryBefore(norm16)) { 1414 return src; 1415 } 1416 src+=Character.charCount(c); 1417 decompose(c, norm16, buffer); 1418 if (stopAtCompBoundary && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { 1419 return src; 1420 } 1421 } 1422 return src; 1423 } decompose(int c, int norm16, ReorderingBuffer buffer)1424 private void decompose(int c, int norm16, ReorderingBuffer buffer) { 1425 // get the decomposition and the lead and trail cc's 1426 if (norm16 >= limitNoNo) { 1427 if (isMaybeOrNonZeroCC(norm16)) { 1428 buffer.append(c, getCCFromYesOrMaybe(norm16)); 1429 return; 1430 } 1431 // Maps to an isCompYesAndZeroCC. 1432 c=mapAlgorithmic(c, norm16); 1433 norm16=getNorm16(c); 1434 } 1435 if (norm16 < minYesNo) { 1436 // c does not decompose 1437 buffer.append(c, 0); 1438 } else if(isHangulLV(norm16) || isHangulLVT(norm16)) { 1439 // Hangul syllable: decompose algorithmically 1440 Hangul.decompose(c, buffer); 1441 } else { 1442 // c decomposes, get everything from the variable-length extra data 1443 int mapping=norm16>>OFFSET_SHIFT; 1444 int firstUnit=extraData.charAt(mapping); 1445 int length=firstUnit&MAPPING_LENGTH_MASK; 1446 int leadCC, trailCC; 1447 trailCC=firstUnit>>8; 1448 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 1449 leadCC=extraData.charAt(mapping-1)>>8; 1450 } else { 1451 leadCC=0; 1452 } 1453 ++mapping; // skip over the firstUnit 1454 buffer.append(extraData, mapping, mapping+length, leadCC, trailCC); 1455 } 1456 } 1457 1458 /** 1459 * Finds the recomposition result for 1460 * a forward-combining "lead" character, 1461 * specified with a pointer to its compositions list, 1462 * and a backward-combining "trail" character. 1463 * 1464 * <p>If the lead and trail characters combine, then this function returns 1465 * the following "compositeAndFwd" value: 1466 * <pre> 1467 * Bits 21..1 composite character 1468 * Bit 0 set if the composite is a forward-combining starter 1469 * </pre> 1470 * otherwise it returns -1. 1471 * 1472 * <p>The compositions list has (trail, compositeAndFwd) pair entries, 1473 * encoded as either pairs or triples of 16-bit units. 1474 * The last entry has the high bit of its first unit set. 1475 * 1476 * <p>The list is sorted by ascending trail characters (there are no duplicates). 1477 * A linear search is used. 1478 * 1479 * <p>See normalizer2impl.h for a more detailed description 1480 * of the compositions list format. 1481 */ combine(String compositions, int list, int trail)1482 private static int combine(String compositions, int list, int trail) { 1483 int key1, firstUnit; 1484 if(trail<COMP_1_TRAIL_LIMIT) { 1485 // trail character is 0..33FF 1486 // result entry may have 2 or 3 units 1487 key1=(trail<<1); 1488 while(key1>(firstUnit=compositions.charAt(list))) { 1489 list+=2+(firstUnit&COMP_1_TRIPLE); 1490 } 1491 if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { 1492 if((firstUnit&COMP_1_TRIPLE)!=0) { 1493 return (compositions.charAt(list+1)<<16)|compositions.charAt(list+2); 1494 } else { 1495 return compositions.charAt(list+1); 1496 } 1497 } 1498 } else { 1499 // trail character is 3400..10FFFF 1500 // result entry has 3 units 1501 key1=COMP_1_TRAIL_LIMIT+(((trail>>COMP_1_TRAIL_SHIFT))&~COMP_1_TRIPLE); 1502 int key2=(trail<<COMP_2_TRAIL_SHIFT)&0xffff; 1503 int secondUnit; 1504 for(;;) { 1505 if(key1>(firstUnit=compositions.charAt(list))) { 1506 list+=2+(firstUnit&COMP_1_TRIPLE); 1507 } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { 1508 if(key2>(secondUnit=compositions.charAt(list+1))) { 1509 if((firstUnit&COMP_1_LAST_TUPLE)!=0) { 1510 break; 1511 } else { 1512 list+=3; 1513 } 1514 } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) { 1515 return ((secondUnit&~COMP_2_TRAIL_MASK)<<16)|compositions.charAt(list+2); 1516 } else { 1517 break; 1518 } 1519 } else { 1520 break; 1521 } 1522 } 1523 } 1524 return -1; 1525 } 1526 1527 /* 1528 * Recomposes the buffer text starting at recomposeStartIndex 1529 * (which is in NFD - decomposed and canonically ordered), 1530 * and truncates the buffer contents. 1531 * 1532 * Note that recomposition never lengthens the text: 1533 * Any character consists of either one or two code units; 1534 * a composition may contain at most one more code unit than the original starter, 1535 * while the combining mark that is removed has at least one code unit. 1536 */ recompose(ReorderingBuffer buffer, int recomposeStartIndex, boolean onlyContiguous)1537 private void recompose(ReorderingBuffer buffer, int recomposeStartIndex, 1538 boolean onlyContiguous) { 1539 StringBuilder sb=buffer.getStringBuilder(); 1540 int p=recomposeStartIndex; 1541 if(p==sb.length()) { 1542 return; 1543 } 1544 1545 int starter, pRemove; 1546 int compositionsList; 1547 int c, compositeAndFwd; 1548 int norm16; 1549 int cc, prevCC; 1550 boolean starterIsSupplementary; 1551 1552 // Some of the following variables are not used until we have a forward-combining starter 1553 // and are only initialized now to avoid compiler warnings. 1554 compositionsList=-1; // used as indicator for whether we have a forward-combining starter 1555 starter=-1; 1556 starterIsSupplementary=false; 1557 prevCC=0; 1558 1559 for(;;) { 1560 c=sb.codePointAt(p); 1561 p+=Character.charCount(c); 1562 norm16=getNorm16(c); 1563 cc=getCCFromYesOrMaybe(norm16); 1564 if( // this character combines backward and 1565 isMaybe(norm16) && 1566 // we have seen a starter that combines forward and 1567 compositionsList>=0 && 1568 // the backward-combining character is not blocked 1569 (prevCC<cc || prevCC==0) 1570 ) { 1571 if(isJamoVT(norm16)) { 1572 // c is a Jamo V/T, see if we can compose it with the previous character. 1573 if(c<Hangul.JAMO_T_BASE) { 1574 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T. 1575 char prev=(char)(sb.charAt(starter)-Hangul.JAMO_L_BASE); 1576 if(prev<Hangul.JAMO_L_COUNT) { 1577 pRemove=p-1; 1578 char syllable=(char) 1579 (Hangul.HANGUL_BASE+ 1580 (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))* 1581 Hangul.JAMO_T_COUNT); 1582 char t; 1583 if(p!=sb.length() && (t=(char)(sb.charAt(p)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) { 1584 ++p; 1585 syllable+=t; // The next character was a Jamo T. 1586 } 1587 sb.setCharAt(starter, syllable); 1588 // remove the Jamo V/T 1589 sb.delete(pRemove, p); 1590 p=pRemove; 1591 } 1592 } 1593 /* 1594 * No "else" for Jamo T: 1595 * Since the input is in NFD, there are no Hangul LV syllables that 1596 * a Jamo T could combine with. 1597 * All Jamo Ts are combined above when handling Jamo Vs. 1598 */ 1599 if(p==sb.length()) { 1600 break; 1601 } 1602 compositionsList=-1; 1603 continue; 1604 } else if((compositeAndFwd=combine(maybeYesCompositions, compositionsList, c))>=0) { 1605 // The starter and the combining mark (c) do combine. 1606 int composite=compositeAndFwd>>1; 1607 1608 // Remove the combining mark. 1609 pRemove=p-Character.charCount(c); // pRemove & p: start & limit of the combining mark 1610 sb.delete(pRemove, p); 1611 p=pRemove; 1612 // Replace the starter with the composite. 1613 if(starterIsSupplementary) { 1614 if(composite>0xffff) { 1615 // both are supplementary 1616 sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); 1617 sb.setCharAt(starter+1, UTF16.getTrailSurrogate(composite)); 1618 } else { 1619 sb.setCharAt(starter, (char)c); 1620 sb.deleteCharAt(starter+1); 1621 // The composite is shorter than the starter, 1622 // move the intermediate characters forward one. 1623 starterIsSupplementary=false; 1624 --p; 1625 } 1626 } else if(composite>0xffff) { 1627 // The composite is longer than the starter, 1628 // move the intermediate characters back one. 1629 starterIsSupplementary=true; 1630 sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); 1631 sb.insert(starter+1, UTF16.getTrailSurrogate(composite)); 1632 ++p; 1633 } else { 1634 // both are on the BMP 1635 sb.setCharAt(starter, (char)composite); 1636 } 1637 1638 // Keep prevCC because we removed the combining mark. 1639 1640 if(p==sb.length()) { 1641 break; 1642 } 1643 // Is the composite a starter that combines forward? 1644 if((compositeAndFwd&1)!=0) { 1645 compositionsList= 1646 getCompositionsListForComposite(getNorm16(composite)); 1647 } else { 1648 compositionsList=-1; 1649 } 1650 1651 // We combined; continue with looking for compositions. 1652 continue; 1653 } 1654 } 1655 1656 // no combination this time 1657 prevCC=cc; 1658 if(p==sb.length()) { 1659 break; 1660 } 1661 1662 // If c did not combine, then check if it is a starter. 1663 if(cc==0) { 1664 // Found a new starter. 1665 if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) { 1666 // It may combine with something, prepare for it. 1667 if(c<=0xffff) { 1668 starterIsSupplementary=false; 1669 starter=p-1; 1670 } else { 1671 starterIsSupplementary=true; 1672 starter=p-2; 1673 } 1674 } 1675 } else if(onlyContiguous) { 1676 // FCC: no discontiguous compositions; any intervening character blocks. 1677 compositionsList=-1; 1678 } 1679 } 1680 buffer.flush(); 1681 } 1682 1683 /** 1684 * Does c have a composition boundary before it? 1685 * True if its decomposition begins with a character that has 1686 * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()). 1687 * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes 1688 * (isCompYesAndZeroCC()) so we need not decompose. 1689 */ hasCompBoundaryBefore(int c, int norm16)1690 private boolean hasCompBoundaryBefore(int c, int norm16) { 1691 return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(norm16); 1692 } norm16HasCompBoundaryBefore(int norm16)1693 private boolean norm16HasCompBoundaryBefore(int norm16) { 1694 return norm16 < minNoNoCompNoMaybeCC || isAlgorithmicNoNo(norm16); 1695 } hasCompBoundaryBefore(CharSequence s, int src, int limit)1696 private boolean hasCompBoundaryBefore(CharSequence s, int src, int limit) { 1697 return src == limit || hasCompBoundaryBefore(Character.codePointAt(s, src)); 1698 } norm16HasCompBoundaryAfter(int norm16, boolean onlyContiguous)1699 private boolean norm16HasCompBoundaryAfter(int norm16, boolean onlyContiguous) { 1700 return (norm16 & HAS_COMP_BOUNDARY_AFTER) != 0 && 1701 (!onlyContiguous || isTrailCC01ForCompBoundaryAfter(norm16)); 1702 } hasCompBoundaryAfter(CharSequence s, int start, int p, boolean onlyContiguous)1703 private boolean hasCompBoundaryAfter(CharSequence s, int start, int p, boolean onlyContiguous) { 1704 return start == p || hasCompBoundaryAfter(Character.codePointBefore(s, p), onlyContiguous); 1705 } 1706 /** For FCC: Given norm16 HAS_COMP_BOUNDARY_AFTER, does it have tccc<=1? */ isTrailCC01ForCompBoundaryAfter(int norm16)1707 private boolean isTrailCC01ForCompBoundaryAfter(int norm16) { 1708 return isInert(norm16) || (isDecompNoAlgorithmic(norm16) ? 1709 (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1 : extraData.charAt(norm16 >> OFFSET_SHIFT) <= 0x1ff); 1710 } 1711 findPreviousCompBoundary(CharSequence s, int p, boolean onlyContiguous)1712 private int findPreviousCompBoundary(CharSequence s, int p, boolean onlyContiguous) { 1713 while(p>0) { 1714 int c=Character.codePointBefore(s, p); 1715 int norm16 = getNorm16(c); 1716 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { 1717 break; 1718 } 1719 p-=Character.charCount(c); 1720 if(hasCompBoundaryBefore(c, norm16)) { 1721 break; 1722 } 1723 } 1724 return p; 1725 } findNextCompBoundary(CharSequence s, int p, int limit, boolean onlyContiguous)1726 private int findNextCompBoundary(CharSequence s, int p, int limit, boolean onlyContiguous) { 1727 while(p<limit) { 1728 int c=Character.codePointAt(s, p); 1729 int norm16=normTrie.get(c); 1730 if(hasCompBoundaryBefore(c, norm16)) { 1731 break; 1732 } 1733 p+=Character.charCount(c); 1734 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { 1735 break; 1736 } 1737 } 1738 return p; 1739 } 1740 1741 findNextFCDBoundary(CharSequence s, int p, int limit)1742 private int findNextFCDBoundary(CharSequence s, int p, int limit) { 1743 while(p<limit) { 1744 int c=Character.codePointAt(s, p); 1745 int norm16; 1746 if (c < minLcccCP || norm16HasDecompBoundaryBefore(norm16 = getNorm16(c))) { 1747 break; 1748 } 1749 p+=Character.charCount(c); 1750 if (norm16HasDecompBoundaryAfter(norm16)) { 1751 break; 1752 } 1753 } 1754 return p; 1755 } 1756 1757 /** 1758 * Get the canonical decomposition 1759 * sherman for ComposedCharIter 1760 */ getDecompose(int chars[], String decomps[])1761 public static int getDecompose(int chars[], String decomps[]) { 1762 Normalizer2 impl = Normalizer2.getNFDInstance(); 1763 1764 int length=0; 1765 int norm16 = 0; 1766 int ch = -1; 1767 int i = 0; 1768 1769 while (++ch < 0x2fa1e) { //no cannoical above 0x3ffff 1770 //TBD !!!! the hack code heres save us about 50ms for startup 1771 //need a better solution/lookup 1772 if (ch == 0x30ff) 1773 ch = 0xf900; 1774 else if (ch == 0x115bc) 1775 ch = 0x1d15e; 1776 else if (ch == 0x1d1c1) 1777 ch = 0x2f800; 1778 1779 String s = impl.getDecomposition(ch); 1780 1781 if(s != null && i < chars.length) { 1782 chars[i] = ch; 1783 decomps[i++] = s; 1784 } 1785 } 1786 return i; 1787 } 1788 1789 //------------------------------------------------------ 1790 // special method for Collation (RBTableBuilder.build()) 1791 //------------------------------------------------------ needSingleQuotation(char c)1792 private static boolean needSingleQuotation(char c) { 1793 return (c >= 0x0009 && c <= 0x000D) || 1794 (c >= 0x0020 && c <= 0x002F) || 1795 (c >= 0x003A && c <= 0x0040) || 1796 (c >= 0x005B && c <= 0x0060) || 1797 (c >= 0x007B && c <= 0x007E); 1798 } 1799 canonicalDecomposeWithSingleQuotation(String string)1800 public static String canonicalDecomposeWithSingleQuotation(String string) { 1801 Normalizer2 impl = Normalizer2.getNFDInstance(); 1802 char[] src = string.toCharArray(); 1803 int srcIndex = 0; 1804 int srcLimit = src.length; 1805 char[] dest = new char[src.length * 3]; //MAX_BUF_SIZE_DECOMPOSE = 3 1806 int destIndex = 0; 1807 int destLimit = dest.length; 1808 1809 int prevSrc; 1810 String norm; 1811 int reorderStartIndex, length; 1812 char c1, c2; 1813 int cp; 1814 int minNoMaybe = 0x00c0; 1815 int cc, prevCC, trailCC; 1816 char[] p; 1817 int pStart; 1818 1819 // initialize 1820 reorderStartIndex = 0; 1821 prevCC = 0; 1822 norm = null; 1823 cp = 0; 1824 pStart = 0; 1825 1826 cc = trailCC = -1; // initialize to bogus value 1827 c1 = 0; 1828 for (;;) { 1829 prevSrc=srcIndex; 1830 //quick check (1)less than minNoMaybe (2)no decomp (3)hangual 1831 while (srcIndex != srcLimit && 1832 ((c1 = src[srcIndex]) < minNoMaybe || 1833 (norm = impl.getDecomposition(cp = string.codePointAt(srcIndex))) == null || 1834 (c1 >= '\uac00' && c1 <= '\ud7a3'))) { // Hangul Syllables 1835 prevCC = 0; 1836 srcIndex += (cp < 0x10000) ? 1 : 2; 1837 } 1838 1839 // copy these code units all at once 1840 if (srcIndex != prevSrc) { 1841 length = srcIndex - prevSrc; 1842 if ((destIndex + length) <= destLimit) { 1843 System.arraycopy(src,prevSrc,dest,destIndex,length); 1844 } 1845 1846 destIndex += length; 1847 reorderStartIndex = destIndex; 1848 } 1849 1850 // end of source reached? 1851 if (srcIndex == srcLimit) { 1852 break; 1853 } 1854 1855 // cp already contains *src and norm32 is set for it, increment src 1856 srcIndex += (cp < 0x10000) ? 1 : 2; 1857 1858 if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) { 1859 c2 = 0; 1860 length = 1; 1861 1862 if (Character.isHighSurrogate(c1) 1863 || Character.isLowSurrogate(c1)) { 1864 norm = null; 1865 } 1866 } else { 1867 length = 2; 1868 c2 = src[srcIndex-1]; 1869 } 1870 1871 // get the decomposition and the lead and trail cc's 1872 if (norm == null) { 1873 // cp does not decompose 1874 cc = trailCC = UCharacter.getCombiningClass(cp); 1875 p = null; 1876 pStart = -1; 1877 } else { 1878 1879 pStart = 0; 1880 p = norm.toCharArray(); 1881 length = p.length; 1882 int cpNum = norm.codePointCount(0, length); 1883 cc= UCharacter.getCombiningClass(norm.codePointAt(0)); 1884 trailCC= UCharacter.getCombiningClass(norm.codePointAt(cpNum-1)); 1885 if (length == 1) { 1886 // fastpath a single code unit from decomposition 1887 c1 = p[pStart]; 1888 c2 = 0; 1889 p = null; 1890 pStart = -1; 1891 } 1892 } 1893 1894 if((destIndex + length * 3) >= destLimit) { // 2 SingleQuotations 1895 // buffer overflow 1896 char[] tmpBuf = new char[destLimit * 2]; 1897 System.arraycopy(dest, 0, tmpBuf, 0, destIndex); 1898 dest = tmpBuf; 1899 destLimit = dest.length; 1900 } 1901 1902 // append the decomposition to the destination buffer, assume length>0 1903 { 1904 int reorderSplit = destIndex; 1905 if (p == null) { 1906 // fastpath: single code point 1907 if (needSingleQuotation(c1)) { 1908 //if we need single quotation, no need to consider "prevCC" 1909 //and it must NOT be a supplementary pair 1910 dest[destIndex++] = '\''; 1911 dest[destIndex++] = c1; 1912 dest[destIndex++] = '\''; 1913 trailCC = 0; 1914 } else if(cc != 0 && cc < prevCC) { 1915 // (c1, c2) is out of order with respect to the preceding 1916 // text 1917 destIndex += length; 1918 trailCC = insertOrdered(dest, reorderStartIndex, 1919 reorderSplit, destIndex, c1, c2, cc); 1920 } else { 1921 // just append (c1, c2) 1922 dest[destIndex++] = c1; 1923 if(c2 != 0) { 1924 dest[destIndex++] = c2; 1925 } 1926 } 1927 } else { 1928 // general: multiple code points (ordered by themselves) 1929 // from decomposition 1930 if (needSingleQuotation(p[pStart])) { 1931 dest[destIndex++] = '\''; 1932 dest[destIndex++] = p[pStart++]; 1933 dest[destIndex++] = '\''; 1934 length--; 1935 do { 1936 dest[destIndex++] = p[pStart++]; 1937 } while(--length > 0); 1938 } else if (cc != 0 && cc < prevCC) { 1939 destIndex += length; 1940 trailCC = mergeOrdered(dest, reorderStartIndex, 1941 reorderSplit, p, pStart, 1942 pStart+length); 1943 } else { 1944 // just append the decomposition 1945 do { 1946 dest[destIndex++] = p[pStart++]; 1947 } while (--length > 0); 1948 } 1949 } 1950 } 1951 prevCC = trailCC; 1952 if(prevCC == 0) { 1953 reorderStartIndex = destIndex; 1954 } 1955 } 1956 1957 return new String(dest, 0, destIndex); 1958 } 1959 1960 /** 1961 * simpler, single-character version of mergeOrdered() - 1962 * bubble-insert one single code point into the preceding string 1963 * which is already canonically ordered 1964 * (c, c2) may or may not yet have been inserted at src[current]..src[p] 1965 * 1966 * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2) 1967 * 1968 * before: src[start]..src[current] is already ordered, and 1969 * src[current]..src[p] may or may not hold (c, c2) but 1970 * must be exactly the same length as (c, c2) 1971 * after: src[start]..src[p] is ordered 1972 * 1973 * @return the trailing combining class 1974 */ insertOrdered(char[] source, int start, int current, int p, char c1, char c2, int cc)1975 private static int/*unsigned byte*/ insertOrdered(char[] source, 1976 int start, 1977 int current, int p, 1978 char c1, char c2, 1979 int/*unsigned byte*/ cc) { 1980 int back, preBack; 1981 int r; 1982 int prevCC, trailCC=cc; 1983 1984 if (start<current && cc!=0) { 1985 // search for the insertion point where cc>=prevCC 1986 preBack=back=current; 1987 1988 PrevArgs prevArgs = new PrevArgs(); 1989 prevArgs.current = current; 1990 prevArgs.start = start; 1991 prevArgs.src = source; 1992 prevArgs.c1 = c1; 1993 prevArgs.c2 = c2; 1994 1995 // get the prevCC 1996 prevCC=getPrevCC(prevArgs); 1997 preBack = prevArgs.current; 1998 1999 if(cc<prevCC) { 2000 // this will be the last code point, so keep its cc 2001 trailCC=prevCC; 2002 back=preBack; 2003 while(start<preBack) { 2004 prevCC=getPrevCC(prevArgs); 2005 preBack=prevArgs.current; 2006 if(cc>=prevCC) { 2007 break; 2008 } 2009 back=preBack; 2010 } 2011 2012 // this is where we are right now with all these indicies: 2013 // [start]..[pPreBack] 0..? code points that we can ignore 2014 // [pPreBack]..[pBack] 0..1 code points with prevCC<=cc 2015 // [pBack]..[current] 0..n code points with >cc, move up to insert (c, c2) 2016 // [current]..[p] 1 code point (c, c2) with cc 2017 2018 // move the code units in between up 2019 r=p; 2020 do { 2021 source[--r]=source[--current]; 2022 } while (back!=current); 2023 } 2024 } 2025 2026 // insert (c1, c2) 2027 source[current] = c1; 2028 if (c2!=0) { 2029 source[(current+1)] = c2; 2030 } 2031 2032 // we know the cc of the last code point 2033 return trailCC; 2034 } 2035 /** 2036 * merge two UTF-16 string parts together 2037 * to canonically order (order by combining classes) their concatenation 2038 * 2039 * the two strings may already be adjacent, so that the merging is done 2040 * in-place if the two strings are not adjacent, then the buffer holding the 2041 * first one must be large enough 2042 * the second string may or may not be ordered in itself 2043 * 2044 * before: [start]..[current] is already ordered, and 2045 * [next]..[limit] may be ordered in itself, but 2046 * is not in relation to [start..current[ 2047 * after: [start..current+(limit-next)[ is ordered 2048 * 2049 * the algorithm is a simple bubble-sort that takes the characters from 2050 * src[next++] and inserts them in correct combining class order into the 2051 * preceding part of the string 2052 * 2053 * since this function is called much less often than the single-code point 2054 * insertOrdered(), it just uses that for easier maintenance 2055 * 2056 * @return the trailing combining class 2057 */ mergeOrdered(char[] source, int start, int current, char[] data, int next, int limit)2058 private static int /*unsigned byte*/ mergeOrdered(char[] source, 2059 int start, 2060 int current, 2061 char[] data, 2062 int next, 2063 int limit) { 2064 int r; 2065 int /*unsigned byte*/ cc, trailCC=0; 2066 boolean adjacent; 2067 2068 adjacent= current==next; 2069 NextCCArgs ncArgs = new NextCCArgs(); 2070 ncArgs.source = data; 2071 ncArgs.next = next; 2072 ncArgs.limit = limit; 2073 2074 if(start!=current) { 2075 2076 while(ncArgs.next<ncArgs.limit) { 2077 cc=getNextCC(ncArgs); 2078 if(cc==0) { 2079 // does not bubble back 2080 trailCC=0; 2081 if(adjacent) { 2082 current=ncArgs.next; 2083 } else { 2084 data[current++]=ncArgs.c1; 2085 if(ncArgs.c2!=0) { 2086 data[current++]=ncArgs.c2; 2087 } 2088 } 2089 break; 2090 } else { 2091 r=current+(ncArgs.c2==0 ? 1 : 2); 2092 trailCC=insertOrdered(source,start, current, r, 2093 ncArgs.c1, ncArgs.c2, cc); 2094 current=r; 2095 } 2096 } 2097 } 2098 2099 if(ncArgs.next==ncArgs.limit) { 2100 // we know the cc of the last code point 2101 return trailCC; 2102 } else { 2103 if(!adjacent) { 2104 // copy the second string part 2105 do { 2106 source[current++]=data[ncArgs.next++]; 2107 } while(ncArgs.next!=ncArgs.limit); 2108 ncArgs.limit=current; 2109 } 2110 PrevArgs prevArgs = new PrevArgs(); 2111 prevArgs.src = data; 2112 prevArgs.start = start; 2113 prevArgs.current = ncArgs.limit; 2114 return getPrevCC(prevArgs); 2115 } 2116 2117 } 2118 private static final class PrevArgs{ 2119 char[] src; 2120 int start; 2121 int current; 2122 char c1; 2123 char c2; 2124 } 2125 2126 private static final class NextCCArgs{ 2127 char[] source; 2128 int next; 2129 int limit; 2130 char c1; 2131 char c2; 2132 } getNextCC(NextCCArgs args)2133 private static int /*unsigned byte*/ getNextCC(NextCCArgs args) { 2134 args.c1=args.source[args.next++]; 2135 args.c2=0; 2136 2137 if (UTF16.isTrailSurrogate(args.c1)) { 2138 /* unpaired second surrogate */ 2139 return 0; 2140 } else if (!UTF16.isLeadSurrogate(args.c1)) { 2141 return UCharacter.getCombiningClass(args.c1); 2142 } else if (args.next!=args.limit && 2143 UTF16.isTrailSurrogate(args.c2=args.source[args.next])){ 2144 ++args.next; 2145 return UCharacter.getCombiningClass(Character.toCodePoint(args.c1, args.c2)); 2146 } else { 2147 /* unpaired first surrogate */ 2148 args.c2=0; 2149 return 0; 2150 } 2151 } getPrevCC(PrevArgs args)2152 private static int /*unsigned*/ getPrevCC(PrevArgs args) { 2153 args.c1=args.src[--args.current]; 2154 args.c2=0; 2155 2156 if (args.c1 < MIN_CCC_LCCC_CP) { 2157 return 0; 2158 } else if (UTF16.isLeadSurrogate(args.c1)) { 2159 /* unpaired first surrogate */ 2160 return 0; 2161 } else if (!UTF16.isTrailSurrogate(args.c1)) { 2162 return UCharacter.getCombiningClass(args.c1); 2163 } else if (args.current!=args.start && 2164 UTF16.isLeadSurrogate(args.c2=args.src[args.current-1])) { 2165 --args.current; 2166 return UCharacter.getCombiningClass(Character.toCodePoint(args.c2, args.c1)); 2167 } else { 2168 /* unpaired second surrogate */ 2169 args.c2=0; 2170 return 0; 2171 } 2172 } 2173 getPreviousTrailCC(CharSequence s, int start, int p)2174 private int getPreviousTrailCC(CharSequence s, int start, int p) { 2175 if (start == p) { 2176 return 0; 2177 } 2178 return getFCD16(Character.codePointBefore(s, p)); 2179 } 2180 2181 private VersionInfo dataVersion; 2182 2183 // BMP code point thresholds for quick check loops looking at single UTF-16 code units. 2184 private int minDecompNoCP; 2185 private int minCompNoMaybeCP; 2186 private int minLcccCP; 2187 2188 // Norm16 value thresholds for quick check combinations and types of extra data. 2189 private int minYesNo; 2190 private int minYesNoMappingsOnly; 2191 private int minNoNo; 2192 private int minNoNoCompBoundaryBefore; 2193 private int minNoNoCompNoMaybeCC; 2194 private int minNoNoEmpty; 2195 private int limitNoNo; 2196 private int centerNoNoDelta; 2197 private int minMaybeYes; 2198 2199 private Trie2_16 normTrie; 2200 private String maybeYesCompositions; 2201 private String extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters 2202 private byte[] smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0 2203 2204 } 2205