1 /*
2  * Copyright (c) 2009, 2018, Oracle and/or its affiliates. All rights reserved.
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * This code is free software; you can redistribute it and/or modify it
6  * under the terms of the GNU General Public License version 2 only, as
7  * published by the Free Software Foundation.  Oracle designates this
8  * particular file as subject to the "Classpath" exception as provided
9  * by Oracle in the LICENSE file that accompanied this code.
10  *
11  * This code is distributed in the hope that it will be useful, but WITHOUT
12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14  * version 2 for more details (a copy is included in the LICENSE file that
15  * accompanied this code).
16  *
17  * You should have received a copy of the GNU General Public License version
18  * 2 along with this work; if not, write to the Free Software Foundation,
19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20  *
21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22  * or visit www.oracle.com if you need additional information or have any
23  * questions.
24  */
25 
26 /*
27  *******************************************************************************
28  *   Copyright (C) 2009-2014, International Business Machines
29  *   Corporation and others.  All Rights Reserved.
30  *******************************************************************************
31  */
32 package sun.text.normalizer;
33 
34 import java.io.IOException;
35 import java.nio.ByteBuffer;
36 import java.text.Normalizer;
37 
38 // Original filename in ICU4J: Normalizer2Impl.java
39 public final class NormalizerImpl {
40     public static final class Hangul {
41         /* Korean Hangul and Jamo constants */
42         public static final int JAMO_L_BASE=0x1100;     /* "lead" jamo */
43         public static final int JAMO_V_BASE=0x1161;     /* "vowel" jamo */
44         public static final int JAMO_T_BASE=0x11a7;     /* "trail" jamo */
45 
46         public static final int HANGUL_BASE=0xac00;
47         public static final int HANGUL_END=0xd7a3;
48 
49         public static final int JAMO_L_COUNT=19;
50         public static final int JAMO_V_COUNT=21;
51         public static final int JAMO_T_COUNT=28;
52 
53         public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT;
54         public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT;
55 
isHangul(int c)56         public static boolean isHangul(int c) {
57             return HANGUL_BASE<=c && c<HANGUL_LIMIT;
58         }
isHangulLV(int c)59         public static boolean isHangulLV(int c) {
60             c-=HANGUL_BASE;
61             return 0<=c && c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
62         }
63 
64                 /**
65          * Decomposes c, which must be a Hangul syllable, into buffer
66          * and returns the length of the decomposition (2 or 3).
67          */
decompose(int c, Appendable buffer)68         public static int decompose(int c, Appendable buffer) {
69             try {
70                 c-=HANGUL_BASE;
71                 int c2=c%JAMO_T_COUNT;
72                 c/=JAMO_T_COUNT;
73                 buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT));
74                 buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT));
75                 if(c2==0) {
76                     return 2;
77                 } else {
78                     buffer.append((char)(JAMO_T_BASE+c2));
79                     return 3;
80                 }
81             } catch(IOException e) {
82                 throw new InternalError(e);
83             }
84         }
85     }
86 
87     /**
88      * Writable buffer that takes care of canonical ordering.
89      * Its Appendable methods behave like the C++ implementation's
90      * appendZeroCC() methods.
91      * <p>
92      * If dest is a StringBuilder, then the buffer writes directly to it.
93      * Otherwise, the buffer maintains a StringBuilder for intermediate text segments
94      * until no further changes are necessary and whole segments are appended.
95      * append() methods that take combining-class values always write to the StringBuilder.
96      * Other append() methods flush and append to the Appendable.
97      */
98     public static final class ReorderingBuffer implements Appendable {
ReorderingBuffer(NormalizerImpl ni, Appendable dest, int destCapacity)99         public ReorderingBuffer(NormalizerImpl ni, Appendable dest, int destCapacity) {
100             impl=ni;
101             app=dest;
102             if (app instanceof StringBuilder) {
103                 appIsStringBuilder=true;
104                 str=(StringBuilder)dest;
105                 // In Java, the constructor subsumes public void init(int destCapacity)
106                 str.ensureCapacity(destCapacity);
107                 reorderStart=0;
108                 if(str.length()==0) {
109                     lastCC=0;
110                 } else {
111                     setIterator();
112                     lastCC=previousCC();
113                     // Set reorderStart after the last code point with cc<=1 if there is one.
114                     if(lastCC>1) {
115                         while(previousCC()>1) {}
116                     }
117                     reorderStart=codePointLimit;
118                 }
119             } else {
120                 appIsStringBuilder=false;
121                 str=new StringBuilder();
122                 reorderStart=0;
123                 lastCC=0;
124             }
125         }
126 
isEmpty()127         public boolean isEmpty() { return str.length()==0; }
length()128         public int length() { return str.length(); }
getLastCC()129         public int getLastCC() { return lastCC; }
130 
getStringBuilder()131         public StringBuilder getStringBuilder() { return str; }
132 
equals(CharSequence s, int start, int limit)133         public boolean equals(CharSequence s, int start, int limit) {
134             return UTF16Plus.equal(str, 0, str.length(), s, start, limit);
135         }
136 
append(int c, int cc)137         public void append(int c, int cc) {
138             if(lastCC<=cc || cc==0) {
139                 str.appendCodePoint(c);
140                 lastCC=cc;
141                 if(cc<=1) {
142                     reorderStart=str.length();
143                 }
144             } else {
145                 insert(c, cc);
146             }
147         }
148         // s must be in NFD, otherwise change the implementation.
append(CharSequence s, int start, int limit, int leadCC, int trailCC)149         public void append(CharSequence s, int start, int limit,
150                            int leadCC, int trailCC) {
151             if(start==limit) {
152                 return;
153             }
154             if(lastCC<=leadCC || leadCC==0) {
155                 if(trailCC<=1) {
156                     reorderStart=str.length()+(limit-start);
157                 } else if(leadCC<=1) {
158                     reorderStart=str.length()+1;  // Ok if not a code point boundary.
159                 }
160                 str.append(s, start, limit);
161                 lastCC=trailCC;
162             } else {
163                 int c=Character.codePointAt(s, start);
164                 start+=Character.charCount(c);
165                 insert(c, leadCC);  // insert first code point
166                 while(start<limit) {
167                     c=Character.codePointAt(s, start);
168                     start+=Character.charCount(c);
169                     if(start<limit) {
170                         // s must be in NFD, otherwise we need to use getCC().
171                         leadCC=getCCFromYesOrMaybe(impl.getNorm16(c));
172                     } else {
173                         leadCC=trailCC;
174                     }
175                     append(c, leadCC);
176                 }
177             }
178         }
179         // The following append() methods work like C++ appendZeroCC().
180         // They assume that the cc or trailCC of their input is 0.
181         // Most of them implement Appendable interface methods.
182         @Override
append(char c)183         public ReorderingBuffer append(char c) {
184             str.append(c);
185             lastCC=0;
186             reorderStart=str.length();
187             return this;
188         }
appendZeroCC(int c)189         public void appendZeroCC(int c) {
190             str.appendCodePoint(c);
191             lastCC=0;
192             reorderStart=str.length();
193         }
194         @Override
append(CharSequence s)195         public ReorderingBuffer append(CharSequence s) {
196             if(s.length()!=0) {
197                 str.append(s);
198                 lastCC=0;
199                 reorderStart=str.length();
200             }
201             return this;
202         }
203         @Override
append(CharSequence s, int start, int limit)204         public ReorderingBuffer append(CharSequence s, int start, int limit) {
205             if(start!=limit) {
206                 str.append(s, start, limit);
207                 lastCC=0;
208                 reorderStart=str.length();
209             }
210             return this;
211         }
212         /**
213          * Flushes from the intermediate StringBuilder to the Appendable,
214          * if they are different objects.
215          * Used after recomposition.
216          * Must be called at the end when writing to a non-StringBuilder Appendable.
217          */
flush()218         public void flush() {
219             if(appIsStringBuilder) {
220                 reorderStart=str.length();
221             } else {
222                 try {
223                     app.append(str);
224                     str.setLength(0);
225                     reorderStart=0;
226                 } catch(IOException e) {
227                     throw new InternalError(e);  // Avoid declaring "throws IOException".
228                 }
229             }
230             lastCC=0;
231         }
232         /**
233          * Flushes from the intermediate StringBuilder to the Appendable,
234          * if they are different objects.
235          * Then appends the new text to the Appendable or StringBuilder.
236          * Normally used after quick check loops find a non-empty sequence.
237          */
flushAndAppendZeroCC(CharSequence s, int start, int limit)238         public ReorderingBuffer flushAndAppendZeroCC(CharSequence s, int start, int limit) {
239             if(appIsStringBuilder) {
240                 str.append(s, start, limit);
241                 reorderStart=str.length();
242             } else {
243                 try {
244                     app.append(str).append(s, start, limit);
245                     str.setLength(0);
246                     reorderStart=0;
247                 } catch(IOException e) {
248                     throw new InternalError(e);  // Avoid declaring "throws IOException".
249                 }
250             }
251             lastCC=0;
252             return this;
253         }
remove()254         public void remove() {
255             str.setLength(0);
256             lastCC=0;
257             reorderStart=0;
258         }
removeSuffix(int suffixLength)259         public void removeSuffix(int suffixLength) {
260             int oldLength=str.length();
261             str.delete(oldLength-suffixLength, oldLength);
262             lastCC=0;
263             reorderStart=str.length();
264         }
265 
266         // Inserts c somewhere before the last character.
267         // Requires 0<cc<lastCC which implies reorderStart<limit.
insert(int c, int cc)268         private void insert(int c, int cc) {
269             for(setIterator(), skipPrevious(); previousCC()>cc;) {}
270             // insert c at codePointLimit, after the character with prevCC<=cc
271             if(c<=0xffff) {
272                 str.insert(codePointLimit, (char)c);
273                 if(cc<=1) {
274                     reorderStart=codePointLimit+1;
275                 }
276             } else {
277                 str.insert(codePointLimit, Character.toChars(c));
278                 if(cc<=1) {
279                     reorderStart=codePointLimit+2;
280                 }
281             }
282         }
283 
284         private final NormalizerImpl impl;
285         private final Appendable app;
286         private final StringBuilder str;
287         private final boolean appIsStringBuilder;
288         private int reorderStart;
289         private int lastCC;
290 
291         // private backward iterator
setIterator()292         private void setIterator() { codePointStart=str.length(); }
skipPrevious()293         private void skipPrevious() {  // Requires 0<codePointStart.
294             codePointLimit=codePointStart;
295             codePointStart=str.offsetByCodePoints(codePointStart, -1);
296         }
previousCC()297         private int previousCC() {  // Returns 0 if there is no previous character.
298             codePointLimit=codePointStart;
299             if(reorderStart>=codePointStart) {
300                 return 0;
301             }
302             int c=str.codePointBefore(codePointStart);
303             codePointStart-=Character.charCount(c);
304             return impl.getCCFromYesOrMaybeCP(c);
305         }
306         private int codePointStart, codePointLimit;
307     }
308 
309     // TODO: Propose as public API on the UTF16 class.
310     // TODO: Propose widening UTF16 methods that take char to take int.
311     // TODO: Propose widening UTF16 methods that take String to take CharSequence.
312     public static final class UTF16Plus {
313         /**
314          * Assuming c is a surrogate code point (UTF16.isSurrogate(c)),
315          * is it a lead surrogate?
316          * @param c code unit or code point
317          * @return true or false
318          */
isSurrogateLead(int c)319         public static boolean isSurrogateLead(int c) { return (c&0x400)==0; }
320 
321         /**
322          * Compares two CharSequence subsequences for binary equality.
323          * @param s1 first sequence
324          * @param start1 start offset in first sequence
325          * @param limit1 limit offset in first sequence
326          * @param s2 second sequence
327          * @param start2 start offset in second sequence
328          * @param limit2 limit offset in second sequence
329          * @return true if s1.subSequence(start1, limit1) contains the same text
330          *              as s2.subSequence(start2, limit2)
331          */
equal(CharSequence s1, int start1, int limit1, CharSequence s2, int start2, int limit2)332         public static boolean equal(CharSequence s1, int start1, int limit1,
333                                     CharSequence s2, int start2, int limit2) {
334             if((limit1-start1)!=(limit2-start2)) {
335                 return false;
336             }
337             if(s1==s2 && start1==start2) {
338                 return true;
339             }
340             while(start1<limit1) {
341                 if(s1.charAt(start1++)!=s2.charAt(start2++)) {
342                     return false;
343                 }
344             }
345             return true;
346         }
347     }
348 
NormalizerImpl()349     public NormalizerImpl() {}
350 
351     private static final class IsAcceptable implements ICUBinary.Authenticate {
isDataVersionAcceptable(byte version[])352         public boolean isDataVersionAcceptable(byte version[]) {
353             return version[0]==3;
354         }
355     }
356     private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
357     private static final int DATA_FORMAT = 0x4e726d32;  // "Nrm2"
358 
load(ByteBuffer bytes)359     public NormalizerImpl load(ByteBuffer bytes) {
360         try {
361             dataVersion=ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE);
362             int indexesLength=bytes.getInt()/4;  // inIndexes[IX_NORM_TRIE_OFFSET]/4
363             if(indexesLength<=IX_MIN_LCCC_CP) {
364                 throw new InternalError("Normalizer2 data: not enough indexes");
365             }
366             int[] inIndexes=new int[indexesLength];
367             inIndexes[0]=indexesLength*4;
368             for(int i=1; i<indexesLength; ++i) {
369                 inIndexes[i]=bytes.getInt();
370             }
371 
372             minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
373             minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
374             minLcccCP=inIndexes[IX_MIN_LCCC_CP];
375 
376             minYesNo=inIndexes[IX_MIN_YES_NO];
377             minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
378             minNoNo=inIndexes[IX_MIN_NO_NO];
379             minNoNoCompBoundaryBefore=inIndexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE];
380             minNoNoCompNoMaybeCC=inIndexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC];
381             minNoNoEmpty=inIndexes[IX_MIN_NO_NO_EMPTY];
382             limitNoNo=inIndexes[IX_LIMIT_NO_NO];
383             minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
384             assert((minMaybeYes&7)==0);  // 8-aligned for noNoDelta bit fields
385             centerNoNoDelta=(minMaybeYes>>DELTA_SHIFT)-MAX_DELTA-1;
386 
387             // Read the normTrie.
388             int offset=inIndexes[IX_NORM_TRIE_OFFSET];
389             int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
390             normTrie=Trie2_16.createFromSerialized(bytes);
391             int trieLength=normTrie.getSerializedLength();
392             if(trieLength>(nextOffset-offset)) {
393                 throw new InternalError("Normalizer2 data: not enough bytes for normTrie");
394             }
395             ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength);  // skip padding after trie bytes
396 
397             // Read the composition and mapping data.
398             offset=nextOffset;
399             nextOffset=inIndexes[IX_SMALL_FCD_OFFSET];
400             int numChars=(nextOffset-offset)/2;
401             char[] chars;
402             if(numChars!=0) {
403                 chars=new char[numChars];
404                 for(int i=0; i<numChars; ++i) {
405                     chars[i]=bytes.getChar();
406                 }
407                 maybeYesCompositions=new String(chars);
408                 extraData=maybeYesCompositions.substring((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT);
409             }
410 
411             // smallFCD: new in formatVersion 2
412             offset=nextOffset;
413             smallFCD=new byte[0x100];
414             bytes.get(smallFCD);
415 
416             return this;
417         } catch(IOException e) {
418             throw new InternalError(e);
419         }
420     }
load(String name)421     public NormalizerImpl load(String name) {
422         return load(ICUBinary.getRequiredData(name));
423     }
424 
425 
getNorm16(int c)426     public int getNorm16(int c) { return normTrie.get(c); }
isAlgorithmicNoNo(int norm16)427     public boolean isAlgorithmicNoNo(int norm16) { return limitNoNo<=norm16 && norm16<minMaybeYes; }
isCompNo(int norm16)428     public boolean isCompNo(int norm16) { return minNoNo<=norm16 && norm16<minMaybeYes; }
isDecompYes(int norm16)429     public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; }
430 
getCC(int norm16)431     public int getCC(int norm16) {
432         if(norm16>=MIN_NORMAL_MAYBE_YES) {
433             return getCCFromNormalYesOrMaybe(norm16);
434         }
435         if(norm16<minNoNo || limitNoNo<=norm16) {
436             return 0;
437         }
438         return getCCFromNoNo(norm16);
439     }
getCCFromNormalYesOrMaybe(int norm16)440     public static int getCCFromNormalYesOrMaybe(int norm16) {
441         return (norm16 >> OFFSET_SHIFT) & 0xff;
442     }
getCCFromYesOrMaybe(int norm16)443     public static int getCCFromYesOrMaybe(int norm16) {
444         return norm16>=MIN_NORMAL_MAYBE_YES ? getCCFromNormalYesOrMaybe(norm16) : 0;
445     }
getCCFromYesOrMaybeCP(int c)446     public int getCCFromYesOrMaybeCP(int c) {
447         if (c < minCompNoMaybeCP) { return 0; }
448         return getCCFromYesOrMaybe(getNorm16(c));
449     }
450 
451     /**
452      * Returns the FCD data for code point c.
453      * @param c A Unicode code point.
454      * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
455      */
getFCD16(int c)456     public int getFCD16(int c) {
457         if(c<minDecompNoCP) {
458             return 0;
459         } else if(c<=0xffff) {
460             if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; }
461         }
462         return getFCD16FromNormData(c);
463     }
464     /** Returns true if the single-or-lead code unit c might have non-zero FCD data. */
singleLeadMightHaveNonZeroFCD16(int lead)465     public boolean singleLeadMightHaveNonZeroFCD16(int lead) {
466         // 0<=lead<=0xffff
467         byte bits=smallFCD[lead>>8];
468         if(bits==0) { return false; }
469         return ((bits>>((lead>>5)&7))&1)!=0;
470     }
471 
472     /** Gets the FCD value from the regular normalization data. */
getFCD16FromNormData(int c)473     public int getFCD16FromNormData(int c) {
474         int norm16=getNorm16(c);
475         if (norm16 >= limitNoNo) {
476             if(norm16>=MIN_NORMAL_MAYBE_YES) {
477                 // combining mark
478                 norm16=getCCFromNormalYesOrMaybe(norm16);
479                 return norm16|(norm16<<8);
480             } else if(norm16>=minMaybeYes) {
481                 return 0;
482             } else {  // isDecompNoAlgorithmic(norm16)
483                 int deltaTrailCC = norm16 & DELTA_TCCC_MASK;
484                 if (deltaTrailCC <= DELTA_TCCC_1) {
485                     return deltaTrailCC >> OFFSET_SHIFT;
486                 }
487                 // Maps to an isCompYesAndZeroCC.
488                 c=mapAlgorithmic(c, norm16);
489                 norm16=getNorm16(c);
490             }
491         }
492         if(norm16<=minYesNo || isHangulLVT(norm16)) {
493             // no decomposition or Hangul syllable, all zeros
494             return 0;
495         }
496         // c decomposes, get everything from the variable-length extra data
497         int mapping=norm16>>OFFSET_SHIFT;
498         int firstUnit=extraData.charAt(mapping);
499         int fcd16=firstUnit>>8;  // tccc
500         if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
501             fcd16|=extraData.charAt(mapping-1)&0xff00;  // lccc
502         }
503         return fcd16;
504     }
505 
506     /**
507      * Gets the decomposition for one code point.
508      * @param c code point
509      * @return c's decomposition, if it has one; returns null if it does not have a decomposition
510      */
getDecomposition(int c)511     public String getDecomposition(int c) {
512         int norm16;
513         if(c<minDecompNoCP || isMaybeOrNonZeroCC(norm16=getNorm16(c))) {
514             // c does not decompose
515             return null;
516         }
517         int decomp = -1;
518         if(isDecompNoAlgorithmic(norm16)) {
519             // Maps to an isCompYesAndZeroCC.
520             decomp=c=mapAlgorithmic(c, norm16);
521             // The mapping might decompose further.
522             norm16 = getNorm16(c);
523         }
524         if (norm16 < minYesNo) {
525             if(decomp<0) {
526                 return null;
527             } else {
528                 return UTF16.valueOf(decomp);
529             }
530         } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
531             // Hangul syllable: decompose algorithmically
532             StringBuilder buffer=new StringBuilder();
533             Hangul.decompose(c, buffer);
534             return buffer.toString();
535         }
536         // c decomposes, get everything from the variable-length extra data
537         int mapping=norm16>>OFFSET_SHIFT;
538         int length=extraData.charAt(mapping++)&MAPPING_LENGTH_MASK;
539         return extraData.substring(mapping, mapping+length);
540     }
541 
542     // Fixed norm16 values.
543     public static final int MIN_YES_YES_WITH_CC=0xfe02;
544     public static final int JAMO_VT=0xfe00;
545     public static final int MIN_NORMAL_MAYBE_YES=0xfc00;
546     public static final int JAMO_L=2;  // offset=1 hasCompBoundaryAfter=FALSE
547     public static final int INERT=1;  // offset=0 hasCompBoundaryAfter=TRUE
548 
549     // norm16 bit 0 is comp-boundary-after.
550     public static final int HAS_COMP_BOUNDARY_AFTER=1;
551     public static final int OFFSET_SHIFT=1;
552 
553     // For algorithmic one-way mappings, norm16 bits 2..1 indicate the
554     // tccc (0, 1, >1) for quick FCC boundary-after tests.
555     public static final int DELTA_TCCC_0=0;
556     public static final int DELTA_TCCC_1=2;
557     public static final int DELTA_TCCC_GT_1=4;
558     public static final int DELTA_TCCC_MASK=6;
559     public static final int DELTA_SHIFT=3;
560 
561     public static final int MAX_DELTA=0x40;
562 
563     // Byte offsets from the start of the data, after the generic header.
564     public static final int IX_NORM_TRIE_OFFSET=0;
565     public static final int IX_EXTRA_DATA_OFFSET=1;
566     public static final int IX_SMALL_FCD_OFFSET=2;
567     public static final int IX_RESERVED3_OFFSET=3;
568     public static final int IX_TOTAL_SIZE=7;
569     public static final int MIN_CCC_LCCC_CP=0x300;
570     // Code point thresholds for quick check codes.
571     public static final int IX_MIN_DECOMP_NO_CP=8;
572     public static final int IX_MIN_COMP_NO_MAYBE_CP=9;
573 
574     // Norm16 value thresholds for quick check combinations and types of extra data.
575 
576     /** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */
577     public static final int IX_MIN_YES_NO=10;
578     /** Mappings are comp-normalized. */
579     public static final int IX_MIN_NO_NO=11;
580     public static final int IX_LIMIT_NO_NO=12;
581     public static final int IX_MIN_MAYBE_YES=13;
582 
583     /** Mappings only in [minYesNoMappingsOnly..minNoNo[. */
584     public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14;
585     /** Mappings are not comp-normalized but have a comp boundary before. */
586     public static final int IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE=15;
587     /** Mappings do not have a comp boundary before. */
588     public static final int IX_MIN_NO_NO_COMP_NO_MAYBE_CC=16;
589     /** Mappings to the empty string. */
590     public static final int IX_MIN_NO_NO_EMPTY=17;
591 
592     public static final int IX_MIN_LCCC_CP=18;
593     public static final int IX_COUNT=20;
594 
595     public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80;
596     public static final int MAPPING_HAS_RAW_MAPPING=0x40;
597     // unused bit 0x20;
598     public static final int MAPPING_LENGTH_MASK=0x1f;
599 
600     public static final int COMP_1_LAST_TUPLE=0x8000;
601     public static final int COMP_1_TRIPLE=1;
602     public static final int COMP_1_TRAIL_LIMIT=0x3400;
603     public static final int COMP_1_TRAIL_MASK=0x7ffe;
604     public static final int COMP_1_TRAIL_SHIFT=9;  // 10-1 for the "triple" bit
605     public static final int COMP_2_TRAIL_SHIFT=6;
606     public static final int COMP_2_TRAIL_MASK=0xffc0;
607 
608     // higher-level functionality ------------------------------------------ ***
609 
610     /**
611      * Decomposes s[src, limit[ and writes the result to dest.
612      * limit can be NULL if src is NUL-terminated.
613      * destLengthEstimate is the initial dest buffer capacity and can be -1.
614      */
decompose(CharSequence s, int src, int limit, StringBuilder dest, int destLengthEstimate)615     public void decompose(CharSequence s, int src, int limit, StringBuilder dest,
616                    int destLengthEstimate) {
617         if(destLengthEstimate<0) {
618             destLengthEstimate=limit-src;
619         }
620         dest.setLength(0);
621         ReorderingBuffer buffer=new ReorderingBuffer(this, dest, destLengthEstimate);
622         decompose(s, src, limit, buffer);
623     }
624 
625     // Dual functionality:
626     // buffer!=NULL: normalize
627     // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
decompose(CharSequence s, int src, int limit, ReorderingBuffer buffer)628     public int decompose(CharSequence s, int src, int limit,
629                          ReorderingBuffer buffer) {
630         int minNoCP=minDecompNoCP;
631 
632         int prevSrc;
633         int c=0;
634         int norm16=0;
635 
636         // only for quick check
637         int prevBoundary=src;
638         int prevCC=0;
639 
640         for(;;) {
641             // count code units below the minimum or with irrelevant data for the quick check
642             for(prevSrc=src; src!=limit;) {
643                 if( (c=s.charAt(src))<minNoCP ||
644                     isMostDecompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
645                 ) {
646                     ++src;
647                 } else if(!UTF16.isSurrogate((char)c)) {
648                     break;
649                 } else {
650                     char c2;
651                     if(UTF16Plus.isSurrogateLead(c)) {
652                         if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
653                             c=Character.toCodePoint((char)c, c2);
654                         }
655                     } else /* trail surrogate */ {
656                         if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
657                             --src;
658                             c=Character.toCodePoint(c2, (char)c);
659                         }
660                     }
661                     if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {
662                         src+=Character.charCount(c);
663                     } else {
664                         break;
665                     }
666                 }
667             }
668             // copy these code units all at once
669             if(src!=prevSrc) {
670                 if(buffer!=null) {
671                     buffer.flushAndAppendZeroCC(s, prevSrc, src);
672                 } else {
673                     prevCC=0;
674                     prevBoundary=src;
675                 }
676             }
677             if(src==limit) {
678                 break;
679             }
680 
681             // Check one above-minimum, relevant code point.
682             src+=Character.charCount(c);
683             if(buffer!=null) {
684                 decompose(c, norm16, buffer);
685             } else {
686                 if(isDecompYes(norm16)) {
687                     int cc=getCCFromYesOrMaybe(norm16);
688                     if(prevCC<=cc || cc==0) {
689                         prevCC=cc;
690                         if(cc<=1) {
691                             prevBoundary=src;
692                         }
693                         continue;
694                     }
695                 }
696                 return prevBoundary;  // "no" or cc out of order
697             }
698         }
699         return src;
700     }
decomposeAndAppend(CharSequence s, boolean doDecompose, ReorderingBuffer buffer)701     public void decomposeAndAppend(CharSequence s, boolean doDecompose, ReorderingBuffer buffer) {
702         int limit=s.length();
703         if(limit==0) {
704             return;
705         }
706         if(doDecompose) {
707             decompose(s, 0, limit, buffer);
708             return;
709         }
710         // Just merge the strings at the boundary.
711         int c=Character.codePointAt(s, 0);
712         int src=0;
713         int firstCC, prevCC, cc;
714         firstCC=prevCC=cc=getCC(getNorm16(c));
715         while(cc!=0) {
716             prevCC=cc;
717             src+=Character.charCount(c);
718             if(src>=limit) {
719                 break;
720             }
721             c=Character.codePointAt(s, src);
722             cc=getCC(getNorm16(c));
723         };
724         buffer.append(s, 0, src, firstCC, prevCC);
725         buffer.append(s, src, limit);
726     }
727 
728     // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
729     // doCompose: normalize
730     // !doCompose: isNormalized (buffer must be empty and initialized)
compose(CharSequence s, int src, int limit, boolean onlyContiguous, boolean doCompose, ReorderingBuffer buffer)731     public boolean compose(CharSequence s, int src, int limit,
732                            boolean onlyContiguous,
733                            boolean doCompose,
734                            ReorderingBuffer buffer) {
735         int prevBoundary=src;
736         int minNoMaybeCP=minCompNoMaybeCP;
737 
738         for (;;) {
739             // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
740             // or with (compYes && ccc==0) properties.
741             int prevSrc;
742             int c = 0;
743             int norm16 = 0;
744             for (;;) {
745                 if (src == limit) {
746                     if (prevBoundary != limit && doCompose) {
747                         buffer.append(s, prevBoundary, limit);
748                     }
749                     return true;
750                 }
751                 if( (c=s.charAt(src))<minNoMaybeCP ||
752                     isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
753                 ) {
754                     ++src;
755                 } else {
756                     prevSrc = src++;
757                     if(!UTF16.isSurrogate((char)c)) {
758                         break;
759                     } else {
760                         char c2;
761                         if(UTF16Plus.isSurrogateLead(c)) {
762                             if(src!=limit && Character.isLowSurrogate(c2=s.charAt(src))) {
763                                 ++src;
764                                 c=Character.toCodePoint((char)c, c2);
765                             }
766                         } else /* trail surrogate */ {
767                             if(prevBoundary<prevSrc && Character.isHighSurrogate(c2=s.charAt(prevSrc-1))) {
768                                 --prevSrc;
769                                 c=Character.toCodePoint(c2, (char)c);
770                             }
771                         }
772                         if(!isCompYesAndZeroCC(norm16=getNorm16(c))) {
773                             break;
774                         }
775                     }
776                 }
777             }
778             // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
779             // The current character is either a "noNo" (has a mapping)
780             // or a "maybeYes" (combines backward)
781             // or a "yesYes" with ccc!=0.
782             // It is not a Hangul syllable or Jamo L because those have "yes" properties.
783 
784             // Medium-fast path: Handle cases that do not require full decomposition and recomposition.
785             if (!isMaybeOrNonZeroCC(norm16)) {  // minNoNo <= norm16 < minMaybeYes
786                 if (!doCompose) {
787                     return false;
788                 }
789                 // Fast path for mapping a character that is immediately surrounded by boundaries.
790                 // In this case, we need not decompose around the current character.
791                 if (isDecompNoAlgorithmic(norm16)) {
792                     // Maps to a single isCompYesAndZeroCC character
793                     // which also implies hasCompBoundaryBefore.
794                     if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
795                             hasCompBoundaryBefore(s, src, limit)) {
796                         if (prevBoundary != prevSrc) {
797                             buffer.append(s, prevBoundary, prevSrc);
798                         }
799                         buffer.append(mapAlgorithmic(c, norm16), 0);
800                         prevBoundary = src;
801                         continue;
802                     }
803                 } else if (norm16 < minNoNoCompBoundaryBefore) {
804                     // The mapping is comp-normalized which also implies hasCompBoundaryBefore.
805                     if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
806                             hasCompBoundaryBefore(s, src, limit)) {
807                         if (prevBoundary != prevSrc) {
808                             buffer.append(s, prevBoundary, prevSrc);
809                         }
810                         int mapping = norm16 >> OFFSET_SHIFT;
811                         int length = extraData.charAt(mapping++) & MAPPING_LENGTH_MASK;
812                         buffer.append(extraData, mapping, mapping + length);
813                         prevBoundary = src;
814                         continue;
815                     }
816                 } else if (norm16 >= minNoNoEmpty) {
817                     // The current character maps to nothing.
818                     // Simply omit it from the output if there is a boundary before _or_ after it.
819                     // The character itself implies no boundaries.
820                     if (hasCompBoundaryBefore(s, src, limit) ||
821                             hasCompBoundaryAfter(s, prevBoundary, prevSrc, onlyContiguous)) {
822                         if (prevBoundary != prevSrc) {
823                             buffer.append(s, prevBoundary, prevSrc);
824                         }
825                         prevBoundary = src;
826                         continue;
827                     }
828                 }
829                 // Other "noNo" type, or need to examine more text around this character:
830                 // Fall through to the slow path.
831             } else if (isJamoVT(norm16) && prevBoundary != prevSrc) {
832                 char prev=s.charAt(prevSrc-1);
833                 if(c<Hangul.JAMO_T_BASE) {
834                     // The current character is a Jamo Vowel,
835                     // compose with previous Jamo L and following Jamo T.
836                     char l = (char)(prev-Hangul.JAMO_L_BASE);
837                     if(l<Hangul.JAMO_L_COUNT) {
838                         if (!doCompose) {
839                             return false;
840                         }
841                         int t;
842                         if (src != limit &&
843                                 0 < (t = (s.charAt(src) - Hangul.JAMO_T_BASE)) &&
844                                 t < Hangul.JAMO_T_COUNT) {
845                             // The next character is a Jamo T.
846                             ++src;
847                         } else if (hasCompBoundaryBefore(s, src, limit)) {
848                             // No Jamo T follows, not even via decomposition.
849                             t = 0;
850                         } else {
851                             t = -1;
852                         }
853                         if (t >= 0) {
854                             int syllable = Hangul.HANGUL_BASE +
855                                 (l*Hangul.JAMO_V_COUNT + (c-Hangul.JAMO_V_BASE)) *
856                                 Hangul.JAMO_T_COUNT + t;
857                             --prevSrc;  // Replace the Jamo L as well.
858                             if (prevBoundary != prevSrc) {
859                                 buffer.append(s, prevBoundary, prevSrc);
860                             }
861                             buffer.append((char)syllable);
862                             prevBoundary = src;
863                             continue;
864                         }
865                         // If we see L+V+x where x!=T then we drop to the slow path,
866                         // decompose and recompose.
867                         // This is to deal with NFKC finding normal L and V but a
868                         // compatibility variant of a T.
869                         // We need to either fully compose that combination here
870                         // (which would complicate the code and may not work with strange custom data)
871                         // or use the slow path.
872                     }
873                 } else if (Hangul.isHangulLV(prev)) {
874                     // The current character is a Jamo Trailing consonant,
875                     // compose with previous Hangul LV that does not contain a Jamo T.
876                     if (!doCompose) {
877                         return false;
878                     }
879                     int syllable = prev + c - Hangul.JAMO_T_BASE;
880                     --prevSrc;  // Replace the Hangul LV as well.
881                     if (prevBoundary != prevSrc) {
882                         buffer.append(s, prevBoundary, prevSrc);
883                     }
884                     buffer.append((char)syllable);
885                     prevBoundary = src;
886                     continue;
887                 }
888                 // No matching context, or may need to decompose surrounding text first:
889                 // Fall through to the slow path.
890             } else if (norm16 > JAMO_VT) {  // norm16 >= MIN_YES_YES_WITH_CC
891                 // One or more combining marks that do not combine-back:
892                 // Check for canonical order, copy unchanged if ok and
893                 // if followed by a character with a boundary-before.
894                 int cc = getCCFromNormalYesOrMaybe(norm16);  // cc!=0
895                 if (onlyContiguous /* FCC */ && getPreviousTrailCC(s, prevBoundary, prevSrc) > cc) {
896                     // Fails FCD test, need to decompose and contiguously recompose.
897                     if (!doCompose) {
898                         return false;
899                     }
900                 } else {
901                     // If !onlyContiguous (not FCC), then we ignore the tccc of
902                     // the previous character which passed the quick check "yes && ccc==0" test.
903                     int n16;
904                     for (;;) {
905                         if (src == limit) {
906                             if (doCompose) {
907                                 buffer.append(s, prevBoundary, limit);
908                             }
909                             return true;
910                         }
911                         int prevCC = cc;
912                         c = Character.codePointAt(s, src);
913                         n16 = normTrie.get(c);
914                         if (n16 >= MIN_YES_YES_WITH_CC) {
915                             cc = getCCFromNormalYesOrMaybe(n16);
916                             if (prevCC > cc) {
917                                 if (!doCompose) {
918                                     return false;
919                                 }
920                                 break;
921                             }
922                         } else {
923                             break;
924                         }
925                         src += Character.charCount(c);
926                     }
927                     // p is after the last in-order combining mark.
928                     // If there is a boundary here, then we continue with no change.
929                     if (norm16HasCompBoundaryBefore(n16)) {
930                         if (isCompYesAndZeroCC(n16)) {
931                             src += Character.charCount(c);
932                         }
933                         continue;
934                     }
935                     // Use the slow path. There is no boundary in [prevSrc, src[.
936                 }
937             }
938 
939             // Slow path: Find the nearest boundaries around the current character,
940             // decompose and recompose.
941             if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) {
942                 c = Character.codePointBefore(s, prevSrc);
943                 norm16 = normTrie.get(c);
944                 if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
945                     prevSrc -= Character.charCount(c);
946                 }
947             }
948             if (doCompose && prevBoundary != prevSrc) {
949                 buffer.append(s, prevBoundary, prevSrc);
950             }
951             int recomposeStartIndex=buffer.length();
952             // We know there is not a boundary here.
953             decomposeShort(s, prevSrc, src, false /* !stopAtCompBoundary */, onlyContiguous,
954                            buffer);
955             // Decompose until the next boundary.
956             src = decomposeShort(s, src, limit, true /* stopAtCompBoundary */, onlyContiguous,
957                                  buffer);
958             recompose(buffer, recomposeStartIndex, onlyContiguous);
959             if(!doCompose) {
960                 if(!buffer.equals(s, prevSrc, src)) {
961                     return false;
962                 }
963                 buffer.remove();
964             }
965             prevBoundary=src;
966         }
967     }
968 
969     /**
970      * Very similar to compose(): Make the same changes in both places if relevant.
971      * doSpan: spanQuickCheckYes (ignore bit 0 of the return value)
972      * !doSpan: quickCheck
973      * @return bits 31..1: spanQuickCheckYes (==s.length() if "yes") and
974      *         bit 0: set if "maybe"; otherwise, if the span length&lt;s.length()
975      *         then the quick check result is "no"
976      */
composeQuickCheck(CharSequence s, int src, int limit, boolean onlyContiguous, boolean doSpan)977     public int composeQuickCheck(CharSequence s, int src, int limit,
978                                  boolean onlyContiguous, boolean doSpan) {
979         int qcResult=0;
980         int prevBoundary=src;
981         int minNoMaybeCP=minCompNoMaybeCP;
982 
983         for(;;) {
984             // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
985             // or with (compYes && ccc==0) properties.
986             int prevSrc;
987             int c = 0;
988             int norm16 = 0;
989             for (;;) {
990                 if(src==limit) {
991                     return (src<<1)|qcResult;  // "yes" or "maybe"
992                 }
993                 if( (c=s.charAt(src))<minNoMaybeCP ||
994                     isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
995                 ) {
996                     ++src;
997                 } else {
998                     prevSrc = src++;
999                     if(!UTF16.isSurrogate((char)c)) {
1000                         break;
1001                     } else {
1002                         char c2;
1003                         if(UTF16Plus.isSurrogateLead(c)) {
1004                             if(src!=limit && Character.isLowSurrogate(c2=s.charAt(src))) {
1005                                 ++src;
1006                                 c=Character.toCodePoint((char)c, c2);
1007                             }
1008                         } else /* trail surrogate */ {
1009                             if(prevBoundary<prevSrc && Character.isHighSurrogate(c2=s.charAt(prevSrc-1))) {
1010                                 --prevSrc;
1011                                 c=Character.toCodePoint(c2, (char)c);
1012                             }
1013                         }
1014                         if(!isCompYesAndZeroCC(norm16=getNorm16(c))) {
1015                             break;
1016                         }
1017                     }
1018                 }
1019             }
1020             // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1021             // The current character is either a "noNo" (has a mapping)
1022             // or a "maybeYes" (combines backward)
1023             // or a "yesYes" with ccc!=0.
1024             // It is not a Hangul syllable or Jamo L because those have "yes" properties.
1025 
1026             int prevNorm16 = INERT;
1027             if (prevBoundary != prevSrc) {
1028                 prevBoundary = prevSrc;
1029                 if (!norm16HasCompBoundaryBefore(norm16)) {
1030                     c = Character.codePointBefore(s, prevSrc);
1031                     int n16 = getNorm16(c);
1032                     if (!norm16HasCompBoundaryAfter(n16, onlyContiguous)) {
1033                         prevBoundary -= Character.charCount(c);
1034                         prevNorm16 = n16;
1035                     }
1036                 }
1037             }
1038 
1039             if(isMaybeOrNonZeroCC(norm16)) {
1040                 int cc=getCCFromYesOrMaybe(norm16);
1041                 if (onlyContiguous /* FCC */ && cc != 0 &&
1042                         getTrailCCFromCompYesAndZeroCC(prevNorm16) > cc) {
1043                     // The [prevBoundary..prevSrc[ character
1044                     // passed the quick check "yes && ccc==0" test
1045                     // but is out of canonical order with the current combining mark.
1046                 } else {
1047                     // If !onlyContiguous (not FCC), then we ignore the tccc of
1048                     // the previous character which passed the quick check "yes && ccc==0" test.
1049                     for (;;) {
1050                         if (norm16 < MIN_YES_YES_WITH_CC) {
1051                             if (!doSpan) {
1052                                 qcResult = 1;
1053                             } else {
1054                                 return prevBoundary << 1;  // spanYes does not care to know it's "maybe"
1055                             }
1056                         }
1057                         if (src == limit) {
1058                             return (src<<1) | qcResult;  // "yes" or "maybe"
1059                         }
1060                         int prevCC = cc;
1061                         c = Character.codePointAt(s, src);
1062                         norm16 = getNorm16(c);
1063                         if (isMaybeOrNonZeroCC(norm16)) {
1064                             cc = getCCFromYesOrMaybe(norm16);
1065                             if (!(prevCC <= cc || cc == 0)) {
1066                                 break;
1067                             }
1068                         } else {
1069                             break;
1070                         }
1071                         src += Character.charCount(c);
1072                     }
1073                     // src is after the last in-order combining mark.
1074                     if (isCompYesAndZeroCC(norm16)) {
1075                         prevBoundary = src;
1076                         src += Character.charCount(c);
1077                         continue;
1078                     }
1079                 }
1080             }
1081             return prevBoundary<<1;  // "no"
1082         }
1083     }
composeAndAppend(CharSequence s, boolean doCompose, boolean onlyContiguous, ReorderingBuffer buffer)1084     public void composeAndAppend(CharSequence s,
1085                                  boolean doCompose,
1086                                  boolean onlyContiguous,
1087                                  ReorderingBuffer buffer) {
1088         int src=0, limit=s.length();
1089         if(!buffer.isEmpty()) {
1090             int firstStarterInSrc=findNextCompBoundary(s, 0, limit, onlyContiguous);
1091             if(0!=firstStarterInSrc) {
1092                 int lastStarterInDest=findPreviousCompBoundary(buffer.getStringBuilder(),
1093                                                                buffer.length(), onlyContiguous);
1094                 StringBuilder middle=new StringBuilder((buffer.length()-lastStarterInDest)+
1095                                                        firstStarterInSrc+16);
1096                 middle.append(buffer.getStringBuilder(), lastStarterInDest, buffer.length());
1097                 buffer.removeSuffix(buffer.length()-lastStarterInDest);
1098                 middle.append(s, 0, firstStarterInSrc);
1099                 compose(middle, 0, middle.length(), onlyContiguous, true, buffer);
1100                 src=firstStarterInSrc;
1101             }
1102         }
1103         if(doCompose) {
1104             compose(s, src, limit, onlyContiguous, true, buffer);
1105         } else {
1106             buffer.append(s, src, limit);
1107         }
1108     }
1109     // Dual functionality:
1110     // buffer!=NULL: normalize
1111     // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer)1112     public int makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer) {
1113         // Note: In this function we use buffer->appendZeroCC() because we track
1114         // the lead and trail combining classes here, rather than leaving it to
1115         // the ReorderingBuffer.
1116         // The exception is the call to decomposeShort() which uses the buffer
1117         // in the normal way.
1118 
1119         // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
1120         // Similar to the prevBoundary in the compose() implementation.
1121         int prevBoundary=src;
1122         int prevSrc;
1123         int c=0;
1124         int prevFCD16=0;
1125         int fcd16=0;
1126 
1127         for(;;) {
1128             // count code units with lccc==0
1129             for(prevSrc=src; src!=limit;) {
1130                 if((c=s.charAt(src))<minLcccCP) {
1131                     prevFCD16=~c;
1132                     ++src;
1133                 } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
1134                     prevFCD16=0;
1135                     ++src;
1136                 } else {
1137                     if(UTF16.isSurrogate((char)c)) {
1138                         char c2;
1139                         if(UTF16Plus.isSurrogateLead(c)) {
1140                             if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
1141                                 c=Character.toCodePoint((char)c, c2);
1142                             }
1143                         } else /* trail surrogate */ {
1144                             if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
1145                                 --src;
1146                                 c=Character.toCodePoint(c2, (char)c);
1147                             }
1148                         }
1149                     }
1150                     if((fcd16=getFCD16FromNormData(c))<=0xff) {
1151                         prevFCD16=fcd16;
1152                         src+=Character.charCount(c);
1153                     } else {
1154                         break;
1155                     }
1156                 }
1157             }
1158             // copy these code units all at once
1159             if(src!=prevSrc) {
1160                 if(src==limit) {
1161                     if(buffer!=null) {
1162                         buffer.flushAndAppendZeroCC(s, prevSrc, src);
1163                     }
1164                     break;
1165                 }
1166                 prevBoundary=src;
1167                 // We know that the previous character's lccc==0.
1168                 if(prevFCD16<0) {
1169                     // Fetching the fcd16 value was deferred for this below-minLcccCP code point.
1170                     int prev=~prevFCD16;
1171                     if(prev<minDecompNoCP) {
1172                         prevFCD16=0;
1173                     } else {
1174                         prevFCD16=getFCD16FromNormData(prev);
1175                         if(prevFCD16>1) {
1176                             --prevBoundary;
1177                         }
1178                     }
1179                 } else {
1180                     int p=src-1;
1181                     if( Character.isLowSurrogate(s.charAt(p)) && prevSrc<p &&
1182                         Character.isHighSurrogate(s.charAt(p-1))
1183                     ) {
1184                         --p;
1185                         // Need to fetch the previous character's FCD value because
1186                         // prevFCD16 was just for the trail surrogate code point.
1187                         prevFCD16=getFCD16FromNormData(Character.toCodePoint(s.charAt(p), s.charAt(p+1)));
1188                         // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
1189                     }
1190                     if(prevFCD16>1) {
1191                         prevBoundary=p;
1192                     }
1193                 }
1194                 if(buffer!=null) {
1195                     // The last lccc==0 character is excluded from the
1196                     // flush-and-append call in case it needs to be modified.
1197                     buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary);
1198                     buffer.append(s, prevBoundary, src);
1199                 }
1200                 // The start of the current character (c).
1201                 prevSrc=src;
1202             } else if(src==limit) {
1203                 break;
1204             }
1205 
1206             src+=Character.charCount(c);
1207             // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
1208             // Check for proper order, and decompose locally if necessary.
1209             if((prevFCD16&0xff)<=(fcd16>>8)) {
1210                 // proper order: prev tccc <= current lccc
1211                 if((fcd16&0xff)<=1) {
1212                     prevBoundary=src;
1213                 }
1214                 if(buffer!=null) {
1215                     buffer.appendZeroCC(c);
1216                 }
1217                 prevFCD16=fcd16;
1218                 continue;
1219             } else if(buffer==null) {
1220                 return prevBoundary;  // quick check "no"
1221             } else {
1222                 /*
1223                  * Back out the part of the source that we copied or appended
1224                  * already but is now going to be decomposed.
1225                  * prevSrc is set to after what was copied/appended.
1226                  */
1227                 buffer.removeSuffix(prevSrc-prevBoundary);
1228                 /*
1229                  * Find the part of the source that needs to be decomposed,
1230                  * up to the next safe boundary.
1231                  */
1232                 src=findNextFCDBoundary(s, src, limit);
1233                 /*
1234                  * The source text does not fulfill the conditions for FCD.
1235                  * Decompose and reorder a limited piece of the text.
1236                  */
1237                 decomposeShort(s, prevBoundary, src, false, false, buffer);
1238                 prevBoundary=src;
1239                 prevFCD16=0;
1240             }
1241         }
1242         return src;
1243     }
1244 
hasDecompBoundaryBefore(int c)1245     public boolean hasDecompBoundaryBefore(int c) {
1246         return c < minLcccCP || (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) ||
1247             norm16HasDecompBoundaryBefore(getNorm16(c));
1248     }
norm16HasDecompBoundaryBefore(int norm16)1249     public boolean norm16HasDecompBoundaryBefore(int norm16) {
1250         if (norm16 < minNoNoCompNoMaybeCC) {
1251             return true;
1252         }
1253         if (norm16 >= limitNoNo) {
1254             return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT;
1255         }
1256         // c decomposes, get everything from the variable-length extra data
1257         int mapping=norm16>>OFFSET_SHIFT;
1258         int firstUnit=extraData.charAt(mapping);
1259         // true if leadCC==0 (hasFCDBoundaryBefore())
1260         return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0;
1261     }
hasDecompBoundaryAfter(int c)1262     public boolean hasDecompBoundaryAfter(int c) {
1263         if (c < minDecompNoCP) {
1264             return true;
1265         }
1266         if (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) {
1267             return true;
1268         }
1269         return norm16HasDecompBoundaryAfter(getNorm16(c));
1270     }
norm16HasDecompBoundaryAfter(int norm16)1271     public boolean norm16HasDecompBoundaryAfter(int norm16) {
1272         if(norm16 <= minYesNo || isHangulLVT(norm16)) {
1273             return true;
1274         }
1275         if (norm16 >= limitNoNo) {
1276             if (isMaybeOrNonZeroCC(norm16)) {
1277                 return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT;
1278             }
1279             // Maps to an isCompYesAndZeroCC.
1280             return (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1;
1281         }
1282         // c decomposes, get everything from the variable-length extra data
1283         int mapping=norm16>>OFFSET_SHIFT;
1284         int firstUnit=extraData.charAt(mapping);
1285         // decomp after-boundary: same as hasFCDBoundaryAfter(),
1286         // fcd16<=1 || trailCC==0
1287         if(firstUnit>0x1ff) {
1288             return false;  // trailCC>1
1289         }
1290         if(firstUnit<=0xff) {
1291             return true;  // trailCC==0
1292         }
1293         // if(trailCC==1) test leadCC==0, same as checking for before-boundary
1294         // true if leadCC==0 (hasFCDBoundaryBefore())
1295         return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0;
1296     }
isDecompInert(int c)1297     public boolean isDecompInert(int c) { return isDecompYesAndZeroCC(getNorm16(c)); }
1298 
hasCompBoundaryBefore(int c)1299     public boolean hasCompBoundaryBefore(int c) {
1300         return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(getNorm16(c));
1301     }
hasCompBoundaryAfter(int c, boolean onlyContiguous)1302     public boolean hasCompBoundaryAfter(int c, boolean onlyContiguous) {
1303         return norm16HasCompBoundaryAfter(getNorm16(c), onlyContiguous);
1304     }
1305 
isMaybe(int norm16)1306     private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; }
isMaybeOrNonZeroCC(int norm16)1307     private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; }
isInert(int norm16)1308     private static boolean isInert(int norm16) { return norm16==INERT; }
isJamoVT(int norm16)1309     private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; }
hangulLVT()1310     private int hangulLVT() { return minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER; }
isHangulLV(int norm16)1311     private boolean isHangulLV(int norm16) { return norm16==minYesNo; }
isHangulLVT(int norm16)1312     private boolean isHangulLVT(int norm16) {
1313         return norm16==hangulLVT();
1314     }
isCompYesAndZeroCC(int norm16)1315     private boolean isCompYesAndZeroCC(int norm16) { return norm16<minNoNo; }
1316     // UBool isCompYes(uint16_t norm16) const {
1317     //     return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo;
1318     // }
1319     // UBool isCompYesOrMaybe(uint16_t norm16) const {
1320     //     return norm16<minNoNo || minMaybeYes<=norm16;
1321     // }
1322     // private boolean hasZeroCCFromDecompYes(int norm16) {
1323     //     return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
1324     // }
isDecompYesAndZeroCC(int norm16)1325     private boolean isDecompYesAndZeroCC(int norm16) {
1326         return norm16<minYesNo ||
1327                norm16==JAMO_VT ||
1328                (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES);
1329     }
1330     /**
1331      * A little faster and simpler than isDecompYesAndZeroCC() but does not include
1332      * the MaybeYes which combine-forward and have ccc=0.
1333      * (Standard Unicode 10 normalization does not have such characters.)
1334      */
isMostDecompYesAndZeroCC(int norm16)1335     private boolean isMostDecompYesAndZeroCC(int norm16) {
1336         return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
1337     }
isDecompNoAlgorithmic(int norm16)1338     private boolean isDecompNoAlgorithmic(int norm16) { return norm16>=limitNoNo; }
1339 
1340     // For use with isCompYes().
1341     // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC.
1342     // static uint8_t getCCFromYes(uint16_t norm16) {
1343     //     return norm16>=MIN_YES_YES_WITH_CC ? getCCFromNormalYesOrMaybe(norm16) : 0;
1344     // }
getCCFromNoNo(int norm16)1345     private int getCCFromNoNo(int norm16) {
1346         int mapping=norm16>>OFFSET_SHIFT;
1347         if((extraData.charAt(mapping)&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
1348             return extraData.charAt(mapping-1)&0xff;
1349         } else {
1350             return 0;
1351         }
1352     }
getTrailCCFromCompYesAndZeroCC(int norm16)1353     int getTrailCCFromCompYesAndZeroCC(int norm16) {
1354         if(norm16<=minYesNo) {
1355             return 0;  // yesYes and Hangul LV have ccc=tccc=0
1356         } else {
1357             // For Hangul LVT we harmlessly fetch a firstUnit with tccc=0 here.
1358             return extraData.charAt(norm16>>OFFSET_SHIFT)>>8;  // tccc from yesNo
1359         }
1360     }
1361 
1362     // Requires algorithmic-NoNo.
mapAlgorithmic(int c, int norm16)1363     private int mapAlgorithmic(int c, int norm16) {
1364         return c+(norm16>>DELTA_SHIFT)-centerNoNoDelta;
1365     }
1366 
1367     // Requires minYesNo<norm16<limitNoNo.
1368     // private int getMapping(int norm16) { return extraData+(norm16>>OFFSET_SHIFT); }
1369 
1370     /**
1371      * @return index into maybeYesCompositions, or -1
1372      */
getCompositionsListForDecompYes(int norm16)1373     private int getCompositionsListForDecompYes(int norm16) {
1374         if(norm16<JAMO_L || MIN_NORMAL_MAYBE_YES<=norm16) {
1375             return -1;
1376         } else {
1377             if((norm16-=minMaybeYes)<0) {
1378                 // norm16<minMaybeYes: index into extraData which is a substring at
1379                 //     maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes]
1380                 // same as (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16
1381                 norm16+=MIN_NORMAL_MAYBE_YES;  // for yesYes; if Jamo L: harmless empty list
1382             }
1383             return norm16>>OFFSET_SHIFT;
1384         }
1385     }
1386     /**
1387      * @return index into maybeYesCompositions
1388      */
getCompositionsListForComposite(int norm16)1389     private int getCompositionsListForComposite(int norm16) {
1390         // A composite has both mapping & compositions list.
1391         int list=((MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16)>>OFFSET_SHIFT;
1392         int firstUnit=maybeYesCompositions.charAt(list);
1393         return list+  // mapping in maybeYesCompositions
1394             1+  // +1 to skip the first unit with the mapping length
1395             (firstUnit&MAPPING_LENGTH_MASK);  // + mapping length
1396     }
1397 
1398     // Decompose a short piece of text which is likely to contain characters that
1399     // fail the quick check loop and/or where the quick check loop's overhead
1400     // is unlikely to be amortized.
1401     // Called by the compose() and makeFCD() implementations.
1402     // Public in Java for collation implementation code.
decomposeShort( CharSequence s, int src, int limit, boolean stopAtCompBoundary, boolean onlyContiguous, ReorderingBuffer buffer)1403     private int decomposeShort(
1404             CharSequence s, int src, int limit,
1405             boolean stopAtCompBoundary, boolean onlyContiguous,
1406             ReorderingBuffer buffer) {
1407         while(src<limit) {
1408             int c=Character.codePointAt(s, src);
1409             if (stopAtCompBoundary && c < minCompNoMaybeCP) {
1410                 return src;
1411             }
1412             int norm16 = getNorm16(c);
1413             if (stopAtCompBoundary && norm16HasCompBoundaryBefore(norm16)) {
1414                 return src;
1415             }
1416             src+=Character.charCount(c);
1417             decompose(c, norm16, buffer);
1418             if (stopAtCompBoundary && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
1419                 return src;
1420             }
1421         }
1422         return src;
1423     }
decompose(int c, int norm16, ReorderingBuffer buffer)1424     private void decompose(int c, int norm16, ReorderingBuffer buffer) {
1425         // get the decomposition and the lead and trail cc's
1426         if (norm16 >= limitNoNo) {
1427             if (isMaybeOrNonZeroCC(norm16)) {
1428                 buffer.append(c, getCCFromYesOrMaybe(norm16));
1429                 return;
1430             }
1431             // Maps to an isCompYesAndZeroCC.
1432             c=mapAlgorithmic(c, norm16);
1433             norm16=getNorm16(c);
1434         }
1435         if (norm16 < minYesNo) {
1436             // c does not decompose
1437             buffer.append(c, 0);
1438         } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
1439             // Hangul syllable: decompose algorithmically
1440             Hangul.decompose(c, buffer);
1441         } else {
1442             // c decomposes, get everything from the variable-length extra data
1443             int mapping=norm16>>OFFSET_SHIFT;
1444             int firstUnit=extraData.charAt(mapping);
1445             int length=firstUnit&MAPPING_LENGTH_MASK;
1446             int leadCC, trailCC;
1447             trailCC=firstUnit>>8;
1448             if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
1449                 leadCC=extraData.charAt(mapping-1)>>8;
1450             } else {
1451                 leadCC=0;
1452             }
1453             ++mapping;  // skip over the firstUnit
1454             buffer.append(extraData, mapping, mapping+length, leadCC, trailCC);
1455         }
1456     }
1457 
1458     /**
1459      * Finds the recomposition result for
1460      * a forward-combining "lead" character,
1461      * specified with a pointer to its compositions list,
1462      * and a backward-combining "trail" character.
1463      *
1464      * <p>If the lead and trail characters combine, then this function returns
1465      * the following "compositeAndFwd" value:
1466      * <pre>
1467      * Bits 21..1  composite character
1468      * Bit      0  set if the composite is a forward-combining starter
1469      * </pre>
1470      * otherwise it returns -1.
1471      *
1472      * <p>The compositions list has (trail, compositeAndFwd) pair entries,
1473      * encoded as either pairs or triples of 16-bit units.
1474      * The last entry has the high bit of its first unit set.
1475      *
1476      * <p>The list is sorted by ascending trail characters (there are no duplicates).
1477      * A linear search is used.
1478      *
1479      * <p>See normalizer2impl.h for a more detailed description
1480      * of the compositions list format.
1481      */
combine(String compositions, int list, int trail)1482     private static int combine(String compositions, int list, int trail) {
1483         int key1, firstUnit;
1484         if(trail<COMP_1_TRAIL_LIMIT) {
1485             // trail character is 0..33FF
1486             // result entry may have 2 or 3 units
1487             key1=(trail<<1);
1488             while(key1>(firstUnit=compositions.charAt(list))) {
1489                 list+=2+(firstUnit&COMP_1_TRIPLE);
1490             }
1491             if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
1492                 if((firstUnit&COMP_1_TRIPLE)!=0) {
1493                     return (compositions.charAt(list+1)<<16)|compositions.charAt(list+2);
1494                 } else {
1495                     return compositions.charAt(list+1);
1496                 }
1497             }
1498         } else {
1499             // trail character is 3400..10FFFF
1500             // result entry has 3 units
1501             key1=COMP_1_TRAIL_LIMIT+(((trail>>COMP_1_TRAIL_SHIFT))&~COMP_1_TRIPLE);
1502             int key2=(trail<<COMP_2_TRAIL_SHIFT)&0xffff;
1503             int secondUnit;
1504             for(;;) {
1505                 if(key1>(firstUnit=compositions.charAt(list))) {
1506                     list+=2+(firstUnit&COMP_1_TRIPLE);
1507                 } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
1508                     if(key2>(secondUnit=compositions.charAt(list+1))) {
1509                         if((firstUnit&COMP_1_LAST_TUPLE)!=0) {
1510                             break;
1511                         } else {
1512                             list+=3;
1513                         }
1514                     } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
1515                         return ((secondUnit&~COMP_2_TRAIL_MASK)<<16)|compositions.charAt(list+2);
1516                     } else {
1517                         break;
1518                     }
1519                 } else {
1520                     break;
1521                 }
1522             }
1523         }
1524         return -1;
1525     }
1526 
1527     /*
1528      * Recomposes the buffer text starting at recomposeStartIndex
1529      * (which is in NFD - decomposed and canonically ordered),
1530      * and truncates the buffer contents.
1531      *
1532      * Note that recomposition never lengthens the text:
1533      * Any character consists of either one or two code units;
1534      * a composition may contain at most one more code unit than the original starter,
1535      * while the combining mark that is removed has at least one code unit.
1536      */
recompose(ReorderingBuffer buffer, int recomposeStartIndex, boolean onlyContiguous)1537     private void recompose(ReorderingBuffer buffer, int recomposeStartIndex,
1538                            boolean onlyContiguous) {
1539         StringBuilder sb=buffer.getStringBuilder();
1540         int p=recomposeStartIndex;
1541         if(p==sb.length()) {
1542             return;
1543         }
1544 
1545         int starter, pRemove;
1546         int compositionsList;
1547         int c, compositeAndFwd;
1548         int norm16;
1549         int cc, prevCC;
1550         boolean starterIsSupplementary;
1551 
1552         // Some of the following variables are not used until we have a forward-combining starter
1553         // and are only initialized now to avoid compiler warnings.
1554         compositionsList=-1;  // used as indicator for whether we have a forward-combining starter
1555         starter=-1;
1556         starterIsSupplementary=false;
1557         prevCC=0;
1558 
1559         for(;;) {
1560             c=sb.codePointAt(p);
1561             p+=Character.charCount(c);
1562             norm16=getNorm16(c);
1563             cc=getCCFromYesOrMaybe(norm16);
1564             if( // this character combines backward and
1565                 isMaybe(norm16) &&
1566                 // we have seen a starter that combines forward and
1567                 compositionsList>=0 &&
1568                 // the backward-combining character is not blocked
1569                 (prevCC<cc || prevCC==0)
1570             ) {
1571                 if(isJamoVT(norm16)) {
1572                     // c is a Jamo V/T, see if we can compose it with the previous character.
1573                     if(c<Hangul.JAMO_T_BASE) {
1574                         // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
1575                         char prev=(char)(sb.charAt(starter)-Hangul.JAMO_L_BASE);
1576                         if(prev<Hangul.JAMO_L_COUNT) {
1577                             pRemove=p-1;
1578                             char syllable=(char)
1579                                 (Hangul.HANGUL_BASE+
1580                                  (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))*
1581                                  Hangul.JAMO_T_COUNT);
1582                             char t;
1583                             if(p!=sb.length() && (t=(char)(sb.charAt(p)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) {
1584                                 ++p;
1585                                 syllable+=t;  // The next character was a Jamo T.
1586                             }
1587                             sb.setCharAt(starter, syllable);
1588                             // remove the Jamo V/T
1589                             sb.delete(pRemove, p);
1590                             p=pRemove;
1591                         }
1592                     }
1593                     /*
1594                      * No "else" for Jamo T:
1595                      * Since the input is in NFD, there are no Hangul LV syllables that
1596                      * a Jamo T could combine with.
1597                      * All Jamo Ts are combined above when handling Jamo Vs.
1598                      */
1599                     if(p==sb.length()) {
1600                         break;
1601                     }
1602                     compositionsList=-1;
1603                     continue;
1604                 } else if((compositeAndFwd=combine(maybeYesCompositions, compositionsList, c))>=0) {
1605                     // The starter and the combining mark (c) do combine.
1606                     int composite=compositeAndFwd>>1;
1607 
1608                     // Remove the combining mark.
1609                     pRemove=p-Character.charCount(c);  // pRemove & p: start & limit of the combining mark
1610                     sb.delete(pRemove, p);
1611                     p=pRemove;
1612                     // Replace the starter with the composite.
1613                     if(starterIsSupplementary) {
1614                         if(composite>0xffff) {
1615                             // both are supplementary
1616                             sb.setCharAt(starter, UTF16.getLeadSurrogate(composite));
1617                             sb.setCharAt(starter+1, UTF16.getTrailSurrogate(composite));
1618                         } else {
1619                             sb.setCharAt(starter, (char)c);
1620                             sb.deleteCharAt(starter+1);
1621                             // The composite is shorter than the starter,
1622                             // move the intermediate characters forward one.
1623                             starterIsSupplementary=false;
1624                             --p;
1625                         }
1626                     } else if(composite>0xffff) {
1627                         // The composite is longer than the starter,
1628                         // move the intermediate characters back one.
1629                         starterIsSupplementary=true;
1630                         sb.setCharAt(starter, UTF16.getLeadSurrogate(composite));
1631                         sb.insert(starter+1, UTF16.getTrailSurrogate(composite));
1632                         ++p;
1633                     } else {
1634                         // both are on the BMP
1635                         sb.setCharAt(starter, (char)composite);
1636                     }
1637 
1638                     // Keep prevCC because we removed the combining mark.
1639 
1640                     if(p==sb.length()) {
1641                         break;
1642                     }
1643                     // Is the composite a starter that combines forward?
1644                     if((compositeAndFwd&1)!=0) {
1645                         compositionsList=
1646                             getCompositionsListForComposite(getNorm16(composite));
1647                     } else {
1648                         compositionsList=-1;
1649                     }
1650 
1651                     // We combined; continue with looking for compositions.
1652                     continue;
1653                 }
1654             }
1655 
1656             // no combination this time
1657             prevCC=cc;
1658             if(p==sb.length()) {
1659                 break;
1660             }
1661 
1662             // If c did not combine, then check if it is a starter.
1663             if(cc==0) {
1664                 // Found a new starter.
1665                 if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) {
1666                     // It may combine with something, prepare for it.
1667                     if(c<=0xffff) {
1668                         starterIsSupplementary=false;
1669                         starter=p-1;
1670                     } else {
1671                         starterIsSupplementary=true;
1672                         starter=p-2;
1673                     }
1674                 }
1675             } else if(onlyContiguous) {
1676                 // FCC: no discontiguous compositions; any intervening character blocks.
1677                 compositionsList=-1;
1678             }
1679         }
1680         buffer.flush();
1681     }
1682 
1683     /**
1684      * Does c have a composition boundary before it?
1685      * True if its decomposition begins with a character that has
1686      * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).
1687      * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes
1688      * (isCompYesAndZeroCC()) so we need not decompose.
1689      */
hasCompBoundaryBefore(int c, int norm16)1690     private boolean hasCompBoundaryBefore(int c, int norm16) {
1691         return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(norm16);
1692     }
norm16HasCompBoundaryBefore(int norm16)1693     private boolean norm16HasCompBoundaryBefore(int norm16) {
1694         return norm16 < minNoNoCompNoMaybeCC || isAlgorithmicNoNo(norm16);
1695     }
hasCompBoundaryBefore(CharSequence s, int src, int limit)1696     private boolean hasCompBoundaryBefore(CharSequence s, int src, int limit) {
1697         return src == limit || hasCompBoundaryBefore(Character.codePointAt(s, src));
1698     }
norm16HasCompBoundaryAfter(int norm16, boolean onlyContiguous)1699     private boolean norm16HasCompBoundaryAfter(int norm16, boolean onlyContiguous) {
1700         return (norm16 & HAS_COMP_BOUNDARY_AFTER) != 0 &&
1701             (!onlyContiguous || isTrailCC01ForCompBoundaryAfter(norm16));
1702     }
hasCompBoundaryAfter(CharSequence s, int start, int p, boolean onlyContiguous)1703     private boolean hasCompBoundaryAfter(CharSequence s, int start, int p, boolean onlyContiguous) {
1704         return start == p || hasCompBoundaryAfter(Character.codePointBefore(s, p), onlyContiguous);
1705     }
1706     /** For FCC: Given norm16 HAS_COMP_BOUNDARY_AFTER, does it have tccc<=1? */
isTrailCC01ForCompBoundaryAfter(int norm16)1707     private boolean isTrailCC01ForCompBoundaryAfter(int norm16) {
1708         return isInert(norm16) || (isDecompNoAlgorithmic(norm16) ?
1709             (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1 : extraData.charAt(norm16 >> OFFSET_SHIFT) <= 0x1ff);
1710     }
1711 
findPreviousCompBoundary(CharSequence s, int p, boolean onlyContiguous)1712     private int findPreviousCompBoundary(CharSequence s, int p, boolean onlyContiguous) {
1713         while(p>0) {
1714             int c=Character.codePointBefore(s, p);
1715             int norm16 = getNorm16(c);
1716             if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
1717                 break;
1718             }
1719             p-=Character.charCount(c);
1720             if(hasCompBoundaryBefore(c, norm16)) {
1721                 break;
1722             }
1723         }
1724         return p;
1725     }
findNextCompBoundary(CharSequence s, int p, int limit, boolean onlyContiguous)1726     private int findNextCompBoundary(CharSequence s, int p, int limit, boolean onlyContiguous) {
1727         while(p<limit) {
1728             int c=Character.codePointAt(s, p);
1729             int norm16=normTrie.get(c);
1730             if(hasCompBoundaryBefore(c, norm16)) {
1731                 break;
1732             }
1733             p+=Character.charCount(c);
1734             if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
1735                 break;
1736             }
1737         }
1738         return p;
1739     }
1740 
1741 
findNextFCDBoundary(CharSequence s, int p, int limit)1742     private int findNextFCDBoundary(CharSequence s, int p, int limit) {
1743         while(p<limit) {
1744             int c=Character.codePointAt(s, p);
1745             int norm16;
1746             if (c < minLcccCP || norm16HasDecompBoundaryBefore(norm16 = getNorm16(c))) {
1747                 break;
1748             }
1749             p+=Character.charCount(c);
1750             if (norm16HasDecompBoundaryAfter(norm16)) {
1751                 break;
1752             }
1753         }
1754         return p;
1755     }
1756 
1757     /**
1758      * Get the canonical decomposition
1759      * sherman  for ComposedCharIter
1760      */
getDecompose(int chars[], String decomps[])1761     public static int getDecompose(int chars[], String decomps[]) {
1762         Normalizer2 impl = Normalizer2.getNFDInstance();
1763 
1764         int length=0;
1765         int norm16 = 0;
1766         int ch = -1;
1767         int i = 0;
1768 
1769         while (++ch < 0x2fa1e) {   //no cannoical above 0x3ffff
1770             //TBD !!!! the hack code heres save us about 50ms for startup
1771             //need a better solution/lookup
1772             if (ch == 0x30ff)
1773                 ch = 0xf900;
1774             else if (ch == 0x115bc)
1775                 ch = 0x1d15e;
1776             else if (ch == 0x1d1c1)
1777                 ch = 0x2f800;
1778 
1779             String s = impl.getDecomposition(ch);
1780 
1781             if(s != null && i < chars.length) {
1782                 chars[i] = ch;
1783                 decomps[i++] = s;
1784             }
1785         }
1786         return i;
1787     }
1788 
1789     //------------------------------------------------------
1790     // special method for Collation (RBTableBuilder.build())
1791     //------------------------------------------------------
needSingleQuotation(char c)1792     private static boolean needSingleQuotation(char c) {
1793         return (c >= 0x0009 && c <= 0x000D) ||
1794                (c >= 0x0020 && c <= 0x002F) ||
1795                (c >= 0x003A && c <= 0x0040) ||
1796                (c >= 0x005B && c <= 0x0060) ||
1797                (c >= 0x007B && c <= 0x007E);
1798     }
1799 
canonicalDecomposeWithSingleQuotation(String string)1800     public static String canonicalDecomposeWithSingleQuotation(String string) {
1801        Normalizer2 impl = Normalizer2.getNFDInstance();
1802        char[] src = string.toCharArray();
1803        int    srcIndex = 0;
1804        int    srcLimit = src.length;
1805        char[] dest = new char[src.length * 3];  //MAX_BUF_SIZE_DECOMPOSE = 3
1806        int    destIndex = 0;
1807        int    destLimit = dest.length;
1808 
1809         int prevSrc;
1810         String norm;
1811         int reorderStartIndex, length;
1812         char c1, c2;
1813         int cp;
1814         int minNoMaybe = 0x00c0;
1815         int cc, prevCC, trailCC;
1816         char[] p;
1817         int pStart;
1818 
1819         // initialize
1820         reorderStartIndex = 0;
1821         prevCC = 0;
1822         norm = null;
1823         cp = 0;
1824         pStart = 0;
1825 
1826         cc = trailCC = -1; // initialize to bogus value
1827         c1 = 0;
1828         for (;;) {
1829             prevSrc=srcIndex;
1830             //quick check (1)less than minNoMaybe (2)no decomp (3)hangual
1831             while (srcIndex != srcLimit &&
1832                    ((c1 = src[srcIndex]) < minNoMaybe ||
1833                     (norm = impl.getDecomposition(cp = string.codePointAt(srcIndex))) == null ||
1834                     (c1 >= '\uac00' && c1 <= '\ud7a3'))) { // Hangul Syllables
1835                 prevCC = 0;
1836                 srcIndex += (cp < 0x10000) ? 1 : 2;
1837             }
1838 
1839             // copy these code units all at once
1840             if (srcIndex != prevSrc) {
1841                 length = srcIndex - prevSrc;
1842                 if ((destIndex + length) <= destLimit) {
1843                     System.arraycopy(src,prevSrc,dest,destIndex,length);
1844                 }
1845 
1846                 destIndex += length;
1847                 reorderStartIndex = destIndex;
1848             }
1849 
1850             // end of source reached?
1851             if (srcIndex == srcLimit) {
1852                 break;
1853             }
1854 
1855             // cp already contains *src and norm32 is set for it, increment src
1856             srcIndex += (cp < 0x10000) ? 1 : 2;
1857 
1858             if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
1859                 c2 = 0;
1860                 length = 1;
1861 
1862                 if (Character.isHighSurrogate(c1)
1863                     || Character.isLowSurrogate(c1)) {
1864                     norm = null;
1865                 }
1866             } else {
1867                 length = 2;
1868                 c2 = src[srcIndex-1];
1869             }
1870 
1871           // get the decomposition and the lead and trail cc's
1872           if (norm == null) {
1873               // cp does not decompose
1874               cc = trailCC = UCharacter.getCombiningClass(cp);
1875               p = null;
1876               pStart = -1;
1877           } else {
1878 
1879                 pStart = 0;
1880                 p = norm.toCharArray();
1881                 length = p.length;
1882                 int cpNum = norm.codePointCount(0, length);
1883                 cc= UCharacter.getCombiningClass(norm.codePointAt(0));
1884                 trailCC= UCharacter.getCombiningClass(norm.codePointAt(cpNum-1));
1885                 if (length == 1) {
1886                     // fastpath a single code unit from decomposition
1887                     c1 = p[pStart];
1888                     c2 = 0;
1889                     p = null;
1890                     pStart = -1;
1891                 }
1892             }
1893 
1894             if((destIndex + length * 3) >= destLimit) {  // 2 SingleQuotations
1895                 // buffer overflow
1896                 char[] tmpBuf = new char[destLimit * 2];
1897                 System.arraycopy(dest, 0, tmpBuf, 0, destIndex);
1898                 dest = tmpBuf;
1899                 destLimit = dest.length;
1900             }
1901 
1902             // append the decomposition to the destination buffer, assume length>0
1903             {
1904                 int reorderSplit = destIndex;
1905                 if (p == null) {
1906                     // fastpath: single code point
1907                     if (needSingleQuotation(c1)) {
1908                         //if we need single quotation, no need to consider "prevCC"
1909                         //and it must NOT be a supplementary pair
1910                         dest[destIndex++] = '\'';
1911                         dest[destIndex++] = c1;
1912                         dest[destIndex++] = '\'';
1913                         trailCC = 0;
1914                     } else if(cc != 0 && cc < prevCC) {
1915                         // (c1, c2) is out of order with respect to the preceding
1916                         //  text
1917                         destIndex += length;
1918                         trailCC = insertOrdered(dest, reorderStartIndex,
1919                                                 reorderSplit, destIndex, c1, c2, cc);
1920                     } else {
1921                         // just append (c1, c2)
1922                         dest[destIndex++] = c1;
1923                         if(c2 != 0) {
1924                             dest[destIndex++] = c2;
1925                         }
1926                     }
1927                 } else {
1928                     // general: multiple code points (ordered by themselves)
1929                     // from decomposition
1930                     if (needSingleQuotation(p[pStart])) {
1931                         dest[destIndex++] = '\'';
1932                         dest[destIndex++] = p[pStart++];
1933                         dest[destIndex++] = '\'';
1934                         length--;
1935                         do {
1936                             dest[destIndex++] = p[pStart++];
1937                         } while(--length > 0);
1938                     } else if (cc != 0 && cc < prevCC) {
1939                         destIndex += length;
1940                         trailCC = mergeOrdered(dest, reorderStartIndex,
1941                                                reorderSplit, p, pStart,
1942                                                pStart+length);
1943                     } else {
1944                         // just append the decomposition
1945                         do {
1946                             dest[destIndex++] = p[pStart++];
1947                         } while (--length > 0);
1948                     }
1949                 }
1950             }
1951             prevCC = trailCC;
1952             if(prevCC == 0) {
1953                 reorderStartIndex = destIndex;
1954             }
1955         }
1956 
1957         return new String(dest, 0, destIndex);
1958     }
1959 
1960     /**
1961      * simpler, single-character version of mergeOrdered() -
1962      * bubble-insert one single code point into the preceding string
1963      * which is already canonically ordered
1964      * (c, c2) may or may not yet have been inserted at src[current]..src[p]
1965      *
1966      * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2)
1967      *
1968      * before: src[start]..src[current] is already ordered, and
1969      *         src[current]..src[p]     may or may not hold (c, c2) but
1970      *                          must be exactly the same length as (c, c2)
1971      * after: src[start]..src[p] is ordered
1972      *
1973      * @return the trailing combining class
1974      */
insertOrdered(char[] source, int start, int current, int p, char c1, char c2, int cc)1975     private static int/*unsigned byte*/ insertOrdered(char[] source,
1976                                                       int start,
1977                                                       int current, int p,
1978                                                       char c1, char c2,
1979                                                       int/*unsigned byte*/ cc) {
1980         int back, preBack;
1981         int r;
1982         int prevCC, trailCC=cc;
1983 
1984         if (start<current && cc!=0) {
1985             // search for the insertion point where cc>=prevCC
1986             preBack=back=current;
1987 
1988             PrevArgs prevArgs = new PrevArgs();
1989             prevArgs.current  = current;
1990             prevArgs.start    = start;
1991             prevArgs.src      = source;
1992             prevArgs.c1       = c1;
1993             prevArgs.c2       = c2;
1994 
1995             // get the prevCC
1996             prevCC=getPrevCC(prevArgs);
1997             preBack = prevArgs.current;
1998 
1999             if(cc<prevCC) {
2000                 // this will be the last code point, so keep its cc
2001                 trailCC=prevCC;
2002                 back=preBack;
2003                 while(start<preBack) {
2004                     prevCC=getPrevCC(prevArgs);
2005                     preBack=prevArgs.current;
2006                     if(cc>=prevCC) {
2007                         break;
2008                     }
2009                     back=preBack;
2010                 }
2011 
2012                 // this is where we are right now with all these indicies:
2013                 // [start]..[pPreBack] 0..? code points that we can ignore
2014                 // [pPreBack]..[pBack] 0..1 code points with prevCC<=cc
2015                 // [pBack]..[current] 0..n code points with >cc, move up to insert (c, c2)
2016                 // [current]..[p]         1 code point (c, c2) with cc
2017 
2018                 // move the code units in between up
2019                 r=p;
2020                 do {
2021                     source[--r]=source[--current];
2022                 } while (back!=current);
2023             }
2024         }
2025 
2026         // insert (c1, c2)
2027         source[current] = c1;
2028         if (c2!=0) {
2029             source[(current+1)] = c2;
2030         }
2031 
2032         // we know the cc of the last code point
2033         return trailCC;
2034     }
2035     /**
2036      * merge two UTF-16 string parts together
2037      * to canonically order (order by combining classes) their concatenation
2038      *
2039      * the two strings may already be adjacent, so that the merging is done
2040      * in-place if the two strings are not adjacent, then the buffer holding the
2041      * first one must be large enough
2042      * the second string may or may not be ordered in itself
2043      *
2044      * before: [start]..[current] is already ordered, and
2045      *         [next]..[limit]    may be ordered in itself, but
2046      *                          is not in relation to [start..current[
2047      * after: [start..current+(limit-next)[ is ordered
2048      *
2049      * the algorithm is a simple bubble-sort that takes the characters from
2050      * src[next++] and inserts them in correct combining class order into the
2051      * preceding part of the string
2052      *
2053      * since this function is called much less often than the single-code point
2054      * insertOrdered(), it just uses that for easier maintenance
2055      *
2056      * @return the trailing combining class
2057      */
mergeOrdered(char[] source, int start, int current, char[] data, int next, int limit)2058     private static int /*unsigned byte*/ mergeOrdered(char[] source,
2059                                                       int start,
2060                                                       int current,
2061                                                       char[] data,
2062                                                         int next,
2063                                                         int limit) {
2064             int r;
2065             int /*unsigned byte*/ cc, trailCC=0;
2066             boolean adjacent;
2067 
2068             adjacent= current==next;
2069             NextCCArgs ncArgs = new NextCCArgs();
2070             ncArgs.source = data;
2071             ncArgs.next   = next;
2072             ncArgs.limit  = limit;
2073 
2074             if(start!=current) {
2075 
2076                 while(ncArgs.next<ncArgs.limit) {
2077                     cc=getNextCC(ncArgs);
2078                     if(cc==0) {
2079                         // does not bubble back
2080                         trailCC=0;
2081                         if(adjacent) {
2082                             current=ncArgs.next;
2083                         } else {
2084                             data[current++]=ncArgs.c1;
2085                             if(ncArgs.c2!=0) {
2086                                 data[current++]=ncArgs.c2;
2087                             }
2088                         }
2089                         break;
2090                     } else {
2091                         r=current+(ncArgs.c2==0 ? 1 : 2);
2092                         trailCC=insertOrdered(source,start, current, r,
2093                                               ncArgs.c1, ncArgs.c2, cc);
2094                         current=r;
2095                     }
2096                 }
2097             }
2098 
2099             if(ncArgs.next==ncArgs.limit) {
2100                 // we know the cc of the last code point
2101                 return trailCC;
2102             } else {
2103                 if(!adjacent) {
2104                     // copy the second string part
2105                     do {
2106                         source[current++]=data[ncArgs.next++];
2107                     } while(ncArgs.next!=ncArgs.limit);
2108                     ncArgs.limit=current;
2109                 }
2110                 PrevArgs prevArgs = new PrevArgs();
2111                 prevArgs.src   = data;
2112                 prevArgs.start = start;
2113                 prevArgs.current =  ncArgs.limit;
2114                 return getPrevCC(prevArgs);
2115             }
2116 
2117     }
2118     private static final class PrevArgs{
2119         char[] src;
2120         int start;
2121         int current;
2122         char c1;
2123         char c2;
2124     }
2125 
2126     private static final class NextCCArgs{
2127         char[] source;
2128         int next;
2129         int limit;
2130         char c1;
2131         char c2;
2132     }
getNextCC(NextCCArgs args)2133     private static int /*unsigned byte*/ getNextCC(NextCCArgs args) {
2134         args.c1=args.source[args.next++];
2135         args.c2=0;
2136 
2137         if (UTF16.isTrailSurrogate(args.c1)) {
2138             /* unpaired second surrogate */
2139             return 0;
2140         } else if (!UTF16.isLeadSurrogate(args.c1)) {
2141             return UCharacter.getCombiningClass(args.c1);
2142         } else if (args.next!=args.limit &&
2143                         UTF16.isTrailSurrogate(args.c2=args.source[args.next])){
2144             ++args.next;
2145             return UCharacter.getCombiningClass(Character.toCodePoint(args.c1, args.c2));
2146         } else {
2147             /* unpaired first surrogate */
2148             args.c2=0;
2149             return 0;
2150         }
2151     }
getPrevCC(PrevArgs args)2152     private static int /*unsigned*/ getPrevCC(PrevArgs args) {
2153         args.c1=args.src[--args.current];
2154         args.c2=0;
2155 
2156         if (args.c1 < MIN_CCC_LCCC_CP) {
2157             return 0;
2158         } else if (UTF16.isLeadSurrogate(args.c1)) {
2159             /* unpaired first surrogate */
2160             return 0;
2161         } else if (!UTF16.isTrailSurrogate(args.c1)) {
2162             return UCharacter.getCombiningClass(args.c1);
2163         } else if (args.current!=args.start &&
2164                     UTF16.isLeadSurrogate(args.c2=args.src[args.current-1])) {
2165             --args.current;
2166             return UCharacter.getCombiningClass(Character.toCodePoint(args.c2, args.c1));
2167         } else {
2168             /* unpaired second surrogate */
2169             args.c2=0;
2170             return 0;
2171         }
2172     }
2173 
getPreviousTrailCC(CharSequence s, int start, int p)2174     private int getPreviousTrailCC(CharSequence s, int start, int p) {
2175         if (start == p) {
2176             return 0;
2177         }
2178         return getFCD16(Character.codePointBefore(s, p));
2179     }
2180 
2181     private VersionInfo dataVersion;
2182 
2183     // BMP code point thresholds for quick check loops looking at single UTF-16 code units.
2184     private int minDecompNoCP;
2185     private int minCompNoMaybeCP;
2186     private int minLcccCP;
2187 
2188     // Norm16 value thresholds for quick check combinations and types of extra data.
2189     private int minYesNo;
2190     private int minYesNoMappingsOnly;
2191     private int minNoNo;
2192     private int minNoNoCompBoundaryBefore;
2193     private int minNoNoCompNoMaybeCC;
2194     private int minNoNoEmpty;
2195     private int limitNoNo;
2196     private int centerNoNoDelta;
2197     private int minMaybeYes;
2198 
2199     private Trie2_16 normTrie;
2200     private String maybeYesCompositions;
2201     private String extraData;  // mappings and/or compositions for yesYes, yesNo & noNo characters
2202     private byte[] smallFCD;  // [0x100] one bit per 32 BMP code points, set if any FCD!=0
2203 
2204    }
2205