1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2009-2014, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  normalizer2impl.cpp
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2009nov22
16 *   created by: Markus W. Scherer
17 */
18 
19 // #define UCPTRIE_DEBUG
20 
21 #include "unicode/utypes.h"
22 
23 #if !UCONFIG_NO_NORMALIZATION
24 
25 #include "unicode/bytestream.h"
26 #include "unicode/edits.h"
27 #include "unicode/normalizer2.h"
28 #include "unicode/stringoptions.h"
29 #include "unicode/ucptrie.h"
30 #include "unicode/udata.h"
31 #include "unicode/umutablecptrie.h"
32 #include "unicode/ustring.h"
33 #include "unicode/utf16.h"
34 #include "unicode/utf8.h"
35 #include "bytesinkutil.h"
36 #include "cmemory.h"
37 #include "mutex.h"
38 #include "normalizer2impl.h"
39 #include "putilimp.h"
40 #include "uassert.h"
41 #include "ucptrie_impl.h"
42 #include "uset_imp.h"
43 #include "uvector.h"
44 
45 U_NAMESPACE_BEGIN
46 
47 namespace {
48 
49 /**
50  * UTF-8 lead byte for minNoMaybeCP.
51  * Can be lower than the actual lead byte for c.
52  * Typically U+0300 for NFC/NFD, U+00A0 for NFKC/NFKD, U+0041 for NFKC_Casefold.
53  */
leadByteForCP(UChar32 c)54 inline uint8_t leadByteForCP(UChar32 c) {
55     if (c <= 0x7f) {
56         return (uint8_t)c;
57     } else if (c <= 0x7ff) {
58         return (uint8_t)(0xc0+(c>>6));
59     } else {
60         // Should not occur because ccc(U+0300)!=0.
61         return 0xe0;
62     }
63 }
64 
65 /**
66  * Returns the code point from one single well-formed UTF-8 byte sequence
67  * between cpStart and cpLimit.
68  *
69  * Trie UTF-8 macros do not assemble whole code points (for efficiency).
70  * When we do need the code point, we call this function.
71  * We should not need it for normalization-inert data (norm16==0).
72  * Illegal sequences yield the error value norm16==0 just like real normalization-inert code points.
73  */
codePointFromValidUTF8(const uint8_t * cpStart,const uint8_t * cpLimit)74 UChar32 codePointFromValidUTF8(const uint8_t *cpStart, const uint8_t *cpLimit) {
75     // Similar to U8_NEXT_UNSAFE(s, i, c).
76     U_ASSERT(cpStart < cpLimit);
77     uint8_t c = *cpStart;
78     switch(cpLimit-cpStart) {
79     case 1:
80         return c;
81     case 2:
82         return ((c&0x1f)<<6) | (cpStart[1]&0x3f);
83     case 3:
84         // no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar)
85         return (UChar)((c<<12) | ((cpStart[1]&0x3f)<<6) | (cpStart[2]&0x3f));
86     case 4:
87         return ((c&7)<<18) | ((cpStart[1]&0x3f)<<12) | ((cpStart[2]&0x3f)<<6) | (cpStart[3]&0x3f);
88     default:
89         UPRV_UNREACHABLE;  // Should not occur.
90 #ifdef U_STRINGI_PATCHES
91         return c;
92 #endif
93     }
94 }
95 
96 /**
97  * Returns the last code point in [start, p[ if it is valid and in U+1000..U+D7FF.
98  * Otherwise returns a negative value.
99  */
previousHangulOrJamo(const uint8_t * start,const uint8_t * p)100 UChar32 previousHangulOrJamo(const uint8_t *start, const uint8_t *p) {
101     if ((p - start) >= 3) {
102         p -= 3;
103         uint8_t l = *p;
104         uint8_t t1, t2;
105         if (0xe1 <= l && l <= 0xed &&
106                 (t1 = (uint8_t)(p[1] - 0x80)) <= 0x3f &&
107                 (t2 = (uint8_t)(p[2] - 0x80)) <= 0x3f &&
108                 (l < 0xed || t1 <= 0x1f)) {
109             return ((l & 0xf) << 12) | (t1 << 6) | t2;
110         }
111     }
112     return U_SENTINEL;
113 }
114 
/**
 * Checks whether [src, limit[ starts with a single Jamo T code point
 * (UTF-8 bytes E1 86 A8..E1 87 82).
 *
 * @param src start of the text
 * @param limit end of the text
 * @return the offset from the Jamo T base (U+11A7), or a negative value
 *         if the text does not start with a Jamo T
 */
int32_t getJamoTMinusBase(const uint8_t *src, const uint8_t *limit) {
    if ((limit - src) < 3 || src[0] != 0xe1) {
        return -1;
    }
    const uint8_t second = src[1];
    const uint8_t third = src[2];
    switch (second) {
    case 0x86:
        // The first Jamo T is U+11A8 but JAMO_T_BASE is 11A7.
        // Offset 0 does not correspond to any conjoining Jamo.
        if (0xa8 <= third && third <= 0xbf) {
            return third - 0xa7;
        }
        break;
    case 0x87:
        // Only trail bytes 80..82 continue the Jamo T range here.
        if (0x80 <= third && third <= 0x82) {
            return third - (0xa7 - 0x40);
        }
        break;
    default:
        break;
    }
    return -1;
}
138 
139 void
appendCodePointDelta(const uint8_t * cpStart,const uint8_t * cpLimit,int32_t delta,ByteSink & sink,Edits * edits)140 appendCodePointDelta(const uint8_t *cpStart, const uint8_t *cpLimit, int32_t delta,
141                      ByteSink &sink, Edits *edits) {
142     char buffer[U8_MAX_LENGTH];
143     int32_t length;
144     int32_t cpLength = (int32_t)(cpLimit - cpStart);
145     if (cpLength == 1) {
146         // The builder makes ASCII map to ASCII.
147         buffer[0] = (uint8_t)(*cpStart + delta);
148         length = 1;
149     } else {
150         int32_t trail = *(cpLimit-1) + delta;
151         if (0x80 <= trail && trail <= 0xbf) {
152             // The delta only changes the last trail byte.
153             --cpLimit;
154             length = 0;
155             do { buffer[length++] = *cpStart++; } while (cpStart < cpLimit);
156             buffer[length++] = (uint8_t)trail;
157         } else {
158             // Decode the code point, add the delta, re-encode.
159             UChar32 c = codePointFromValidUTF8(cpStart, cpLimit) + delta;
160             length = 0;
161             U8_APPEND_UNSAFE(buffer, length, c);
162         }
163     }
164     if (edits != nullptr) {
165         edits->addReplace(cpLength, length);
166     }
167     sink.Append(buffer, length);
168 }
169 
170 }  // namespace
171 
172 // ReorderingBuffer -------------------------------------------------------- ***
173 
ReorderingBuffer(const Normalizer2Impl & ni,UnicodeString & dest,UErrorCode & errorCode)174 ReorderingBuffer::ReorderingBuffer(const Normalizer2Impl &ni, UnicodeString &dest,
175                                    UErrorCode &errorCode) :
176         impl(ni), str(dest),
177         start(str.getBuffer(8)), reorderStart(start), limit(start),
178         remainingCapacity(str.getCapacity()), lastCC(0) {
179     if (start == nullptr && U_SUCCESS(errorCode)) {
180         // getBuffer() already did str.setToBogus()
181         errorCode = U_MEMORY_ALLOCATION_ERROR;
182     }
183 }
184 
init(int32_t destCapacity,UErrorCode & errorCode)185 UBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) {
186     int32_t length=str.length();
187     start=str.getBuffer(destCapacity);
188     if(start==NULL) {
189         // getBuffer() already did str.setToBogus()
190         errorCode=U_MEMORY_ALLOCATION_ERROR;
191         return FALSE;
192     }
193     limit=start+length;
194     remainingCapacity=str.getCapacity()-length;
195     reorderStart=start;
196     if(start==limit) {
197         lastCC=0;
198     } else {
199         setIterator();
200         lastCC=previousCC();
201         // Set reorderStart after the last code point with cc<=1 if there is one.
202         if(lastCC>1) {
203             while(previousCC()>1) {}
204         }
205         reorderStart=codePointLimit;
206     }
207     return TRUE;
208 }
209 
equals(const UChar * otherStart,const UChar * otherLimit) const210 UBool ReorderingBuffer::equals(const UChar *otherStart, const UChar *otherLimit) const {
211     int32_t length=(int32_t)(limit-start);
212     return
213         length==(int32_t)(otherLimit-otherStart) &&
214         0==u_memcmp(start, otherStart, length);
215 }
216 
equals(const uint8_t * otherStart,const uint8_t * otherLimit) const217 UBool ReorderingBuffer::equals(const uint8_t *otherStart, const uint8_t *otherLimit) const {
218     U_ASSERT((otherLimit - otherStart) <= INT32_MAX);  // ensured by caller
219     int32_t length = (int32_t)(limit - start);
220     int32_t otherLength = (int32_t)(otherLimit - otherStart);
221     // For equal strings, UTF-8 is at least as long as UTF-16, and at most three times as long.
222     if (otherLength < length || (otherLength / 3) > length) {
223         return FALSE;
224     }
225     // Compare valid strings from between normalization boundaries.
226     // (Invalid sequences are normalization-inert.)
227     for (int32_t i = 0, j = 0;;) {
228         if (i >= length) {
229             return j >= otherLength;
230         } else if (j >= otherLength) {
231             return FALSE;
232         }
233         // Not at the end of either string yet.
234         UChar32 c, other;
235         U16_NEXT_UNSAFE(start, i, c);
236         U8_NEXT_UNSAFE(otherStart, j, other);
237         if (c != other) {
238             return FALSE;
239         }
240     }
241 }
242 
appendSupplementary(UChar32 c,uint8_t cc,UErrorCode & errorCode)243 UBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode) {
244     if(remainingCapacity<2 && !resize(2, errorCode)) {
245         return FALSE;
246     }
247     if(lastCC<=cc || cc==0) {
248         limit[0]=U16_LEAD(c);
249         limit[1]=U16_TRAIL(c);
250         limit+=2;
251         lastCC=cc;
252         if(cc<=1) {
253             reorderStart=limit;
254         }
255     } else {
256         insert(c, cc);
257     }
258     remainingCapacity-=2;
259     return TRUE;
260 }
261 
append(const UChar * s,int32_t length,UBool isNFD,uint8_t leadCC,uint8_t trailCC,UErrorCode & errorCode)262 UBool ReorderingBuffer::append(const UChar *s, int32_t length, UBool isNFD,
263                                uint8_t leadCC, uint8_t trailCC,
264                                UErrorCode &errorCode) {
265     if(length==0) {
266         return TRUE;
267     }
268     if(remainingCapacity<length && !resize(length, errorCode)) {
269         return FALSE;
270     }
271     remainingCapacity-=length;
272     if(lastCC<=leadCC || leadCC==0) {
273         if(trailCC<=1) {
274             reorderStart=limit+length;
275         } else if(leadCC<=1) {
276             reorderStart=limit+1;  // Ok if not a code point boundary.
277         }
278         const UChar *sLimit=s+length;
279         do { *limit++=*s++; } while(s!=sLimit);
280         lastCC=trailCC;
281     } else {
282         int32_t i=0;
283         UChar32 c;
284         U16_NEXT(s, i, length, c);
285         insert(c, leadCC);  // insert first code point
286         while(i<length) {
287             U16_NEXT(s, i, length, c);
288             if(i<length) {
289                 if (isNFD) {
290                     leadCC = Normalizer2Impl::getCCFromYesOrMaybe(impl.getRawNorm16(c));
291                 } else {
292                     leadCC = impl.getCC(impl.getNorm16(c));
293                 }
294             } else {
295                 leadCC=trailCC;
296             }
297             append(c, leadCC, errorCode);
298         }
299     }
300     return TRUE;
301 }
302 
appendZeroCC(UChar32 c,UErrorCode & errorCode)303 UBool ReorderingBuffer::appendZeroCC(UChar32 c, UErrorCode &errorCode) {
304     int32_t cpLength=U16_LENGTH(c);
305     if(remainingCapacity<cpLength && !resize(cpLength, errorCode)) {
306         return FALSE;
307     }
308     remainingCapacity-=cpLength;
309     if(cpLength==1) {
310         *limit++=(UChar)c;
311     } else {
312         limit[0]=U16_LEAD(c);
313         limit[1]=U16_TRAIL(c);
314         limit+=2;
315     }
316     lastCC=0;
317     reorderStart=limit;
318     return TRUE;
319 }
320 
appendZeroCC(const UChar * s,const UChar * sLimit,UErrorCode & errorCode)321 UBool ReorderingBuffer::appendZeroCC(const UChar *s, const UChar *sLimit, UErrorCode &errorCode) {
322     if(s==sLimit) {
323         return TRUE;
324     }
325     int32_t length=(int32_t)(sLimit-s);
326     if(remainingCapacity<length && !resize(length, errorCode)) {
327         return FALSE;
328     }
329     u_memcpy(limit, s, length);
330     limit+=length;
331     remainingCapacity-=length;
332     lastCC=0;
333     reorderStart=limit;
334     return TRUE;
335 }
336 
remove()337 void ReorderingBuffer::remove() {
338     reorderStart=limit=start;
339     remainingCapacity=str.getCapacity();
340     lastCC=0;
341 }
342 
removeSuffix(int32_t suffixLength)343 void ReorderingBuffer::removeSuffix(int32_t suffixLength) {
344     if(suffixLength<(limit-start)) {
345         limit-=suffixLength;
346         remainingCapacity+=suffixLength;
347     } else {
348         limit=start;
349         remainingCapacity=str.getCapacity();
350     }
351     lastCC=0;
352     reorderStart=limit;
353 }
354 
resize(int32_t appendLength,UErrorCode & errorCode)355 UBool ReorderingBuffer::resize(int32_t appendLength, UErrorCode &errorCode) {
356     int32_t reorderStartIndex=(int32_t)(reorderStart-start);
357     int32_t length=(int32_t)(limit-start);
358     str.releaseBuffer(length);
359     int32_t newCapacity=length+appendLength;
360     int32_t doubleCapacity=2*str.getCapacity();
361     if(newCapacity<doubleCapacity) {
362         newCapacity=doubleCapacity;
363     }
364     if(newCapacity<256) {
365         newCapacity=256;
366     }
367     start=str.getBuffer(newCapacity);
368     if(start==NULL) {
369         // getBuffer() already did str.setToBogus()
370         errorCode=U_MEMORY_ALLOCATION_ERROR;
371         return FALSE;
372     }
373     reorderStart=start+reorderStartIndex;
374     limit=start+length;
375     remainingCapacity=str.getCapacity()-length;
376     return TRUE;
377 }
378 
skipPrevious()379 void ReorderingBuffer::skipPrevious() {
380     codePointLimit=codePointStart;
381     UChar c=*--codePointStart;
382     if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(*(codePointStart-1))) {
383         --codePointStart;
384     }
385 }
386 
previousCC()387 uint8_t ReorderingBuffer::previousCC() {
388     codePointLimit=codePointStart;
389     if(reorderStart>=codePointStart) {
390         return 0;
391     }
392     UChar32 c=*--codePointStart;
393     UChar c2;
394     if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(c2=*(codePointStart-1))) {
395         --codePointStart;
396         c=U16_GET_SUPPLEMENTARY(c2, c);
397     }
398     return impl.getCCFromYesOrMaybeCP(c);
399 }
400 
401 // Inserts c somewhere before the last character.
402 // Requires 0<cc<lastCC which implies reorderStart<limit.
insert(UChar32 c,uint8_t cc)403 void ReorderingBuffer::insert(UChar32 c, uint8_t cc) {
404     for(setIterator(), skipPrevious(); previousCC()>cc;) {}
405     // insert c at codePointLimit, after the character with prevCC<=cc
406     UChar *q=limit;
407     UChar *r=limit+=U16_LENGTH(c);
408     do {
409         *--r=*--q;
410     } while(codePointLimit!=q);
411     writeCodePoint(q, c);
412     if(cc<=1) {
413         reorderStart=r;
414     }
415 }
416 
417 // Normalizer2Impl --------------------------------------------------------- ***
418 
// Data object held by Normalizer2Impl in fCanonIterData (deleted in its destructor).
// NOTE(review): the member functions are only declared here; their behavior
// is defined elsewhere in the file.
struct CanonIterData : public UMemory {
    CanonIterData(UErrorCode &errorCode);
    ~CanonIterData();
    void addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode);
    UMutableCPTrie *mutableTrie;
    UCPTrie *trie;  // enumerated with ucptrie_getRange() by addCanonIterPropertyStarts()
    UVector canonStartSets;  // contains UnicodeSet *
};
427 
// Destructor: deletes the object that fCanonIterData points to.
Normalizer2Impl::~Normalizer2Impl() {
    delete fCanonIterData;
}
431 
432 void
init(const int32_t * inIndexes,const UCPTrie * inTrie,const uint16_t * inExtraData,const uint8_t * inSmallFCD)433 Normalizer2Impl::init(const int32_t *inIndexes, const UCPTrie *inTrie,
434                       const uint16_t *inExtraData, const uint8_t *inSmallFCD) {
435     minDecompNoCP = static_cast<UChar>(inIndexes[IX_MIN_DECOMP_NO_CP]);
436     minCompNoMaybeCP = static_cast<UChar>(inIndexes[IX_MIN_COMP_NO_MAYBE_CP]);
437     minLcccCP = static_cast<UChar>(inIndexes[IX_MIN_LCCC_CP]);
438 
439     minYesNo = static_cast<uint16_t>(inIndexes[IX_MIN_YES_NO]);
440     minYesNoMappingsOnly = static_cast<uint16_t>(inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY]);
441     minNoNo = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO]);
442     minNoNoCompBoundaryBefore = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]);
443     minNoNoCompNoMaybeCC = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC]);
444     minNoNoEmpty = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO_EMPTY]);
445     limitNoNo = static_cast<uint16_t>(inIndexes[IX_LIMIT_NO_NO]);
446     minMaybeYes = static_cast<uint16_t>(inIndexes[IX_MIN_MAYBE_YES]);
447     U_ASSERT((minMaybeYes & 7) == 0);  // 8-aligned for noNoDelta bit fields
448     centerNoNoDelta = (minMaybeYes >> DELTA_SHIFT) - MAX_DELTA - 1;
449 
450     normTrie=inTrie;
451 
452     maybeYesCompositions=inExtraData;
453     extraData=maybeYesCompositions+((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT);
454 
455     smallFCD=inSmallFCD;
456 }
457 
458 U_CDECL_BEGIN
459 
460 static uint32_t U_CALLCONV
segmentStarterMapper(const void *,uint32_t value)461 segmentStarterMapper(const void * /*context*/, uint32_t value) {
462     return value&CANON_NOT_SEGMENT_STARTER;
463 }
464 
465 U_CDECL_END
466 
467 void
addLcccChars(UnicodeSet & set) const468 Normalizer2Impl::addLcccChars(UnicodeSet &set) const {
469     UChar32 start = 0, end;
470     uint32_t norm16;
471     while ((end = ucptrie_getRange(normTrie, start, UCPMAP_RANGE_FIXED_LEAD_SURROGATES, INERT,
472                                    nullptr, nullptr, &norm16)) >= 0) {
473         if (norm16 > Normalizer2Impl::MIN_NORMAL_MAYBE_YES &&
474                 norm16 != Normalizer2Impl::JAMO_VT) {
475             set.add(start, end);
476         } else if (minNoNoCompNoMaybeCC <= norm16 && norm16 < limitNoNo) {
477             uint16_t fcd16 = getFCD16(start);
478             if (fcd16 > 0xff) { set.add(start, end); }
479         }
480         start = end + 1;
481     }
482 }
483 
484 void
addPropertyStarts(const USetAdder * sa,UErrorCode &) const485 Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const {
486     // Add the start code point of each same-value range of the trie.
487     UChar32 start = 0, end;
488     uint32_t value;
489     while ((end = ucptrie_getRange(normTrie, start, UCPMAP_RANGE_FIXED_LEAD_SURROGATES, INERT,
490                                    nullptr, nullptr, &value)) >= 0) {
491         sa->add(sa->set, start);
492         if (start != end && isAlgorithmicNoNo((uint16_t)value) &&
493                 (value & Normalizer2Impl::DELTA_TCCC_MASK) > Normalizer2Impl::DELTA_TCCC_1) {
494             // Range of code points with same-norm16-value algorithmic decompositions.
495             // They might have different non-zero FCD16 values.
496             uint16_t prevFCD16 = getFCD16(start);
497             while (++start <= end) {
498                 uint16_t fcd16 = getFCD16(start);
499                 if (fcd16 != prevFCD16) {
500                     sa->add(sa->set, start);
501                     prevFCD16 = fcd16;
502                 }
503             }
504         }
505         start = end + 1;
506     }
507 
508     /* add Hangul LV syllables and LV+1 because of skippables */
509     for(UChar c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) {
510         sa->add(sa->set, c);
511         sa->add(sa->set, c+1);
512     }
513     sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */
514 }
515 
516 void
addCanonIterPropertyStarts(const USetAdder * sa,UErrorCode & errorCode) const517 Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const {
518     // Add the start code point of each same-value range of the canonical iterator data trie.
519     if (!ensureCanonIterData(errorCode)) { return; }
520     // Currently only used for the SEGMENT_STARTER property.
521     UChar32 start = 0, end;
522     uint32_t value;
523     while ((end = ucptrie_getRange(fCanonIterData->trie, start, UCPMAP_RANGE_NORMAL, 0,
524                                    segmentStarterMapper, nullptr, &value)) >= 0) {
525         sa->add(sa->set, start);
526         start = end + 1;
527     }
528 }
529 
530 const UChar *
copyLowPrefixFromNulTerminated(const UChar * src,UChar32 minNeedDataCP,ReorderingBuffer * buffer,UErrorCode & errorCode) const531 Normalizer2Impl::copyLowPrefixFromNulTerminated(const UChar *src,
532                                                 UChar32 minNeedDataCP,
533                                                 ReorderingBuffer *buffer,
534                                                 UErrorCode &errorCode) const {
535     // Make some effort to support NUL-terminated strings reasonably.
536     // Take the part of the fast quick check loop that does not look up
537     // data and check the first part of the string.
538     // After this prefix, determine the string length to simplify the rest
539     // of the code.
540     const UChar *prevSrc=src;
541     UChar c;
542     while((c=*src++)<minNeedDataCP && c!=0) {}
543     // Back out the last character for full processing.
544     // Copy this prefix.
545     if(--src!=prevSrc) {
546         if(buffer!=NULL) {
547             buffer->appendZeroCC(prevSrc, src, errorCode);
548         }
549     }
550     return src;
551 }
552 
553 UnicodeString &
decompose(const UnicodeString & src,UnicodeString & dest,UErrorCode & errorCode) const554 Normalizer2Impl::decompose(const UnicodeString &src, UnicodeString &dest,
555                            UErrorCode &errorCode) const {
556     if(U_FAILURE(errorCode)) {
557         dest.setToBogus();
558         return dest;
559     }
560     const UChar *sArray=src.getBuffer();
561     if(&dest==&src || sArray==NULL) {
562         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
563         dest.setToBogus();
564         return dest;
565     }
566     decompose(sArray, sArray+src.length(), dest, src.length(), errorCode);
567     return dest;
568 }
569 
570 void
decompose(const UChar * src,const UChar * limit,UnicodeString & dest,int32_t destLengthEstimate,UErrorCode & errorCode) const571 Normalizer2Impl::decompose(const UChar *src, const UChar *limit,
572                            UnicodeString &dest,
573                            int32_t destLengthEstimate,
574                            UErrorCode &errorCode) const {
575     if(destLengthEstimate<0 && limit!=NULL) {
576         destLengthEstimate=(int32_t)(limit-src);
577     }
578     dest.remove();
579     ReorderingBuffer buffer(*this, dest);
580     if(buffer.init(destLengthEstimate, errorCode)) {
581         decompose(src, limit, &buffer, errorCode);
582     }
583 }
584 
585 // Dual functionality:
586 // buffer!=NULL: normalize
587 // buffer==NULL: isNormalized/spanQuickCheckYes
588 const UChar *
decompose(const UChar * src,const UChar * limit,ReorderingBuffer * buffer,UErrorCode & errorCode) const589 Normalizer2Impl::decompose(const UChar *src, const UChar *limit,
590                            ReorderingBuffer *buffer,
591                            UErrorCode &errorCode) const {
592     UChar32 minNoCP=minDecompNoCP;
593     if(limit==NULL) {
594         src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode);
595         if(U_FAILURE(errorCode)) {
596             return src;
597         }
598         limit=u_strchr(src, 0);
599     }
600 
601     const UChar *prevSrc;
602     UChar32 c=0;
603     uint16_t norm16=0;
604 
605     // only for quick check
606     const UChar *prevBoundary=src;
607     uint8_t prevCC=0;
608 
609     for(;;) {
610         // count code units below the minimum or with irrelevant data for the quick check
611         for(prevSrc=src; src!=limit;) {
612             if( (c=*src)<minNoCP ||
613                 isMostDecompYesAndZeroCC(norm16=UCPTRIE_FAST_BMP_GET(normTrie, UCPTRIE_16, c))
614             ) {
615                 ++src;
616             } else if(!U16_IS_LEAD(c)) {
617                 break;
618             } else {
619                 UChar c2;
620                 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
621                     c=U16_GET_SUPPLEMENTARY(c, c2);
622                     norm16=UCPTRIE_FAST_SUPP_GET(normTrie, UCPTRIE_16, c);
623                     if(isMostDecompYesAndZeroCC(norm16)) {
624                         src+=2;
625                     } else {
626                         break;
627                     }
628                 } else {
629                     ++src;  // unpaired lead surrogate: inert
630                 }
631             }
632         }
633         // copy these code units all at once
634         if(src!=prevSrc) {
635             if(buffer!=NULL) {
636                 if(!buffer->appendZeroCC(prevSrc, src, errorCode)) {
637                     break;
638                 }
639             } else {
640                 prevCC=0;
641                 prevBoundary=src;
642             }
643         }
644         if(src==limit) {
645             break;
646         }
647 
648         // Check one above-minimum, relevant code point.
649         src+=U16_LENGTH(c);
650         if(buffer!=NULL) {
651             if(!decompose(c, norm16, *buffer, errorCode)) {
652                 break;
653             }
654         } else {
655             if(isDecompYes(norm16)) {
656                 uint8_t cc=getCCFromYesOrMaybe(norm16);
657                 if(prevCC<=cc || cc==0) {
658                     prevCC=cc;
659                     if(cc<=1) {
660                         prevBoundary=src;
661                     }
662                     continue;
663                 }
664             }
665             return prevBoundary;  // "no" or cc out of order
666         }
667     }
668     return src;
669 }
670 
671 // Decompose a short piece of text which is likely to contain characters that
672 // fail the quick check loop and/or where the quick check loop's overhead
673 // is unlikely to be amortized.
674 // Called by the compose() and makeFCD() implementations.
675 const UChar *
decomposeShort(const UChar * src,const UChar * limit,UBool stopAtCompBoundary,UBool onlyContiguous,ReorderingBuffer & buffer,UErrorCode & errorCode) const676 Normalizer2Impl::decomposeShort(const UChar *src, const UChar *limit,
677                                 UBool stopAtCompBoundary, UBool onlyContiguous,
678                                 ReorderingBuffer &buffer, UErrorCode &errorCode) const {
679     if (U_FAILURE(errorCode)) {
680         return nullptr;
681     }
682     while(src<limit) {
683         if (stopAtCompBoundary && *src < minCompNoMaybeCP) {
684             return src;
685         }
686         const UChar *prevSrc = src;
687         UChar32 c;
688         uint16_t norm16;
689         UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, src, limit, c, norm16);
690         if (stopAtCompBoundary && norm16HasCompBoundaryBefore(norm16)) {
691             return prevSrc;
692         }
693         if(!decompose(c, norm16, buffer, errorCode)) {
694             return nullptr;
695         }
696         if (stopAtCompBoundary && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
697             return src;
698         }
699     }
700     return src;
701 }
702 
decompose(UChar32 c,uint16_t norm16,ReorderingBuffer & buffer,UErrorCode & errorCode) const703 UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16,
704                                  ReorderingBuffer &buffer,
705                                  UErrorCode &errorCode) const {
706     // get the decomposition and the lead and trail cc's
707     if (norm16 >= limitNoNo) {
708         if (isMaybeOrNonZeroCC(norm16)) {
709             return buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode);
710         }
711         // Maps to an isCompYesAndZeroCC.
712         c=mapAlgorithmic(c, norm16);
713         norm16=getRawNorm16(c);
714     }
715     if (norm16 < minYesNo) {
716         // c does not decompose
717         return buffer.append(c, 0, errorCode);
718     } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
719         // Hangul syllable: decompose algorithmically
720         UChar jamos[3];
721         return buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode);
722     }
723     // c decomposes, get everything from the variable-length extra data
724     const uint16_t *mapping=getMapping(norm16);
725     uint16_t firstUnit=*mapping;
726     int32_t length=firstUnit&MAPPING_LENGTH_MASK;
727     uint8_t leadCC, trailCC;
728     trailCC=(uint8_t)(firstUnit>>8);
729     if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
730         leadCC=(uint8_t)(*(mapping-1)>>8);
731     } else {
732         leadCC=0;
733     }
734     return buffer.append((const UChar *)mapping+1, length, TRUE, leadCC, trailCC, errorCode);
735 }
736 
737 // Dual functionality:
738 // sink != nullptr: normalize
739 // sink == nullptr: isNormalized/spanQuickCheckYes
740 const uint8_t *
decomposeUTF8(uint32_t options,const uint8_t * src,const uint8_t * limit,ByteSink * sink,Edits * edits,UErrorCode & errorCode) const741 Normalizer2Impl::decomposeUTF8(uint32_t options,
742                                const uint8_t *src, const uint8_t *limit,
743                                ByteSink *sink, Edits *edits, UErrorCode &errorCode) const {
744     U_ASSERT(limit != nullptr);
745     UnicodeString s16;
746     uint8_t minNoLead = leadByteForCP(minDecompNoCP);
747 
748     const uint8_t *prevBoundary = src;
749     // only for quick check
750     uint8_t prevCC = 0;
751 
752     for (;;) {
753         // Fast path: Scan over a sequence of characters below the minimum "no" code point,
754         // or with (decompYes && ccc==0) properties.
755         const uint8_t *fastStart = src;
756         const uint8_t *prevSrc;
757         uint16_t norm16 = 0;
758 
759         for (;;) {
760             if (src == limit) {
761                 if (prevBoundary != limit && sink != nullptr) {
762                     ByteSinkUtil::appendUnchanged(prevBoundary, limit,
763                                                   *sink, options, edits, errorCode);
764                 }
765                 return src;
766             }
767             if (*src < minNoLead) {
768                 ++src;
769             } else {
770                 prevSrc = src;
771                 UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, norm16);
772                 if (!isMostDecompYesAndZeroCC(norm16)) {
773                     break;
774                 }
775             }
776         }
777         // isMostDecompYesAndZeroCC(norm16) is false, that is, norm16>=minYesNo,
778         // and the current character at [prevSrc..src[ is not a common case with cc=0
779         // (MIN_NORMAL_MAYBE_YES or JAMO_VT).
780         // It could still be a maybeYes with cc=0.
781         if (prevSrc != fastStart) {
782             // The fast path looped over yes/0 characters before the current one.
783             if (sink != nullptr &&
784                     !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
785                                                    *sink, options, edits, errorCode)) {
786                 break;
787             }
788             prevBoundary = prevSrc;
789             prevCC = 0;
790         }
791 
792         // Medium-fast path: Quick check.
793         if (isMaybeOrNonZeroCC(norm16)) {
794             // Does not decompose.
795             uint8_t cc = getCCFromYesOrMaybe(norm16);
796             if (prevCC <= cc || cc == 0) {
797                 prevCC = cc;
798                 if (cc <= 1) {
799                     if (sink != nullptr &&
800                             !ByteSinkUtil::appendUnchanged(prevBoundary, src,
801                                                            *sink, options, edits, errorCode)) {
802                         break;
803                     }
804                     prevBoundary = src;
805                 }
806                 continue;
807             }
808         }
809         if (sink == nullptr) {
810             return prevBoundary;  // quick check: "no" or cc out of order
811         }
812 
813         // Slow path
814         // Decompose up to and including the current character.
815         if (prevBoundary != prevSrc && norm16HasDecompBoundaryBefore(norm16)) {
816             if (!ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
817                                                *sink, options, edits, errorCode)) {
818                 break;
819             }
820             prevBoundary = prevSrc;
821         }
822         ReorderingBuffer buffer(*this, s16, errorCode);
823         if (U_FAILURE(errorCode)) {
824             break;
825         }
826         decomposeShort(prevBoundary, src, STOP_AT_LIMIT, FALSE /* onlyContiguous */,
827                        buffer, errorCode);
828         // Decompose until the next boundary.
829         if (buffer.getLastCC() > 1) {
830             src = decomposeShort(src, limit, STOP_AT_DECOMP_BOUNDARY, FALSE /* onlyContiguous */,
831                                  buffer, errorCode);
832         }
833         if (U_FAILURE(errorCode)) {
834             break;
835         }
836         if ((src - prevSrc) > INT32_MAX) {  // guard before buffer.equals()
837             errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
838             break;
839         }
840         // We already know there was a change if the original character decomposed;
841         // otherwise compare.
842         if (isMaybeOrNonZeroCC(norm16) && buffer.equals(prevBoundary, src)) {
843             if (!ByteSinkUtil::appendUnchanged(prevBoundary, src,
844                                                *sink, options, edits, errorCode)) {
845                 break;
846             }
847         } else {
848             if (!ByteSinkUtil::appendChange(prevBoundary, src, buffer.getStart(), buffer.length(),
849                                             *sink, edits, errorCode)) {
850                 break;
851             }
852         }
853         prevBoundary = src;
854         prevCC = 0;
855     }
856     return src;
857 }
858 
859 const uint8_t *
decomposeShort(const uint8_t * src,const uint8_t * limit,StopAt stopAt,UBool onlyContiguous,ReorderingBuffer & buffer,UErrorCode & errorCode) const860 Normalizer2Impl::decomposeShort(const uint8_t *src, const uint8_t *limit,
861                                 StopAt stopAt, UBool onlyContiguous,
862                                 ReorderingBuffer &buffer, UErrorCode &errorCode) const {
863     if (U_FAILURE(errorCode)) {
864         return nullptr;
865     }
866     while (src < limit) {
867         const uint8_t *prevSrc = src;
868         uint16_t norm16;
869         UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, norm16);
870         // Get the decomposition and the lead and trail cc's.
871         UChar32 c = U_SENTINEL;
872         if (norm16 >= limitNoNo) {
873             if (isMaybeOrNonZeroCC(norm16)) {
874                 // No comp boundaries around this character.
875                 uint8_t cc = getCCFromYesOrMaybe(norm16);
876                 if (cc == 0 && stopAt == STOP_AT_DECOMP_BOUNDARY) {
877                     return prevSrc;
878                 }
879                 c = codePointFromValidUTF8(prevSrc, src);
880                 if (!buffer.append(c, cc, errorCode)) {
881                     return nullptr;
882                 }
883                 if (stopAt == STOP_AT_DECOMP_BOUNDARY && buffer.getLastCC() <= 1) {
884                     return src;
885                 }
886                 continue;
887             }
888             // Maps to an isCompYesAndZeroCC.
889             if (stopAt != STOP_AT_LIMIT) {
890                 return prevSrc;
891             }
892             c = codePointFromValidUTF8(prevSrc, src);
893             c = mapAlgorithmic(c, norm16);
894             norm16 = getRawNorm16(c);
895         } else if (stopAt != STOP_AT_LIMIT && norm16 < minNoNoCompNoMaybeCC) {
896             return prevSrc;
897         }
898         // norm16!=INERT guarantees that [prevSrc, src[ is valid UTF-8.
899         // We do not see invalid UTF-8 here because
900         // its norm16==INERT is normalization-inert,
901         // so it gets copied unchanged in the fast path,
902         // and we stop the slow path where invalid UTF-8 begins.
903         // c >= 0 is the result of an algorithmic mapping.
904         U_ASSERT(c >= 0 || norm16 != INERT);
905         if (norm16 < minYesNo) {
906             if (c < 0) {
907                 c = codePointFromValidUTF8(prevSrc, src);
908             }
909             // does not decompose
910             if (!buffer.append(c, 0, errorCode)) {
911                 return nullptr;
912             }
913         } else if (isHangulLV(norm16) || isHangulLVT(norm16)) {
914             // Hangul syllable: decompose algorithmically
915             if (c < 0) {
916                 c = codePointFromValidUTF8(prevSrc, src);
917             }
918             char16_t jamos[3];
919             if (!buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode)) {
920                 return nullptr;
921             }
922         } else {
923             // The character decomposes, get everything from the variable-length extra data.
924             const uint16_t *mapping = getMapping(norm16);
925             uint16_t firstUnit = *mapping;
926             int32_t length = firstUnit & MAPPING_LENGTH_MASK;
927             uint8_t trailCC = (uint8_t)(firstUnit >> 8);
928             uint8_t leadCC;
929             if (firstUnit & MAPPING_HAS_CCC_LCCC_WORD) {
930                 leadCC = (uint8_t)(*(mapping-1) >> 8);
931             } else {
932                 leadCC = 0;
933             }
934             if (leadCC == 0 && stopAt == STOP_AT_DECOMP_BOUNDARY) {
935                 return prevSrc;
936             }
937             if (!buffer.append((const char16_t *)mapping+1, length, TRUE, leadCC, trailCC, errorCode)) {
938                 return nullptr;
939             }
940         }
941         if ((stopAt == STOP_AT_COMP_BOUNDARY && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) ||
942                 (stopAt == STOP_AT_DECOMP_BOUNDARY && buffer.getLastCC() <= 1)) {
943             return src;
944         }
945     }
946     return src;
947 }
948 
949 const UChar *
getDecomposition(UChar32 c,UChar buffer[4],int32_t & length) const950 Normalizer2Impl::getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const {
951     uint16_t norm16;
952     if(c<minDecompNoCP || isMaybeOrNonZeroCC(norm16=getNorm16(c))) {
953         // c does not decompose
954         return nullptr;
955     }
956     const UChar *decomp = nullptr;
957     if(isDecompNoAlgorithmic(norm16)) {
958         // Maps to an isCompYesAndZeroCC.
959         c=mapAlgorithmic(c, norm16);
960         decomp=buffer;
961         length=0;
962         U16_APPEND_UNSAFE(buffer, length, c);
963         // The mapping might decompose further.
964         norm16 = getRawNorm16(c);
965     }
966     if (norm16 < minYesNo) {
967         return decomp;
968     } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
969         // Hangul syllable: decompose algorithmically
970         length=Hangul::decompose(c, buffer);
971         return buffer;
972     }
973     // c decomposes, get everything from the variable-length extra data
974     const uint16_t *mapping=getMapping(norm16);
975     length=*mapping&MAPPING_LENGTH_MASK;
976     return (const UChar *)mapping+1;
977 }
978 
979 // The capacity of the buffer must be 30=MAPPING_LENGTH_MASK-1
980 // so that a raw mapping fits that consists of one unit ("rm0")
981 // plus all but the first two code units of the normal mapping.
982 // The maximum length of a normal mapping is 31=MAPPING_LENGTH_MASK.
983 const UChar *
getRawDecomposition(UChar32 c,UChar buffer[30],int32_t & length) const984 Normalizer2Impl::getRawDecomposition(UChar32 c, UChar buffer[30], int32_t &length) const {
985     uint16_t norm16;
986     if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
987         // c does not decompose
988         return NULL;
989     } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
990         // Hangul syllable: decompose algorithmically
991         Hangul::getRawDecomposition(c, buffer);
992         length=2;
993         return buffer;
994     } else if(isDecompNoAlgorithmic(norm16)) {
995         c=mapAlgorithmic(c, norm16);
996         length=0;
997         U16_APPEND_UNSAFE(buffer, length, c);
998         return buffer;
999     }
1000     // c decomposes, get everything from the variable-length extra data
1001     const uint16_t *mapping=getMapping(norm16);
1002     uint16_t firstUnit=*mapping;
1003     int32_t mLength=firstUnit&MAPPING_LENGTH_MASK;  // length of normal mapping
1004     if(firstUnit&MAPPING_HAS_RAW_MAPPING) {
1005         // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word.
1006         // Bit 7=MAPPING_HAS_CCC_LCCC_WORD
1007         const uint16_t *rawMapping=mapping-((firstUnit>>7)&1)-1;
1008         uint16_t rm0=*rawMapping;
1009         if(rm0<=MAPPING_LENGTH_MASK) {
1010             length=rm0;
1011             return (const UChar *)rawMapping-rm0;
1012         } else {
1013             // Copy the normal mapping and replace its first two code units with rm0.
1014             buffer[0]=(UChar)rm0;
1015             u_memcpy(buffer+1, (const UChar *)mapping+1+2, mLength-2);
1016             length=mLength-1;
1017             return buffer;
1018         }
1019     } else {
1020         length=mLength;
1021         return (const UChar *)mapping+1;
1022     }
1023 }
1024 
decomposeAndAppend(const UChar * src,const UChar * limit,UBool doDecompose,UnicodeString & safeMiddle,ReorderingBuffer & buffer,UErrorCode & errorCode) const1025 void Normalizer2Impl::decomposeAndAppend(const UChar *src, const UChar *limit,
1026                                          UBool doDecompose,
1027                                          UnicodeString &safeMiddle,
1028                                          ReorderingBuffer &buffer,
1029                                          UErrorCode &errorCode) const {
1030     buffer.copyReorderableSuffixTo(safeMiddle);
1031     if(doDecompose) {
1032         decompose(src, limit, &buffer, errorCode);
1033         return;
1034     }
1035     // Just merge the strings at the boundary.
1036     bool isFirst = true;
1037     uint8_t firstCC = 0, prevCC = 0, cc;
1038     const UChar *p = src;
1039     while (p != limit) {
1040         const UChar *codePointStart = p;
1041         UChar32 c;
1042         uint16_t norm16;
1043         UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, p, limit, c, norm16);
1044         if ((cc = getCC(norm16)) == 0) {
1045             p = codePointStart;
1046             break;
1047         }
1048         if (isFirst) {
1049             firstCC = cc;
1050             isFirst = false;
1051         }
1052         prevCC = cc;
1053     }
1054     if(limit==NULL) {  // appendZeroCC() needs limit!=NULL
1055         limit=u_strchr(p, 0);
1056     }
1057 
1058     if (buffer.append(src, (int32_t)(p - src), FALSE, firstCC, prevCC, errorCode)) {
1059         buffer.appendZeroCC(p, limit, errorCode);
1060     }
1061 }
1062 
hasDecompBoundaryBefore(UChar32 c) const1063 UBool Normalizer2Impl::hasDecompBoundaryBefore(UChar32 c) const {
1064     return c < minLcccCP || (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) ||
1065         norm16HasDecompBoundaryBefore(getNorm16(c));
1066 }
1067 
norm16HasDecompBoundaryBefore(uint16_t norm16) const1068 UBool Normalizer2Impl::norm16HasDecompBoundaryBefore(uint16_t norm16) const {
1069     if (norm16 < minNoNoCompNoMaybeCC) {
1070         return TRUE;
1071     }
1072     if (norm16 >= limitNoNo) {
1073         return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT;
1074     }
1075     // c decomposes, get everything from the variable-length extra data
1076     const uint16_t *mapping=getMapping(norm16);
1077     uint16_t firstUnit=*mapping;
1078     // TRUE if leadCC==0 (hasFCDBoundaryBefore())
1079     return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0;
1080 }
1081 
hasDecompBoundaryAfter(UChar32 c) const1082 UBool Normalizer2Impl::hasDecompBoundaryAfter(UChar32 c) const {
1083     if (c < minDecompNoCP) {
1084         return TRUE;
1085     }
1086     if (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) {
1087         return TRUE;
1088     }
1089     return norm16HasDecompBoundaryAfter(getNorm16(c));
1090 }
1091 
norm16HasDecompBoundaryAfter(uint16_t norm16) const1092 UBool Normalizer2Impl::norm16HasDecompBoundaryAfter(uint16_t norm16) const {
1093     if(norm16 <= minYesNo || isHangulLVT(norm16)) {
1094         return TRUE;
1095     }
1096     if (norm16 >= limitNoNo) {
1097         if (isMaybeOrNonZeroCC(norm16)) {
1098             return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT;
1099         }
1100         // Maps to an isCompYesAndZeroCC.
1101         return (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1;
1102     }
1103     // c decomposes, get everything from the variable-length extra data
1104     const uint16_t *mapping=getMapping(norm16);
1105     uint16_t firstUnit=*mapping;
1106     // decomp after-boundary: same as hasFCDBoundaryAfter(),
1107     // fcd16<=1 || trailCC==0
1108     if(firstUnit>0x1ff) {
1109         return FALSE;  // trailCC>1
1110     }
1111     if(firstUnit<=0xff) {
1112         return TRUE;  // trailCC==0
1113     }
1114     // if(trailCC==1) test leadCC==0, same as checking for before-boundary
1115     // TRUE if leadCC==0 (hasFCDBoundaryBefore())
1116     return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0;
1117 }
1118 
1119 /*
1120  * Finds the recomposition result for
1121  * a forward-combining "lead" character,
1122  * specified with a pointer to its compositions list,
1123  * and a backward-combining "trail" character.
1124  *
1125  * If the lead and trail characters combine, then this function returns
1126  * the following "compositeAndFwd" value:
1127  * Bits 21..1  composite character
1128  * Bit      0  set if the composite is a forward-combining starter
1129  * otherwise it returns -1.
1130  *
1131  * The compositions list has (trail, compositeAndFwd) pair entries,
1132  * encoded as either pairs or triples of 16-bit units.
1133  * The last entry has the high bit of its first unit set.
1134  *
1135  * The list is sorted by ascending trail characters (there are no duplicates).
1136  * A linear search is used.
1137  *
1138  * See normalizer2impl.h for a more detailed description
1139  * of the compositions list format.
1140  */
combine(const uint16_t * list,UChar32 trail)1141 int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) {
1142     uint16_t key1, firstUnit;
1143     if(trail<COMP_1_TRAIL_LIMIT) {
1144         // trail character is 0..33FF
1145         // result entry may have 2 or 3 units
1146         key1=(uint16_t)(trail<<1);
1147         while(key1>(firstUnit=*list)) {
1148             list+=2+(firstUnit&COMP_1_TRIPLE);
1149         }
1150         if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
1151             if(firstUnit&COMP_1_TRIPLE) {
1152                 return ((int32_t)list[1]<<16)|list[2];
1153             } else {
1154                 return list[1];
1155             }
1156         }
1157     } else {
1158         // trail character is 3400..10FFFF
1159         // result entry has 3 units
1160         key1=(uint16_t)(COMP_1_TRAIL_LIMIT+
1161                         (((trail>>COMP_1_TRAIL_SHIFT))&
1162                           ~COMP_1_TRIPLE));
1163         uint16_t key2=(uint16_t)(trail<<COMP_2_TRAIL_SHIFT);
1164         uint16_t secondUnit;
1165         for(;;) {
1166             if(key1>(firstUnit=*list)) {
1167                 list+=2+(firstUnit&COMP_1_TRIPLE);
1168             } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
1169                 if(key2>(secondUnit=list[1])) {
1170                     if(firstUnit&COMP_1_LAST_TUPLE) {
1171                         break;
1172                     } else {
1173                         list+=3;
1174                     }
1175                 } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
1176                     return ((int32_t)(secondUnit&~COMP_2_TRAIL_MASK)<<16)|list[2];
1177                 } else {
1178                     break;
1179                 }
1180             } else {
1181                 break;
1182             }
1183         }
1184     }
1185     return -1;
1186 }
1187 
1188 /**
1189   * @param list some character's compositions list
1190   * @param set recursively receives the composites from these compositions
1191   */
addComposites(const uint16_t * list,UnicodeSet & set) const1192 void Normalizer2Impl::addComposites(const uint16_t *list, UnicodeSet &set) const {
1193     uint16_t firstUnit;
1194     int32_t compositeAndFwd;
1195     do {
1196         firstUnit=*list;
1197         if((firstUnit&COMP_1_TRIPLE)==0) {
1198             compositeAndFwd=list[1];
1199             list+=2;
1200         } else {
1201             compositeAndFwd=(((int32_t)list[1]&~COMP_2_TRAIL_MASK)<<16)|list[2];
1202             list+=3;
1203         }
1204         UChar32 composite=compositeAndFwd>>1;
1205         if((compositeAndFwd&1)!=0) {
1206             addComposites(getCompositionsListForComposite(getRawNorm16(composite)), set);
1207         }
1208         set.add(composite);
1209     } while((firstUnit&COMP_1_LAST_TUPLE)==0);
1210 }
1211 
1212 /*
1213  * Recomposes the buffer text starting at recomposeStartIndex
1214  * (which is in NFD - decomposed and canonically ordered),
1215  * and truncates the buffer contents.
1216  *
1217  * Note that recomposition never lengthens the text:
1218  * Any character consists of either one or two code units;
1219  * a composition may contain at most one more code unit than the original starter,
1220  * while the combining mark that is removed has at least one code unit.
1221  */
recompose(ReorderingBuffer & buffer,int32_t recomposeStartIndex,UBool onlyContiguous) const1222 void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
1223                                 UBool onlyContiguous) const {
1224     UChar *p=buffer.getStart()+recomposeStartIndex;
1225     UChar *limit=buffer.getLimit();
1226     if(p==limit) {
1227         return;
1228     }
1229 
1230     UChar *starter, *pRemove, *q, *r;
1231     const uint16_t *compositionsList;
1232     UChar32 c, compositeAndFwd;
1233     uint16_t norm16;
1234     uint8_t cc, prevCC;
1235     UBool starterIsSupplementary;
1236 
1237     // Some of the following variables are not used until we have a forward-combining starter
1238     // and are only initialized now to avoid compiler warnings.
1239     compositionsList=NULL;  // used as indicator for whether we have a forward-combining starter
1240     starter=NULL;
1241     starterIsSupplementary=FALSE;
1242     prevCC=0;
1243 
1244     for(;;) {
1245         UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, p, limit, c, norm16);
1246         cc=getCCFromYesOrMaybe(norm16);
1247         if( // this character combines backward and
1248             isMaybe(norm16) &&
1249             // we have seen a starter that combines forward and
1250             compositionsList!=NULL &&
1251             // the backward-combining character is not blocked
1252             (prevCC<cc || prevCC==0)
1253         ) {
1254             if(isJamoVT(norm16)) {
1255                 // c is a Jamo V/T, see if we can compose it with the previous character.
1256                 if(c<Hangul::JAMO_T_BASE) {
1257                     // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
1258                     UChar prev=(UChar)(*starter-Hangul::JAMO_L_BASE);
1259                     if(prev<Hangul::JAMO_L_COUNT) {
1260                         pRemove=p-1;
1261                         UChar syllable=(UChar)
1262                             (Hangul::HANGUL_BASE+
1263                              (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
1264                              Hangul::JAMO_T_COUNT);
1265                         UChar t;
1266                         if(p!=limit && (t=(UChar)(*p-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
1267                             ++p;
1268                             syllable+=t;  // The next character was a Jamo T.
1269                         }
1270                         *starter=syllable;
1271                         // remove the Jamo V/T
1272                         q=pRemove;
1273                         r=p;
1274                         while(r<limit) {
1275                             *q++=*r++;
1276                         }
1277                         limit=q;
1278                         p=pRemove;
1279                     }
1280                 }
1281                 /*
1282                  * No "else" for Jamo T:
1283                  * Since the input is in NFD, there are no Hangul LV syllables that
1284                  * a Jamo T could combine with.
1285                  * All Jamo Ts are combined above when handling Jamo Vs.
1286                  */
1287                 if(p==limit) {
1288                     break;
1289                 }
1290                 compositionsList=NULL;
1291                 continue;
1292             } else if((compositeAndFwd=combine(compositionsList, c))>=0) {
1293                 // The starter and the combining mark (c) do combine.
1294                 UChar32 composite=compositeAndFwd>>1;
1295 
1296                 // Replace the starter with the composite, remove the combining mark.
1297                 pRemove=p-U16_LENGTH(c);  // pRemove & p: start & limit of the combining mark
1298                 if(starterIsSupplementary) {
1299                     if(U_IS_SUPPLEMENTARY(composite)) {
1300                         // both are supplementary
1301                         starter[0]=U16_LEAD(composite);
1302                         starter[1]=U16_TRAIL(composite);
1303                     } else {
1304                         *starter=(UChar)composite;
1305                         // The composite is shorter than the starter,
1306                         // move the intermediate characters forward one.
1307                         starterIsSupplementary=FALSE;
1308                         q=starter+1;
1309                         r=q+1;
1310                         while(r<pRemove) {
1311                             *q++=*r++;
1312                         }
1313                         --pRemove;
1314                     }
1315                 } else if(U_IS_SUPPLEMENTARY(composite)) {
1316                     // The composite is longer than the starter,
1317                     // move the intermediate characters back one.
1318                     starterIsSupplementary=TRUE;
1319                     ++starter;  // temporarily increment for the loop boundary
1320                     q=pRemove;
1321                     r=++pRemove;
1322                     while(starter<q) {
1323                         *--r=*--q;
1324                     }
1325                     *starter=U16_TRAIL(composite);
1326                     *--starter=U16_LEAD(composite);  // undo the temporary increment
1327                 } else {
1328                     // both are on the BMP
1329                     *starter=(UChar)composite;
1330                 }
1331 
1332                 /* remove the combining mark by moving the following text over it */
1333                 if(pRemove<p) {
1334                     q=pRemove;
1335                     r=p;
1336                     while(r<limit) {
1337                         *q++=*r++;
1338                     }
1339                     limit=q;
1340                     p=pRemove;
1341                 }
1342                 // Keep prevCC because we removed the combining mark.
1343 
1344                 if(p==limit) {
1345                     break;
1346                 }
1347                 // Is the composite a starter that combines forward?
1348                 if(compositeAndFwd&1) {
1349                     compositionsList=
1350                         getCompositionsListForComposite(getRawNorm16(composite));
1351                 } else {
1352                     compositionsList=NULL;
1353                 }
1354 
1355                 // We combined; continue with looking for compositions.
1356                 continue;
1357             }
1358         }
1359 
1360         // no combination this time
1361         prevCC=cc;
1362         if(p==limit) {
1363             break;
1364         }
1365 
1366         // If c did not combine, then check if it is a starter.
1367         if(cc==0) {
1368             // Found a new starter.
1369             if((compositionsList=getCompositionsListForDecompYes(norm16))!=NULL) {
1370                 // It may combine with something, prepare for it.
1371                 if(U_IS_BMP(c)) {
1372                     starterIsSupplementary=FALSE;
1373                     starter=p-1;
1374                 } else {
1375                     starterIsSupplementary=TRUE;
1376                     starter=p-2;
1377                 }
1378             }
1379         } else if(onlyContiguous) {
1380             // FCC: no discontiguous compositions; any intervening character blocks.
1381             compositionsList=NULL;
1382         }
1383     }
1384     buffer.setReorderingLimit(limit);
1385 }
1386 
1387 UChar32
composePair(UChar32 a,UChar32 b) const1388 Normalizer2Impl::composePair(UChar32 a, UChar32 b) const {
1389     uint16_t norm16=getNorm16(a);  // maps an out-of-range 'a' to inert norm16
1390     const uint16_t *list;
1391     if(isInert(norm16)) {
1392         return U_SENTINEL;
1393     } else if(norm16<minYesNoMappingsOnly) {
1394         // a combines forward.
1395         if(isJamoL(norm16)) {
1396             b-=Hangul::JAMO_V_BASE;
1397             if(0<=b && b<Hangul::JAMO_V_COUNT) {
1398                 return
1399                     (Hangul::HANGUL_BASE+
1400                      ((a-Hangul::JAMO_L_BASE)*Hangul::JAMO_V_COUNT+b)*
1401                      Hangul::JAMO_T_COUNT);
1402             } else {
1403                 return U_SENTINEL;
1404             }
1405         } else if(isHangulLV(norm16)) {
1406             b-=Hangul::JAMO_T_BASE;
1407             if(0<b && b<Hangul::JAMO_T_COUNT) {  // not b==0!
1408                 return a+b;
1409             } else {
1410                 return U_SENTINEL;
1411             }
1412         } else {
1413             // 'a' has a compositions list in extraData
1414             list=getMapping(norm16);
1415             if(norm16>minYesNo) {  // composite 'a' has both mapping & compositions list
1416                 list+=  // mapping pointer
1417                     1+  // +1 to skip the first unit with the mapping length
1418                     (*list&MAPPING_LENGTH_MASK);  // + mapping length
1419             }
1420         }
1421     } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) {
1422         return U_SENTINEL;
1423     } else {
1424         list=getCompositionsListForMaybe(norm16);
1425     }
1426     if(b<0 || 0x10ffff<b) {  // combine(list, b) requires a valid code point b
1427         return U_SENTINEL;
1428     }
1429 #if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC
1430     return combine(list, b)>>1;
1431 #else
1432     int32_t compositeAndFwd=combine(list, b);
1433     return compositeAndFwd>=0 ? compositeAndFwd>>1 : U_SENTINEL;
1434 #endif
1435 }
1436 
1437 // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
1438 // doCompose: normalize
1439 // !doCompose: isNormalized (buffer must be empty and initialized)
1440 UBool
compose(const UChar * src,const UChar * limit,UBool onlyContiguous,UBool doCompose,ReorderingBuffer & buffer,UErrorCode & errorCode) const1441 Normalizer2Impl::compose(const UChar *src, const UChar *limit,
1442                          UBool onlyContiguous,
1443                          UBool doCompose,
1444                          ReorderingBuffer &buffer,
1445                          UErrorCode &errorCode) const {
1446     const UChar *prevBoundary=src;
1447     UChar32 minNoMaybeCP=minCompNoMaybeCP;
1448     if(limit==NULL) {
1449         src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP,
1450                                            doCompose ? &buffer : NULL,
1451                                            errorCode);
1452         if(U_FAILURE(errorCode)) {
1453             return FALSE;
1454         }
1455         limit=u_strchr(src, 0);
1456         if (prevBoundary != src) {
1457             if (hasCompBoundaryAfter(*(src-1), onlyContiguous)) {
1458                 prevBoundary = src;
1459             } else {
1460                 buffer.removeSuffix(1);
1461                 prevBoundary = --src;
1462             }
1463         }
1464     }
1465 
1466     for (;;) {
1467         // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
1468         // or with (compYes && ccc==0) properties.
1469         const UChar *prevSrc;
1470         UChar32 c = 0;
1471         uint16_t norm16 = 0;
1472         for (;;) {
1473             if (src == limit) {
1474                 if (prevBoundary != limit && doCompose) {
1475                     buffer.appendZeroCC(prevBoundary, limit, errorCode);
1476                 }
1477                 return TRUE;
1478             }
1479             if( (c=*src)<minNoMaybeCP ||
1480                 isCompYesAndZeroCC(norm16=UCPTRIE_FAST_BMP_GET(normTrie, UCPTRIE_16, c))
1481             ) {
1482                 ++src;
1483             } else {
1484                 prevSrc = src++;
1485                 if(!U16_IS_LEAD(c)) {
1486                     break;
1487                 } else {
1488                     UChar c2;
1489                     if(src!=limit && U16_IS_TRAIL(c2=*src)) {
1490                         ++src;
1491                         c=U16_GET_SUPPLEMENTARY(c, c2);
1492                         norm16=UCPTRIE_FAST_SUPP_GET(normTrie, UCPTRIE_16, c);
1493                         if(!isCompYesAndZeroCC(norm16)) {
1494                             break;
1495                         }
1496                     }
1497                 }
1498             }
1499         }
1500         // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1501         // The current character is either a "noNo" (has a mapping)
1502         // or a "maybeYes" (combines backward)
1503         // or a "yesYes" with ccc!=0.
1504         // It is not a Hangul syllable or Jamo L because those have "yes" properties.
1505 
1506         // Medium-fast path: Handle cases that do not require full decomposition and recomposition.
1507         if (!isMaybeOrNonZeroCC(norm16)) {  // minNoNo <= norm16 < minMaybeYes
1508             if (!doCompose) {
1509                 return FALSE;
1510             }
1511             // Fast path for mapping a character that is immediately surrounded by boundaries.
1512             // In this case, we need not decompose around the current character.
1513             if (isDecompNoAlgorithmic(norm16)) {
1514                 // Maps to a single isCompYesAndZeroCC character
1515                 // which also implies hasCompBoundaryBefore.
1516                 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
1517                         hasCompBoundaryBefore(src, limit)) {
1518                     if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
1519                         break;
1520                     }
1521                     if(!buffer.append(mapAlgorithmic(c, norm16), 0, errorCode)) {
1522                         break;
1523                     }
1524                     prevBoundary = src;
1525                     continue;
1526                 }
1527             } else if (norm16 < minNoNoCompBoundaryBefore) {
1528                 // The mapping is comp-normalized which also implies hasCompBoundaryBefore.
1529                 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
1530                         hasCompBoundaryBefore(src, limit)) {
1531                     if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
1532                         break;
1533                     }
1534                     const UChar *mapping = reinterpret_cast<const UChar *>(getMapping(norm16));
1535                     int32_t length = *mapping++ & MAPPING_LENGTH_MASK;
1536                     if(!buffer.appendZeroCC(mapping, mapping + length, errorCode)) {
1537                         break;
1538                     }
1539                     prevBoundary = src;
1540                     continue;
1541                 }
1542             } else if (norm16 >= minNoNoEmpty) {
1543                 // The current character maps to nothing.
1544                 // Simply omit it from the output if there is a boundary before _or_ after it.
1545                 // The character itself implies no boundaries.
1546                 if (hasCompBoundaryBefore(src, limit) ||
1547                         hasCompBoundaryAfter(prevBoundary, prevSrc, onlyContiguous)) {
1548                     if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
1549                         break;
1550                     }
1551                     prevBoundary = src;
1552                     continue;
1553                 }
1554             }
1555             // Other "noNo" type, or need to examine more text around this character:
1556             // Fall through to the slow path.
1557         } else if (isJamoVT(norm16) && prevBoundary != prevSrc) {
1558             UChar prev=*(prevSrc-1);
1559             if(c<Hangul::JAMO_T_BASE) {
1560                 // The current character is a Jamo Vowel,
1561                 // compose with previous Jamo L and following Jamo T.
1562                 UChar l = (UChar)(prev-Hangul::JAMO_L_BASE);
1563                 if(l<Hangul::JAMO_L_COUNT) {
1564                     if (!doCompose) {
1565                         return FALSE;
1566                     }
1567                     int32_t t;
1568                     if (src != limit &&
1569                             0 < (t = ((int32_t)*src - Hangul::JAMO_T_BASE)) &&
1570                             t < Hangul::JAMO_T_COUNT) {
1571                         // The next character is a Jamo T.
1572                         ++src;
1573                     } else if (hasCompBoundaryBefore(src, limit)) {
1574                         // No Jamo T follows, not even via decomposition.
1575                         t = 0;
1576                     } else {
1577                         t = -1;
1578                     }
1579                     if (t >= 0) {
1580                         UChar32 syllable = Hangul::HANGUL_BASE +
1581                             (l*Hangul::JAMO_V_COUNT + (c-Hangul::JAMO_V_BASE)) *
1582                             Hangul::JAMO_T_COUNT + t;
1583                         --prevSrc;  // Replace the Jamo L as well.
1584                         if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
1585                             break;
1586                         }
1587                         if(!buffer.appendBMP((UChar)syllable, 0, errorCode)) {
1588                             break;
1589                         }
1590                         prevBoundary = src;
1591                         continue;
1592                     }
1593                     // If we see L+V+x where x!=T then we drop to the slow path,
1594                     // decompose and recompose.
1595                     // This is to deal with NFKC finding normal L and V but a
1596                     // compatibility variant of a T.
1597                     // We need to either fully compose that combination here
1598                     // (which would complicate the code and may not work with strange custom data)
1599                     // or use the slow path.
1600                 }
1601             } else if (Hangul::isHangulLV(prev)) {
1602                 // The current character is a Jamo Trailing consonant,
1603                 // compose with previous Hangul LV that does not contain a Jamo T.
1604                 if (!doCompose) {
1605                     return FALSE;
1606                 }
1607                 UChar32 syllable = prev + c - Hangul::JAMO_T_BASE;
1608                 --prevSrc;  // Replace the Hangul LV as well.
1609                 if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
1610                     break;
1611                 }
1612                 if(!buffer.appendBMP((UChar)syllable, 0, errorCode)) {
1613                     break;
1614                 }
1615                 prevBoundary = src;
1616                 continue;
1617             }
1618             // No matching context, or may need to decompose surrounding text first:
1619             // Fall through to the slow path.
1620         } else if (norm16 > JAMO_VT) {  // norm16 >= MIN_YES_YES_WITH_CC
1621             // One or more combining marks that do not combine-back:
1622             // Check for canonical order, copy unchanged if ok and
1623             // if followed by a character with a boundary-before.
1624             uint8_t cc = getCCFromNormalYesOrMaybe(norm16);  // cc!=0
1625             if (onlyContiguous /* FCC */ && getPreviousTrailCC(prevBoundary, prevSrc) > cc) {
1626                 // Fails FCD test, need to decompose and contiguously recompose.
1627                 if (!doCompose) {
1628                     return FALSE;
1629                 }
1630             } else {
1631                 // If !onlyContiguous (not FCC), then we ignore the tccc of
1632                 // the previous character which passed the quick check "yes && ccc==0" test.
1633                 const UChar *nextSrc;
1634                 uint16_t n16;
1635                 for (;;) {
1636                     if (src == limit) {
1637                         if (doCompose) {
1638                             buffer.appendZeroCC(prevBoundary, limit, errorCode);
1639                         }
1640                         return TRUE;
1641                     }
1642                     uint8_t prevCC = cc;
1643                     nextSrc = src;
1644                     UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, nextSrc, limit, c, n16);
1645                     if (n16 >= MIN_YES_YES_WITH_CC) {
1646                         cc = getCCFromNormalYesOrMaybe(n16);
1647                         if (prevCC > cc) {
1648                             if (!doCompose) {
1649                                 return FALSE;
1650                             }
1651                             break;
1652                         }
1653                     } else {
1654                         break;
1655                     }
1656                     src = nextSrc;
1657                 }
1658                 // src is after the last in-order combining mark.
1659                 // If there is a boundary here, then we continue with no change.
1660                 if (norm16HasCompBoundaryBefore(n16)) {
1661                     if (isCompYesAndZeroCC(n16)) {
1662                         src = nextSrc;
1663                     }
1664                     continue;
1665                 }
1666                 // Use the slow path. There is no boundary in [prevSrc, src[.
1667             }
1668         }
1669 
1670         // Slow path: Find the nearest boundaries around the current character,
1671         // decompose and recompose.
1672         if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) {
1673             const UChar *p = prevSrc;
1674             UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, prevBoundary, p, c, norm16);
1675             if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
1676                 prevSrc = p;
1677             }
1678         }
1679         if (doCompose && prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
1680             break;
1681         }
1682         int32_t recomposeStartIndex=buffer.length();
1683         // We know there is not a boundary here.
1684         decomposeShort(prevSrc, src, FALSE /* !stopAtCompBoundary */, onlyContiguous,
1685                        buffer, errorCode);
1686         // Decompose until the next boundary.
1687         src = decomposeShort(src, limit, TRUE /* stopAtCompBoundary */, onlyContiguous,
1688                              buffer, errorCode);
1689         if (U_FAILURE(errorCode)) {
1690             break;
1691         }
1692         if ((src - prevSrc) > INT32_MAX) {  // guard before buffer.equals()
1693             errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
1694             return TRUE;
1695         }
1696         recompose(buffer, recomposeStartIndex, onlyContiguous);
1697         if(!doCompose) {
1698             if(!buffer.equals(prevSrc, src)) {
1699                 return FALSE;
1700             }
1701             buffer.remove();
1702         }
1703         prevBoundary=src;
1704     }
1705     return TRUE;
1706 }
1707 
1708 // Very similar to compose(): Make the same changes in both places if relevant.
1709 // pQCResult==NULL: spanQuickCheckYes
1710 // pQCResult!=NULL: quickCheck (*pQCResult must be UNORM_YES)
1711 const UChar *
composeQuickCheck(const UChar * src,const UChar * limit,UBool onlyContiguous,UNormalizationCheckResult * pQCResult) const1712 Normalizer2Impl::composeQuickCheck(const UChar *src, const UChar *limit,
1713                                    UBool onlyContiguous,
1714                                    UNormalizationCheckResult *pQCResult) const {
1715     const UChar *prevBoundary=src;
1716     UChar32 minNoMaybeCP=minCompNoMaybeCP;
1717     if(limit==NULL) {
1718         UErrorCode errorCode=U_ZERO_ERROR;
1719         src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, NULL, errorCode);
1720         limit=u_strchr(src, 0);
1721         if (prevBoundary != src) {
1722             if (hasCompBoundaryAfter(*(src-1), onlyContiguous)) {
1723                 prevBoundary = src;
1724             } else {
1725                 prevBoundary = --src;
1726             }
1727         }
1728     }
1729 
1730     for(;;) {
1731         // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
1732         // or with (compYes && ccc==0) properties.
1733         const UChar *prevSrc;
1734         UChar32 c = 0;
1735         uint16_t norm16 = 0;
1736         for (;;) {
1737             if(src==limit) {
1738                 return src;
1739             }
1740             if( (c=*src)<minNoMaybeCP ||
1741                 isCompYesAndZeroCC(norm16=UCPTRIE_FAST_BMP_GET(normTrie, UCPTRIE_16, c))
1742             ) {
1743                 ++src;
1744             } else {
1745                 prevSrc = src++;
1746                 if(!U16_IS_LEAD(c)) {
1747                     break;
1748                 } else {
1749                     UChar c2;
1750                     if(src!=limit && U16_IS_TRAIL(c2=*src)) {
1751                         ++src;
1752                         c=U16_GET_SUPPLEMENTARY(c, c2);
1753                         norm16=UCPTRIE_FAST_SUPP_GET(normTrie, UCPTRIE_16, c);
1754                         if(!isCompYesAndZeroCC(norm16)) {
1755                             break;
1756                         }
1757                     }
1758                 }
1759             }
1760         }
1761         // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1762         // The current character is either a "noNo" (has a mapping)
1763         // or a "maybeYes" (combines backward)
1764         // or a "yesYes" with ccc!=0.
1765         // It is not a Hangul syllable or Jamo L because those have "yes" properties.
1766 
1767         uint16_t prevNorm16 = INERT;
1768         if (prevBoundary != prevSrc) {
1769             if (norm16HasCompBoundaryBefore(norm16)) {
1770                 prevBoundary = prevSrc;
1771             } else {
1772                 const UChar *p = prevSrc;
1773                 uint16_t n16;
1774                 UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, prevBoundary, p, c, n16);
1775                 if (norm16HasCompBoundaryAfter(n16, onlyContiguous)) {
1776                     prevBoundary = prevSrc;
1777                 } else {
1778                     prevBoundary = p;
1779                     prevNorm16 = n16;
1780                 }
1781             }
1782         }
1783 
1784         if(isMaybeOrNonZeroCC(norm16)) {
1785             uint8_t cc=getCCFromYesOrMaybe(norm16);
1786             if (onlyContiguous /* FCC */ && cc != 0 &&
1787                     getTrailCCFromCompYesAndZeroCC(prevNorm16) > cc) {
1788                 // The [prevBoundary..prevSrc[ character
1789                 // passed the quick check "yes && ccc==0" test
1790                 // but is out of canonical order with the current combining mark.
1791             } else {
1792                 // If !onlyContiguous (not FCC), then we ignore the tccc of
1793                 // the previous character which passed the quick check "yes && ccc==0" test.
1794                 const UChar *nextSrc;
1795                 for (;;) {
1796                     if (norm16 < MIN_YES_YES_WITH_CC) {
1797                         if (pQCResult != nullptr) {
1798                             *pQCResult = UNORM_MAYBE;
1799                         } else {
1800                             return prevBoundary;
1801                         }
1802                     }
1803                     if (src == limit) {
1804                         return src;
1805                     }
1806                     uint8_t prevCC = cc;
1807                     nextSrc = src;
1808                     UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, nextSrc, limit, c, norm16);
1809                     if (isMaybeOrNonZeroCC(norm16)) {
1810                         cc = getCCFromYesOrMaybe(norm16);
1811                         if (!(prevCC <= cc || cc == 0)) {
1812                             break;
1813                         }
1814                     } else {
1815                         break;
1816                     }
1817                     src = nextSrc;
1818                 }
1819                 // src is after the last in-order combining mark.
1820                 if (isCompYesAndZeroCC(norm16)) {
1821                     prevBoundary = src;
1822                     src = nextSrc;
1823                     continue;
1824                 }
1825             }
1826         }
1827         if(pQCResult!=NULL) {
1828             *pQCResult=UNORM_NO;
1829         }
1830         return prevBoundary;
1831     }
1832 }
1833 
composeAndAppend(const UChar * src,const UChar * limit,UBool doCompose,UBool onlyContiguous,UnicodeString & safeMiddle,ReorderingBuffer & buffer,UErrorCode & errorCode) const1834 void Normalizer2Impl::composeAndAppend(const UChar *src, const UChar *limit,
1835                                        UBool doCompose,
1836                                        UBool onlyContiguous,
1837                                        UnicodeString &safeMiddle,
1838                                        ReorderingBuffer &buffer,
1839                                        UErrorCode &errorCode) const {
1840     if(!buffer.isEmpty()) {
1841         const UChar *firstStarterInSrc=findNextCompBoundary(src, limit, onlyContiguous);
1842         if(src!=firstStarterInSrc) {
1843             const UChar *lastStarterInDest=findPreviousCompBoundary(buffer.getStart(),
1844                                                                     buffer.getLimit(), onlyContiguous);
1845             int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastStarterInDest);
1846             UnicodeString middle(lastStarterInDest, destSuffixLength);
1847             buffer.removeSuffix(destSuffixLength);
1848             safeMiddle=middle;
1849             middle.append(src, (int32_t)(firstStarterInSrc-src));
1850             const UChar *middleStart=middle.getBuffer();
1851             compose(middleStart, middleStart+middle.length(), onlyContiguous,
1852                     TRUE, buffer, errorCode);
1853             if(U_FAILURE(errorCode)) {
1854                 return;
1855             }
1856             src=firstStarterInSrc;
1857         }
1858     }
1859     if(doCompose) {
1860         compose(src, limit, onlyContiguous, TRUE, buffer, errorCode);
1861     } else {
1862         if(limit==NULL) {  // appendZeroCC() needs limit!=NULL
1863             limit=u_strchr(src, 0);
1864         }
1865         buffer.appendZeroCC(src, limit, errorCode);
1866     }
1867 }
1868 
1869 UBool
composeUTF8(uint32_t options,UBool onlyContiguous,const uint8_t * src,const uint8_t * limit,ByteSink * sink,Edits * edits,UErrorCode & errorCode) const1870 Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous,
1871                              const uint8_t *src, const uint8_t *limit,
1872                              ByteSink *sink, Edits *edits, UErrorCode &errorCode) const {
1873     U_ASSERT(limit != nullptr);
1874     UnicodeString s16;
1875     uint8_t minNoMaybeLead = leadByteForCP(minCompNoMaybeCP);
1876     const uint8_t *prevBoundary = src;
1877 
1878     for (;;) {
1879         // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
1880         // or with (compYes && ccc==0) properties.
1881         const uint8_t *prevSrc;
1882         uint16_t norm16 = 0;
1883         for (;;) {
1884             if (src == limit) {
1885                 if (prevBoundary != limit && sink != nullptr) {
1886                     ByteSinkUtil::appendUnchanged(prevBoundary, limit,
1887                                                   *sink, options, edits, errorCode);
1888                 }
1889                 return TRUE;
1890             }
1891             if (*src < minNoMaybeLead) {
1892                 ++src;
1893             } else {
1894                 prevSrc = src;
1895                 UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, norm16);
1896                 if (!isCompYesAndZeroCC(norm16)) {
1897                     break;
1898                 }
1899             }
1900         }
1901         // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1902         // The current character is either a "noNo" (has a mapping)
1903         // or a "maybeYes" (combines backward)
1904         // or a "yesYes" with ccc!=0.
1905         // It is not a Hangul syllable or Jamo L because those have "yes" properties.
1906 
1907         // Medium-fast path: Handle cases that do not require full decomposition and recomposition.
1908         if (!isMaybeOrNonZeroCC(norm16)) {  // minNoNo <= norm16 < minMaybeYes
1909             if (sink == nullptr) {
1910                 return FALSE;
1911             }
1912             // Fast path for mapping a character that is immediately surrounded by boundaries.
1913             // In this case, we need not decompose around the current character.
1914             if (isDecompNoAlgorithmic(norm16)) {
1915                 // Maps to a single isCompYesAndZeroCC character
1916                 // which also implies hasCompBoundaryBefore.
1917                 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
1918                         hasCompBoundaryBefore(src, limit)) {
1919                     if (prevBoundary != prevSrc &&
1920                             !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
1921                                                            *sink, options, edits, errorCode)) {
1922                         break;
1923                     }
1924                     appendCodePointDelta(prevSrc, src, getAlgorithmicDelta(norm16), *sink, edits);
1925                     prevBoundary = src;
1926                     continue;
1927                 }
1928             } else if (norm16 < minNoNoCompBoundaryBefore) {
1929                 // The mapping is comp-normalized which also implies hasCompBoundaryBefore.
1930                 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
1931                         hasCompBoundaryBefore(src, limit)) {
1932                     if (prevBoundary != prevSrc &&
1933                             !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
1934                                                            *sink, options, edits, errorCode)) {
1935                         break;
1936                     }
1937                     const uint16_t *mapping = getMapping(norm16);
1938                     int32_t length = *mapping++ & MAPPING_LENGTH_MASK;
1939                     if (!ByteSinkUtil::appendChange(prevSrc, src, (const UChar *)mapping, length,
1940                                                     *sink, edits, errorCode)) {
1941                         break;
1942                     }
1943                     prevBoundary = src;
1944                     continue;
1945                 }
1946             } else if (norm16 >= minNoNoEmpty) {
1947                 // The current character maps to nothing.
1948                 // Simply omit it from the output if there is a boundary before _or_ after it.
1949                 // The character itself implies no boundaries.
1950                 if (hasCompBoundaryBefore(src, limit) ||
1951                         hasCompBoundaryAfter(prevBoundary, prevSrc, onlyContiguous)) {
1952                     if (prevBoundary != prevSrc &&
1953                             !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
1954                                                            *sink, options, edits, errorCode)) {
1955                         break;
1956                     }
1957                     if (edits != nullptr) {
1958                         edits->addReplace((int32_t)(src - prevSrc), 0);
1959                     }
1960                     prevBoundary = src;
1961                     continue;
1962                 }
1963             }
1964             // Other "noNo" type, or need to examine more text around this character:
1965             // Fall through to the slow path.
1966         } else if (isJamoVT(norm16)) {
1967             // Jamo L: E1 84 80..92
1968             // Jamo V: E1 85 A1..B5
1969             // Jamo T: E1 86 A8..E1 87 82
1970             U_ASSERT((src - prevSrc) == 3 && *prevSrc == 0xe1);
1971             UChar32 prev = previousHangulOrJamo(prevBoundary, prevSrc);
1972             if (prevSrc[1] == 0x85) {
1973                 // The current character is a Jamo Vowel,
1974                 // compose with previous Jamo L and following Jamo T.
1975                 UChar32 l = prev - Hangul::JAMO_L_BASE;
1976                 if ((uint32_t)l < Hangul::JAMO_L_COUNT) {
1977                     if (sink == nullptr) {
1978                         return FALSE;
1979                     }
1980                     int32_t t = getJamoTMinusBase(src, limit);
1981                     if (t >= 0) {
1982                         // The next character is a Jamo T.
1983                         src += 3;
1984                     } else if (hasCompBoundaryBefore(src, limit)) {
1985                         // No Jamo T follows, not even via decomposition.
1986                         t = 0;
1987                     }
1988                     if (t >= 0) {
1989                         UChar32 syllable = Hangul::HANGUL_BASE +
1990                             (l*Hangul::JAMO_V_COUNT + (prevSrc[2]-0xa1)) *
1991                             Hangul::JAMO_T_COUNT + t;
1992                         prevSrc -= 3;  // Replace the Jamo L as well.
1993                         if (prevBoundary != prevSrc &&
1994                                 !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
1995                                                                *sink, options, edits, errorCode)) {
1996                             break;
1997                         }
1998                         ByteSinkUtil::appendCodePoint(prevSrc, src, syllable, *sink, edits);
1999                         prevBoundary = src;
2000                         continue;
2001                     }
2002                     // If we see L+V+x where x!=T then we drop to the slow path,
2003                     // decompose and recompose.
2004                     // This is to deal with NFKC finding normal L and V but a
2005                     // compatibility variant of a T.
2006                     // We need to either fully compose that combination here
2007                     // (which would complicate the code and may not work with strange custom data)
2008                     // or use the slow path.
2009                 }
2010             } else if (Hangul::isHangulLV(prev)) {
2011                 // The current character is a Jamo Trailing consonant,
2012                 // compose with previous Hangul LV that does not contain a Jamo T.
2013                 if (sink == nullptr) {
2014                     return FALSE;
2015                 }
2016                 UChar32 syllable = prev + getJamoTMinusBase(prevSrc, src);
2017                 prevSrc -= 3;  // Replace the Hangul LV as well.
2018                 if (prevBoundary != prevSrc &&
2019                         !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
2020                                                        *sink, options, edits, errorCode)) {
2021                     break;
2022                 }
2023                 ByteSinkUtil::appendCodePoint(prevSrc, src, syllable, *sink, edits);
2024                 prevBoundary = src;
2025                 continue;
2026             }
2027             // No matching context, or may need to decompose surrounding text first:
2028             // Fall through to the slow path.
2029         } else if (norm16 > JAMO_VT) {  // norm16 >= MIN_YES_YES_WITH_CC
2030             // One or more combining marks that do not combine-back:
2031             // Check for canonical order, copy unchanged if ok and
2032             // if followed by a character with a boundary-before.
2033             uint8_t cc = getCCFromNormalYesOrMaybe(norm16);  // cc!=0
2034             if (onlyContiguous /* FCC */ && getPreviousTrailCC(prevBoundary, prevSrc) > cc) {
2035                 // Fails FCD test, need to decompose and contiguously recompose.
2036                 if (sink == nullptr) {
2037                     return FALSE;
2038                 }
2039             } else {
2040                 // If !onlyContiguous (not FCC), then we ignore the tccc of
2041                 // the previous character which passed the quick check "yes && ccc==0" test.
2042                 const uint8_t *nextSrc;
2043                 uint16_t n16;
2044                 for (;;) {
2045                     if (src == limit) {
2046                         if (sink != nullptr) {
2047                             ByteSinkUtil::appendUnchanged(prevBoundary, limit,
2048                                                           *sink, options, edits, errorCode);
2049                         }
2050                         return TRUE;
2051                     }
2052                     uint8_t prevCC = cc;
2053                     nextSrc = src;
2054                     UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, nextSrc, limit, n16);
2055                     if (n16 >= MIN_YES_YES_WITH_CC) {
2056                         cc = getCCFromNormalYesOrMaybe(n16);
2057                         if (prevCC > cc) {
2058                             if (sink == nullptr) {
2059                                 return FALSE;
2060                             }
2061                             break;
2062                         }
2063                     } else {
2064                         break;
2065                     }
2066                     src = nextSrc;
2067                 }
2068                 // src is after the last in-order combining mark.
2069                 // If there is a boundary here, then we continue with no change.
2070                 if (norm16HasCompBoundaryBefore(n16)) {
2071                     if (isCompYesAndZeroCC(n16)) {
2072                         src = nextSrc;
2073                     }
2074                     continue;
2075                 }
2076                 // Use the slow path. There is no boundary in [prevSrc, src[.
2077             }
2078         }
2079 
2080         // Slow path: Find the nearest boundaries around the current character,
2081         // decompose and recompose.
2082         if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) {
2083             const uint8_t *p = prevSrc;
2084             UCPTRIE_FAST_U8_PREV(normTrie, UCPTRIE_16, prevBoundary, p, norm16);
2085             if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
2086                 prevSrc = p;
2087             }
2088         }
2089         ReorderingBuffer buffer(*this, s16, errorCode);
2090         if (U_FAILURE(errorCode)) {
2091             break;
2092         }
2093         // We know there is not a boundary here.
2094         decomposeShort(prevSrc, src, STOP_AT_LIMIT, onlyContiguous,
2095                        buffer, errorCode);
2096         // Decompose until the next boundary.
2097         src = decomposeShort(src, limit, STOP_AT_COMP_BOUNDARY, onlyContiguous,
2098                              buffer, errorCode);
2099         if (U_FAILURE(errorCode)) {
2100             break;
2101         }
2102         if ((src - prevSrc) > INT32_MAX) {  // guard before buffer.equals()
2103             errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
2104             return TRUE;
2105         }
2106         recompose(buffer, 0, onlyContiguous);
2107         if (!buffer.equals(prevSrc, src)) {
2108             if (sink == nullptr) {
2109                 return FALSE;
2110             }
2111             if (prevBoundary != prevSrc &&
2112                     !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
2113                                                    *sink, options, edits, errorCode)) {
2114                 break;
2115             }
2116             if (!ByteSinkUtil::appendChange(prevSrc, src, buffer.getStart(), buffer.length(),
2117                                             *sink, edits, errorCode)) {
2118                 break;
2119             }
2120             prevBoundary = src;
2121         }
2122     }
2123     return TRUE;
2124 }
2125 
hasCompBoundaryBefore(const UChar * src,const UChar * limit) const2126 UBool Normalizer2Impl::hasCompBoundaryBefore(const UChar *src, const UChar *limit) const {
2127     if (src == limit || *src < minCompNoMaybeCP) {
2128         return TRUE;
2129     }
2130     UChar32 c;
2131     uint16_t norm16;
2132     UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, src, limit, c, norm16);
2133     return norm16HasCompBoundaryBefore(norm16);
2134 }
2135 
hasCompBoundaryBefore(const uint8_t * src,const uint8_t * limit) const2136 UBool Normalizer2Impl::hasCompBoundaryBefore(const uint8_t *src, const uint8_t *limit) const {
2137     if (src == limit) {
2138         return TRUE;
2139     }
2140     uint16_t norm16;
2141     UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, norm16);
2142     return norm16HasCompBoundaryBefore(norm16);
2143 }
2144 
hasCompBoundaryAfter(const UChar * start,const UChar * p,UBool onlyContiguous) const2145 UBool Normalizer2Impl::hasCompBoundaryAfter(const UChar *start, const UChar *p,
2146                                             UBool onlyContiguous) const {
2147     if (start == p) {
2148         return TRUE;
2149     }
2150     UChar32 c;
2151     uint16_t norm16;
2152     UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, start, p, c, norm16);
2153     return norm16HasCompBoundaryAfter(norm16, onlyContiguous);
2154 }
2155 
hasCompBoundaryAfter(const uint8_t * start,const uint8_t * p,UBool onlyContiguous) const2156 UBool Normalizer2Impl::hasCompBoundaryAfter(const uint8_t *start, const uint8_t *p,
2157                                             UBool onlyContiguous) const {
2158     if (start == p) {
2159         return TRUE;
2160     }
2161     uint16_t norm16;
2162     UCPTRIE_FAST_U8_PREV(normTrie, UCPTRIE_16, start, p, norm16);
2163     return norm16HasCompBoundaryAfter(norm16, onlyContiguous);
2164 }
2165 
/**
 * Scans backward from p toward start and returns the position of the nearest
 * composition boundary at or before p.
 * Returns start if no boundary is found earlier.
 */
const UChar *Normalizer2Impl::findPreviousCompBoundary(const UChar *start, const UChar *p,
                                                       UBool onlyContiguous) const {
    while (p != start) {
        // Remember where this code point ends before the macro moves p to its start.
        const UChar *const afterCodePoint = p;
        UChar32 cp;
        uint16_t norm16;
        UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, start, p, cp, norm16);
        if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
            // Boundary directly after the code point just examined.
            return afterCodePoint;
        }
        if (hasCompBoundaryBefore(cp, norm16)) {
            // Boundary directly before the code point just examined.
            return p;
        }
    }
    return p;
}
2182 
/**
 * Scans forward from p toward limit and returns the position of the nearest
 * composition boundary at or after p.
 * Returns limit if no boundary is found earlier.
 */
const UChar *Normalizer2Impl::findNextCompBoundary(const UChar *p, const UChar *limit,
                                                   UBool onlyContiguous) const {
    while (p != limit) {
        // Remember where this code point starts before the macro moves p past it.
        const UChar *const beforeCodePoint = p;
        UChar32 cp;
        uint16_t norm16;
        UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, p, limit, cp, norm16);
        if (hasCompBoundaryBefore(cp, norm16)) {
            // Boundary directly before the code point just examined.
            return beforeCodePoint;
        }
        if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
            // Boundary directly after the code point just examined.
            return p;
        }
    }
    return p;
}
2199 
getPreviousTrailCC(const UChar * start,const UChar * p) const2200 uint8_t Normalizer2Impl::getPreviousTrailCC(const UChar *start, const UChar *p) const {
2201     if (start == p) {
2202         return 0;
2203     }
2204     int32_t i = (int32_t)(p - start);
2205     UChar32 c;
2206     U16_PREV(start, 0, i, c);
2207     return (uint8_t)getFCD16(c);
2208 }
2209 
/**
 * UTF-8 variant: returns the low 8 bits of the FCD16 value of the code point
 * that ends at p in the text beginning at start, or 0 if p is at the start.
 */
uint8_t Normalizer2Impl::getPreviousTrailCC(const uint8_t *start, const uint8_t *p) const {
    if (p != start) {
        // U8_PREV works with an index relative to start and moves it back by one code point.
        int32_t index = static_cast<int32_t>(p - start);
        UChar32 prevCP;
        U8_PREV(start, 0, index, prevCP);
        return static_cast<uint8_t>(getFCD16(prevCP));
    }
    return 0;
}
2219 
// Note: normalizer2impl.cpp r30982 (2011-nov-27)
// still had getFCDTrie() which built and cached an FCD trie.
// That provided faster access to FCD data than getFCD16FromNormData()
// but required synchronization and consumed some 10kB of heap memory
// in any process that uses FCD (e.g., via collation).
// minDecompNoCP etc. and smallFCD[] are intended to help with any loss of performance,
// at least for ASCII & CJK.
2227 
2228 // Ticket 20907 - The optimizer in MSVC/Visual Studio versions below 16.4 has trouble with this
2229 // function on Windows ARM64. As a work-around, we disable optimizations for this function.
2230 // This work-around could/should be removed once the following versions of Visual Studio are no
2231 // longer supported: All versions of VS2017, and versions of VS2019 below 16.4.
2232 #if (defined(_MSC_VER) && (defined(_M_ARM64)) && (_MSC_VER < 1924))
2233 #pragma optimize( "", off )
2234 #endif
2235 // Gets the FCD value from the regular normalization data.
getFCD16FromNormData(UChar32 c) const2236 uint16_t Normalizer2Impl::getFCD16FromNormData(UChar32 c) const {
2237     uint16_t norm16=getNorm16(c);
2238     if (norm16 >= limitNoNo) {
2239         if(norm16>=MIN_NORMAL_MAYBE_YES) {
2240             // combining mark
2241             norm16=getCCFromNormalYesOrMaybe(norm16);
2242             return norm16|(norm16<<8);
2243         } else if(norm16>=minMaybeYes) {
2244             return 0;
2245         } else {  // isDecompNoAlgorithmic(norm16)
2246             uint16_t deltaTrailCC = norm16 & DELTA_TCCC_MASK;
2247             if (deltaTrailCC <= DELTA_TCCC_1) {
2248                 return deltaTrailCC >> OFFSET_SHIFT;
2249             }
2250             // Maps to an isCompYesAndZeroCC.
2251             c=mapAlgorithmic(c, norm16);
2252             norm16=getRawNorm16(c);
2253         }
2254     }
2255     if(norm16<=minYesNo || isHangulLVT(norm16)) {
2256         // no decomposition or Hangul syllable, all zeros
2257         return 0;
2258     }
2259     // c decomposes, get everything from the variable-length extra data
2260     const uint16_t *mapping=getMapping(norm16);
2261     uint16_t firstUnit=*mapping;
2262     norm16=firstUnit>>8;  // tccc
2263     if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
2264         norm16|=*(mapping-1)&0xff00;  // lccc
2265     }
2266     return norm16;
2267 }
2268 #if (defined(_MSC_VER) && (defined(_M_ARM64)) && (_MSC_VER < 1924))
2269 #pragma optimize( "", on )
2270 #endif
2271 
2272 // Dual functionality:
2273 // buffer!=NULL: normalize
2274 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
2275 const UChar *
makeFCD(const UChar * src,const UChar * limit,ReorderingBuffer * buffer,UErrorCode & errorCode) const2276 Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit,
2277                          ReorderingBuffer *buffer,
2278                          UErrorCode &errorCode) const {
2279     // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
2280     // Similar to the prevBoundary in the compose() implementation.
2281     const UChar *prevBoundary=src;
2282     int32_t prevFCD16=0;
2283     if(limit==NULL) {
2284         src=copyLowPrefixFromNulTerminated(src, minLcccCP, buffer, errorCode);
2285         if(U_FAILURE(errorCode)) {
2286             return src;
2287         }
2288         if(prevBoundary<src) {
2289             prevBoundary=src;
2290             // We know that the previous character's lccc==0.
2291             // Fetching the fcd16 value was deferred for this below-U+0300 code point.
2292             prevFCD16=getFCD16(*(src-1));
2293             if(prevFCD16>1) {
2294                 --prevBoundary;
2295             }
2296         }
2297         limit=u_strchr(src, 0);
2298     }
2299 
2300     // Note: In this function we use buffer->appendZeroCC() because we track
2301     // the lead and trail combining classes here, rather than leaving it to
2302     // the ReorderingBuffer.
2303     // The exception is the call to decomposeShort() which uses the buffer
2304     // in the normal way.
2305 
2306     const UChar *prevSrc;
2307     UChar32 c=0;
2308     uint16_t fcd16=0;
2309 
2310     for(;;) {
2311         // count code units with lccc==0
2312         for(prevSrc=src; src!=limit;) {
2313             if((c=*src)<minLcccCP) {
2314                 prevFCD16=~c;
2315                 ++src;
2316             } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
2317                 prevFCD16=0;
2318                 ++src;
2319             } else {
2320                 if(U16_IS_LEAD(c)) {
2321                     UChar c2;
2322                     if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
2323                         c=U16_GET_SUPPLEMENTARY(c, c2);
2324                     }
2325                 }
2326                 if((fcd16=getFCD16FromNormData(c))<=0xff) {
2327                     prevFCD16=fcd16;
2328                     src+=U16_LENGTH(c);
2329                 } else {
2330                     break;
2331                 }
2332             }
2333         }
2334         // copy these code units all at once
2335         if(src!=prevSrc) {
2336             if(buffer!=NULL && !buffer->appendZeroCC(prevSrc, src, errorCode)) {
2337                 break;
2338             }
2339             if(src==limit) {
2340                 break;
2341             }
2342             prevBoundary=src;
2343             // We know that the previous character's lccc==0.
2344             if(prevFCD16<0) {
2345                 // Fetching the fcd16 value was deferred for this below-minLcccCP code point.
2346                 UChar32 prev=~prevFCD16;
2347                 if(prev<minDecompNoCP) {
2348                     prevFCD16=0;
2349                 } else {
2350                     prevFCD16=getFCD16FromNormData(prev);
2351                     if(prevFCD16>1) {
2352                         --prevBoundary;
2353                     }
2354                 }
2355             } else {
2356                 const UChar *p=src-1;
2357                 if(U16_IS_TRAIL(*p) && prevSrc<p && U16_IS_LEAD(*(p-1))) {
2358                     --p;
2359                     // Need to fetch the previous character's FCD value because
2360                     // prevFCD16 was just for the trail surrogate code point.
2361                     prevFCD16=getFCD16FromNormData(U16_GET_SUPPLEMENTARY(p[0], p[1]));
2362                     // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
2363                 }
2364                 if(prevFCD16>1) {
2365                     prevBoundary=p;
2366                 }
2367             }
2368             // The start of the current character (c).
2369             prevSrc=src;
2370         } else if(src==limit) {
2371             break;
2372         }
2373 
2374         src+=U16_LENGTH(c);
2375         // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
2376         // Check for proper order, and decompose locally if necessary.
2377         if((prevFCD16&0xff)<=(fcd16>>8)) {
2378             // proper order: prev tccc <= current lccc
2379             if((fcd16&0xff)<=1) {
2380                 prevBoundary=src;
2381             }
2382             if(buffer!=NULL && !buffer->appendZeroCC(c, errorCode)) {
2383                 break;
2384             }
2385             prevFCD16=fcd16;
2386             continue;
2387         } else if(buffer==NULL) {
2388             return prevBoundary;  // quick check "no"
2389         } else {
2390             /*
2391              * Back out the part of the source that we copied or appended
2392              * already but is now going to be decomposed.
2393              * prevSrc is set to after what was copied/appended.
2394              */
2395             buffer->removeSuffix((int32_t)(prevSrc-prevBoundary));
2396             /*
2397              * Find the part of the source that needs to be decomposed,
2398              * up to the next safe boundary.
2399              */
2400             src=findNextFCDBoundary(src, limit);
2401             /*
2402              * The source text does not fulfill the conditions for FCD.
2403              * Decompose and reorder a limited piece of the text.
2404              */
2405             decomposeShort(prevBoundary, src, FALSE, FALSE, *buffer, errorCode);
2406             if (U_FAILURE(errorCode)) {
2407                 break;
2408             }
2409             prevBoundary=src;
2410             prevFCD16=0;
2411         }
2412     }
2413     return src;
2414 }
2415 
makeFCDAndAppend(const UChar * src,const UChar * limit,UBool doMakeFCD,UnicodeString & safeMiddle,ReorderingBuffer & buffer,UErrorCode & errorCode) const2416 void Normalizer2Impl::makeFCDAndAppend(const UChar *src, const UChar *limit,
2417                                        UBool doMakeFCD,
2418                                        UnicodeString &safeMiddle,
2419                                        ReorderingBuffer &buffer,
2420                                        UErrorCode &errorCode) const {
2421     if(!buffer.isEmpty()) {
2422         const UChar *firstBoundaryInSrc=findNextFCDBoundary(src, limit);
2423         if(src!=firstBoundaryInSrc) {
2424             const UChar *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStart(),
2425                                                                     buffer.getLimit());
2426             int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastBoundaryInDest);
2427             UnicodeString middle(lastBoundaryInDest, destSuffixLength);
2428             buffer.removeSuffix(destSuffixLength);
2429             safeMiddle=middle;
2430             middle.append(src, (int32_t)(firstBoundaryInSrc-src));
2431             const UChar *middleStart=middle.getBuffer();
2432             makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode);
2433             if(U_FAILURE(errorCode)) {
2434                 return;
2435             }
2436             src=firstBoundaryInSrc;
2437         }
2438     }
2439     if(doMakeFCD) {
2440         makeFCD(src, limit, &buffer, errorCode);
2441     } else {
2442         if(limit==NULL) {  // appendZeroCC() needs limit!=NULL
2443             limit=u_strchr(src, 0);
2444         }
2445         buffer.appendZeroCC(src, limit, errorCode);
2446     }
2447 }
2448 
/**
 * Scans backward from p toward start and returns the nearest FCD boundary:
 * after a code point below minDecompNoCP or one with a decomposition boundary
 * after it, or before a code point with a decomposition boundary before it.
 * Returns the position reached (start) if none is found earlier.
 */
const UChar *Normalizer2Impl::findPreviousFCDBoundary(const UChar *start, const UChar *p) const {
    while(start<p) {
        // Remember where this code point ends before the macro moves p to its start.
        const UChar *const afterCodePoint=p;
        UChar32 cp;
        uint16_t norm16;
        UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, start, p, cp, norm16);
        if(cp<minDecompNoCP || norm16HasDecompBoundaryAfter(norm16)) {
            return afterCodePoint;
        }
        if(norm16HasDecompBoundaryBefore(norm16)) {
            return p;
        }
    }
    return p;
}
2464 
/**
 * Scans forward from p toward limit and returns the nearest FCD boundary:
 * before a code point below minLcccCP or one with a decomposition boundary
 * before it, or after a code point with a decomposition boundary after it.
 * Returns the position reached if none is found earlier; if p is not below
 * limit on entry, p is returned unchanged.
 */
const UChar *Normalizer2Impl::findNextFCDBoundary(const UChar *p, const UChar *limit) const {
    while(p<limit) {
        // Remember where this code point starts before the macro moves p past it.
        const UChar *const beforeCodePoint=p;
        UChar32 cp;
        uint16_t norm16;
        UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, p, limit, cp, norm16);
        if(cp<minLcccCP || norm16HasDecompBoundaryBefore(norm16)) {
            return beforeCodePoint;
        }
        if(norm16HasDecompBoundaryAfter(norm16)) {
            return p;
        }
    }
    return p;
}
2480 
2481 // CanonicalIterator data -------------------------------------------------- ***
2482 
CanonIterData(UErrorCode & errorCode)2483 CanonIterData::CanonIterData(UErrorCode &errorCode) :
2484         mutableTrie(umutablecptrie_open(0, 0, &errorCode)), trie(nullptr),
2485         canonStartSets(uprv_deleteUObject, NULL, errorCode) {}
2486 
~CanonIterData()2487 CanonIterData::~CanonIterData() {
2488     umutablecptrie_close(mutableTrie);
2489     ucptrie_close(trie);
2490 }
2491 
addToStartSet(UChar32 origin,UChar32 decompLead,UErrorCode & errorCode)2492 void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode) {
2493     uint32_t canonValue = umutablecptrie_get(mutableTrie, decompLead);
2494     if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) {
2495         // origin is the first character whose decomposition starts with
2496         // the character for which we are setting the value.
2497         umutablecptrie_set(mutableTrie, decompLead, canonValue|origin, &errorCode);
2498     } else {
2499         // origin is not the first character, or it is U+0000.
2500         UnicodeSet *set;
2501         if((canonValue&CANON_HAS_SET)==0) {
2502             set=new UnicodeSet;
2503             if(set==NULL) {
2504                 errorCode=U_MEMORY_ALLOCATION_ERROR;
2505                 return;
2506             }
2507             UChar32 firstOrigin=(UChar32)(canonValue&CANON_VALUE_MASK);
2508             canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|(uint32_t)canonStartSets.size();
2509             umutablecptrie_set(mutableTrie, decompLead, canonValue, &errorCode);
2510             canonStartSets.addElement(set, errorCode);
2511             if(firstOrigin!=0) {
2512                 set->add(firstOrigin);
2513             }
2514         } else {
2515             set=(UnicodeSet *)canonStartSets[(int32_t)(canonValue&CANON_VALUE_MASK)];
2516         }
2517         set->add(origin);
2518     }
2519 }
2520 
2521 // C++ class for friend access to private Normalizer2Impl members.
2522 class InitCanonIterData {
2523 public:
2524     static void doInit(Normalizer2Impl *impl, UErrorCode &errorCode);
2525 };
2526 
2527 U_CDECL_BEGIN
2528 
2529 // UInitOnce instantiation function for CanonIterData
2530 static void U_CALLCONV
initCanonIterData(Normalizer2Impl * impl,UErrorCode & errorCode)2531 initCanonIterData(Normalizer2Impl *impl, UErrorCode &errorCode) {
2532     InitCanonIterData::doInit(impl, errorCode);
2533 }
2534 
2535 U_CDECL_END
2536 
doInit(Normalizer2Impl * impl,UErrorCode & errorCode)2537 void InitCanonIterData::doInit(Normalizer2Impl *impl, UErrorCode &errorCode) {
2538     U_ASSERT(impl->fCanonIterData == NULL);
2539     impl->fCanonIterData = new CanonIterData(errorCode);
2540     if (impl->fCanonIterData == NULL) {
2541         errorCode=U_MEMORY_ALLOCATION_ERROR;
2542     }
2543     if (U_SUCCESS(errorCode)) {
2544         UChar32 start = 0, end;
2545         uint32_t value;
2546         while ((end = ucptrie_getRange(impl->normTrie, start,
2547                                        UCPMAP_RANGE_FIXED_LEAD_SURROGATES, Normalizer2Impl::INERT,
2548                                        nullptr, nullptr, &value)) >= 0) {
2549             // Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters.
2550             if (value != Normalizer2Impl::INERT) {
2551                 impl->makeCanonIterDataFromNorm16(start, end, value, *impl->fCanonIterData, errorCode);
2552             }
2553             start = end + 1;
2554         }
2555 #ifdef UCPTRIE_DEBUG
2556         umutablecptrie_setName(impl->fCanonIterData->mutableTrie, "CanonIterData");
2557 #endif
2558         impl->fCanonIterData->trie = umutablecptrie_buildImmutable(
2559             impl->fCanonIterData->mutableTrie, UCPTRIE_TYPE_SMALL, UCPTRIE_VALUE_BITS_32, &errorCode);
2560         umutablecptrie_close(impl->fCanonIterData->mutableTrie);
2561         impl->fCanonIterData->mutableTrie = nullptr;
2562     }
2563     if (U_FAILURE(errorCode)) {
2564         delete impl->fCanonIterData;
2565         impl->fCanonIterData = NULL;
2566     }
2567 }
2568 
makeCanonIterDataFromNorm16(UChar32 start,UChar32 end,const uint16_t norm16,CanonIterData & newData,UErrorCode & errorCode) const2569 void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, const uint16_t norm16,
2570                                                   CanonIterData &newData,
2571                                                   UErrorCode &errorCode) const {
2572     if(isInert(norm16) || (minYesNo<=norm16 && norm16<minNoNo)) {
2573         // Inert, or 2-way mapping (including Hangul syllable).
2574         // We do not write a canonStartSet for any yesNo character.
2575         // Composites from 2-way mappings are added at runtime from the
2576         // starter's compositions list, and the other characters in
2577         // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are
2578         // "maybe" characters.
2579         return;
2580     }
2581     for(UChar32 c=start; c<=end; ++c) {
2582         uint32_t oldValue = umutablecptrie_get(newData.mutableTrie, c);
2583         uint32_t newValue=oldValue;
2584         if(isMaybeOrNonZeroCC(norm16)) {
2585             // not a segment starter if it occurs in a decomposition or has cc!=0
2586             newValue|=CANON_NOT_SEGMENT_STARTER;
2587             if(norm16<MIN_NORMAL_MAYBE_YES) {
2588                 newValue|=CANON_HAS_COMPOSITIONS;
2589             }
2590         } else if(norm16<minYesNo) {
2591             newValue|=CANON_HAS_COMPOSITIONS;
2592         } else {
2593             // c has a one-way decomposition
2594             UChar32 c2=c;
2595             // Do not modify the whole-range norm16 value.
2596             uint16_t norm16_2=norm16;
2597             if (isDecompNoAlgorithmic(norm16_2)) {
2598                 // Maps to an isCompYesAndZeroCC.
2599                 c2 = mapAlgorithmic(c2, norm16_2);
2600                 norm16_2 = getRawNorm16(c2);
2601                 // No compatibility mappings for the CanonicalIterator.
2602                 U_ASSERT(!(isHangulLV(norm16_2) || isHangulLVT(norm16_2)));
2603             }
2604             if (norm16_2 > minYesNo) {
2605                 // c decomposes, get everything from the variable-length extra data
2606                 const uint16_t *mapping=getMapping(norm16_2);
2607                 uint16_t firstUnit=*mapping;
2608                 int32_t length=firstUnit&MAPPING_LENGTH_MASK;
2609                 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
2610                     if(c==c2 && (*(mapping-1)&0xff)!=0) {
2611                         newValue|=CANON_NOT_SEGMENT_STARTER;  // original c has cc!=0
2612                     }
2613                 }
2614                 // Skip empty mappings (no characters in the decomposition).
2615                 if(length!=0) {
2616                     ++mapping;  // skip over the firstUnit
2617                     // add c to first code point's start set
2618                     int32_t i=0;
2619                     U16_NEXT_UNSAFE(mapping, i, c2);
2620                     newData.addToStartSet(c, c2, errorCode);
2621                     // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a
2622                     // one-way mapping. A 2-way mapping is possible here after
2623                     // intermediate algorithmic mapping.
2624                     if(norm16_2>=minNoNo) {
2625                         while(i<length) {
2626                             U16_NEXT_UNSAFE(mapping, i, c2);
2627                             uint32_t c2Value = umutablecptrie_get(newData.mutableTrie, c2);
2628                             if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {
2629                                 umutablecptrie_set(newData.mutableTrie, c2,
2630                                                    c2Value|CANON_NOT_SEGMENT_STARTER, &errorCode);
2631                             }
2632                         }
2633                     }
2634                 }
2635             } else {
2636                 // c decomposed to c2 algorithmically; c has cc==0
2637                 newData.addToStartSet(c, c2, errorCode);
2638             }
2639         }
2640         if(newValue!=oldValue) {
2641             umutablecptrie_set(newData.mutableTrie, c, newValue, &errorCode);
2642         }
2643     }
2644 }
2645 
ensureCanonIterData(UErrorCode & errorCode) const2646 UBool Normalizer2Impl::ensureCanonIterData(UErrorCode &errorCode) const {
2647     // Logically const: Synchronized instantiation.
2648     Normalizer2Impl *me=const_cast<Normalizer2Impl *>(this);
2649     umtx_initOnce(me->fCanonIterDataInitOnce, &initCanonIterData, me, errorCode);
2650     return U_SUCCESS(errorCode);
2651 }
2652 
getCanonValue(UChar32 c) const2653 int32_t Normalizer2Impl::getCanonValue(UChar32 c) const {
2654     return (int32_t)ucptrie_get(fCanonIterData->trie, c);
2655 }
2656 
getCanonStartSet(int32_t n) const2657 const UnicodeSet &Normalizer2Impl::getCanonStartSet(int32_t n) const {
2658     return *(const UnicodeSet *)fCanonIterData->canonStartSets[n];
2659 }
2660 
isCanonSegmentStarter(UChar32 c) const2661 UBool Normalizer2Impl::isCanonSegmentStarter(UChar32 c) const {
2662     return getCanonValue(c)>=0;
2663 }
2664 
getCanonStartSet(UChar32 c,UnicodeSet & set) const2665 UBool Normalizer2Impl::getCanonStartSet(UChar32 c, UnicodeSet &set) const {
2666     int32_t canonValue=getCanonValue(c)&~CANON_NOT_SEGMENT_STARTER;
2667     if(canonValue==0) {
2668         return FALSE;
2669     }
2670     set.clear();
2671     int32_t value=canonValue&CANON_VALUE_MASK;
2672     if((canonValue&CANON_HAS_SET)!=0) {
2673         set.addAll(getCanonStartSet(value));
2674     } else if(value!=0) {
2675         set.add(value);
2676     }
2677     if((canonValue&CANON_HAS_COMPOSITIONS)!=0) {
2678         uint16_t norm16=getRawNorm16(c);
2679         if(norm16==JAMO_L) {
2680             UChar32 syllable=
2681                 (UChar32)(Hangul::HANGUL_BASE+(c-Hangul::JAMO_L_BASE)*Hangul::JAMO_VT_COUNT);
2682             set.add(syllable, syllable+Hangul::JAMO_VT_COUNT-1);
2683         } else {
2684             addComposites(getCompositionsList(norm16), set);
2685         }
2686     }
2687     return TRUE;
2688 }
2689 
2690 U_NAMESPACE_END
2691 
2692 // Normalizer2 data swapping ----------------------------------------------- ***
2693 
2694 U_NAMESPACE_USE
2695 
2696 U_CAPI int32_t U_EXPORT2
unorm2_swap(const UDataSwapper * ds,const void * inData,int32_t length,void * outData,UErrorCode * pErrorCode)2697 unorm2_swap(const UDataSwapper *ds,
2698             const void *inData, int32_t length, void *outData,
2699             UErrorCode *pErrorCode) {
2700     const UDataInfo *pInfo;
2701     int32_t headerSize;
2702 
2703     const uint8_t *inBytes;
2704     uint8_t *outBytes;
2705 
2706     const int32_t *inIndexes;
2707     int32_t indexes[Normalizer2Impl::IX_TOTAL_SIZE+1];
2708 
2709     int32_t i, offset, nextOffset, size;
2710 
2711     /* udata_swapDataHeader checks the arguments */
2712     headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
2713     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
2714         return 0;
2715     }
2716 
2717     /* check data format and format version */
2718     pInfo=(const UDataInfo *)((const char *)inData+4);
2719     uint8_t formatVersion0=pInfo->formatVersion[0];
2720     if(!(
2721         pInfo->dataFormat[0]==0x4e &&   /* dataFormat="Nrm2" */
2722         pInfo->dataFormat[1]==0x72 &&
2723         pInfo->dataFormat[2]==0x6d &&
2724         pInfo->dataFormat[3]==0x32 &&
2725         (1<=formatVersion0 && formatVersion0<=4)
2726     )) {
2727         udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n",
2728                          pInfo->dataFormat[0], pInfo->dataFormat[1],
2729                          pInfo->dataFormat[2], pInfo->dataFormat[3],
2730                          pInfo->formatVersion[0]);
2731         *pErrorCode=U_UNSUPPORTED_ERROR;
2732         return 0;
2733     }
2734 
2735     inBytes=(const uint8_t *)inData+headerSize;
2736     outBytes=(uint8_t *)outData+headerSize;
2737 
2738     inIndexes=(const int32_t *)inBytes;
2739     int32_t minIndexesLength;
2740     if(formatVersion0==1) {
2741         minIndexesLength=Normalizer2Impl::IX_MIN_MAYBE_YES+1;
2742     } else if(formatVersion0==2) {
2743         minIndexesLength=Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY+1;
2744     } else {
2745         minIndexesLength=Normalizer2Impl::IX_MIN_LCCC_CP+1;
2746     }
2747 
2748     if(length>=0) {
2749         length-=headerSize;
2750         if(length<minIndexesLength*4) {
2751             udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n",
2752                              length);
2753             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2754             return 0;
2755         }
2756     }
2757 
2758     /* read the first few indexes */
2759     for(i=0; i<UPRV_LENGTHOF(indexes); ++i) {
2760         indexes[i]=udata_readInt32(ds, inIndexes[i]);
2761     }
2762 
2763     /* get the total length of the data */
2764     size=indexes[Normalizer2Impl::IX_TOTAL_SIZE];
2765 
2766     if(length>=0) {
2767         if(length<size) {
2768             udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for all of Normalizer2 data\n",
2769                              length);
2770             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2771             return 0;
2772         }
2773 
2774         /* copy the data for inaccessible bytes */
2775         if(inBytes!=outBytes) {
2776             uprv_memcpy(outBytes, inBytes, size);
2777         }
2778 
2779         offset=0;
2780 
2781         /* swap the int32_t indexes[] */
2782         nextOffset=indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET];
2783         ds->swapArray32(ds, inBytes, nextOffset-offset, outBytes, pErrorCode);
2784         offset=nextOffset;
2785 
2786         /* swap the trie */
2787         nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET];
2788         utrie_swapAnyVersion(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
2789         offset=nextOffset;
2790 
2791         /* swap the uint16_t extraData[] */
2792         nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET];
2793         ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
2794         offset=nextOffset;
2795 
2796         /* no need to swap the uint8_t smallFCD[] (new in formatVersion 2) */
2797         nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET+1];
2798         offset=nextOffset;
2799 
2800         U_ASSERT(offset==size);
2801     }
2802 
2803     return headerSize+size;
2804 }
2805 
2806 #endif  // !UCONFIG_NO_NORMALIZATION
2807