1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 2009-2014, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: normalizer2impl.cpp
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2009nov22
16 * created by: Markus W. Scherer
17 */
18
19 // #define UCPTRIE_DEBUG
20
21 #include "unicode/utypes.h"
22
23 #if !UCONFIG_NO_NORMALIZATION
24
25 #include "unicode/bytestream.h"
26 #include "unicode/edits.h"
27 #include "unicode/normalizer2.h"
28 #include "unicode/stringoptions.h"
29 #include "unicode/ucptrie.h"
30 #include "unicode/udata.h"
31 #include "unicode/umutablecptrie.h"
32 #include "unicode/ustring.h"
33 #include "unicode/utf16.h"
34 #include "unicode/utf8.h"
35 #include "bytesinkutil.h"
36 #include "cmemory.h"
37 #include "mutex.h"
38 #include "normalizer2impl.h"
39 #include "putilimp.h"
40 #include "uassert.h"
41 #include "ucptrie_impl.h"
42 #include "uset_imp.h"
43 #include "uvector.h"
44
45 U_NAMESPACE_BEGIN
46
47 namespace {
48
49 /**
50 * UTF-8 lead byte for minNoMaybeCP.
51 * Can be lower than the actual lead byte for c.
52 * Typically U+0300 for NFC/NFD, U+00A0 for NFKC/NFKD, U+0041 for NFKC_Casefold.
53 */
leadByteForCP(UChar32 c)54 inline uint8_t leadByteForCP(UChar32 c) {
55 if (c <= 0x7f) {
56 return (uint8_t)c;
57 } else if (c <= 0x7ff) {
58 return (uint8_t)(0xc0+(c>>6));
59 } else {
60 // Should not occur because ccc(U+0300)!=0.
61 return 0xe0;
62 }
63 }
64
65 /**
66 * Returns the code point from one single well-formed UTF-8 byte sequence
67 * between cpStart and cpLimit.
68 *
69 * Trie UTF-8 macros do not assemble whole code points (for efficiency).
70 * When we do need the code point, we call this function.
71 * We should not need it for normalization-inert data (norm16==0).
72 * Illegal sequences yield the error value norm16==0 just like real normalization-inert code points.
73 */
codePointFromValidUTF8(const uint8_t * cpStart,const uint8_t * cpLimit)74 UChar32 codePointFromValidUTF8(const uint8_t *cpStart, const uint8_t *cpLimit) {
75 // Similar to U8_NEXT_UNSAFE(s, i, c).
76 U_ASSERT(cpStart < cpLimit);
77 uint8_t c = *cpStart;
78 switch(cpLimit-cpStart) {
79 case 1:
80 return c;
81 case 2:
82 return ((c&0x1f)<<6) | (cpStart[1]&0x3f);
83 case 3:
84 // no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar)
85 return (UChar)((c<<12) | ((cpStart[1]&0x3f)<<6) | (cpStart[2]&0x3f));
86 case 4:
87 return ((c&7)<<18) | ((cpStart[1]&0x3f)<<12) | ((cpStart[2]&0x3f)<<6) | (cpStart[3]&0x3f);
88 default:
89 UPRV_UNREACHABLE; // Should not occur.
90 #ifdef U_STRINGI_PATCHES
91 return c;
92 #endif
93 }
94 }
95
96 /**
97 * Returns the last code point in [start, p[ if it is valid and in U+1000..U+D7FF.
98 * Otherwise returns a negative value.
99 */
previousHangulOrJamo(const uint8_t * start,const uint8_t * p)100 UChar32 previousHangulOrJamo(const uint8_t *start, const uint8_t *p) {
101 if ((p - start) >= 3) {
102 p -= 3;
103 uint8_t l = *p;
104 uint8_t t1, t2;
105 if (0xe1 <= l && l <= 0xed &&
106 (t1 = (uint8_t)(p[1] - 0x80)) <= 0x3f &&
107 (t2 = (uint8_t)(p[2] - 0x80)) <= 0x3f &&
108 (l < 0xed || t1 <= 0x1f)) {
109 return ((l & 0xf) << 12) | (t1 << 6) | t2;
110 }
111 }
112 return U_SENTINEL;
113 }
114
/**
 * Returns the offset from the Jamo T base if [src, limit[ starts with a single Jamo T code point.
 * Otherwise returns a negative value.
 *
 * The Jamo T range in UTF-8 is E1 86 A8..E1 87 82.
 * The first Jamo T is U+11A8 but JAMO_T_BASE is 11A7, so the smallest
 * returned offset is 1; offset 0 does not correspond to any conjoining Jamo.
 *
 * @param src   start of the bytes to inspect
 * @param limit end of the readable bytes
 * @return the positive offset, or -1
 */
int32_t getJamoTMinusBase(const uint8_t *src, const uint8_t *limit) {
    if ((limit - src) < 3 || src[0] != 0xe1) {
        return -1;
    }
    switch (src[1]) {
    case 0x86: {
        const uint8_t last = src[2];
        if (0xa8 <= last && last <= 0xbf) {
            return last - 0xa7;
        }
        break;
    }
    case 0x87: {
        const uint8_t last = src[2];
        // Only trail bytes 80..82 continue the Jamo T range in this second block.
        if (0x80 <= last && last <= 0x82) {
            // One second-byte step is worth 0x40 code points.
            return last - (0xa7 - 0x40);
        }
        break;
    }
    default:
        break;
    }
    return -1;
}
138
139 void
appendCodePointDelta(const uint8_t * cpStart,const uint8_t * cpLimit,int32_t delta,ByteSink & sink,Edits * edits)140 appendCodePointDelta(const uint8_t *cpStart, const uint8_t *cpLimit, int32_t delta,
141 ByteSink &sink, Edits *edits) {
142 char buffer[U8_MAX_LENGTH];
143 int32_t length;
144 int32_t cpLength = (int32_t)(cpLimit - cpStart);
145 if (cpLength == 1) {
146 // The builder makes ASCII map to ASCII.
147 buffer[0] = (uint8_t)(*cpStart + delta);
148 length = 1;
149 } else {
150 int32_t trail = *(cpLimit-1) + delta;
151 if (0x80 <= trail && trail <= 0xbf) {
152 // The delta only changes the last trail byte.
153 --cpLimit;
154 length = 0;
155 do { buffer[length++] = *cpStart++; } while (cpStart < cpLimit);
156 buffer[length++] = (uint8_t)trail;
157 } else {
158 // Decode the code point, add the delta, re-encode.
159 UChar32 c = codePointFromValidUTF8(cpStart, cpLimit) + delta;
160 length = 0;
161 U8_APPEND_UNSAFE(buffer, length, c);
162 }
163 }
164 if (edits != nullptr) {
165 edits->addReplace(cpLength, length);
166 }
167 sink.Append(buffer, length);
168 }
169
170 } // namespace
171
172 // ReorderingBuffer -------------------------------------------------------- ***
173
ReorderingBuffer(const Normalizer2Impl & ni,UnicodeString & dest,UErrorCode & errorCode)174 ReorderingBuffer::ReorderingBuffer(const Normalizer2Impl &ni, UnicodeString &dest,
175 UErrorCode &errorCode) :
176 impl(ni), str(dest),
177 start(str.getBuffer(8)), reorderStart(start), limit(start),
178 remainingCapacity(str.getCapacity()), lastCC(0) {
179 if (start == nullptr && U_SUCCESS(errorCode)) {
180 // getBuffer() already did str.setToBogus()
181 errorCode = U_MEMORY_ALLOCATION_ERROR;
182 }
183 }
184
init(int32_t destCapacity,UErrorCode & errorCode)185 UBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) {
186 int32_t length=str.length();
187 start=str.getBuffer(destCapacity);
188 if(start==NULL) {
189 // getBuffer() already did str.setToBogus()
190 errorCode=U_MEMORY_ALLOCATION_ERROR;
191 return FALSE;
192 }
193 limit=start+length;
194 remainingCapacity=str.getCapacity()-length;
195 reorderStart=start;
196 if(start==limit) {
197 lastCC=0;
198 } else {
199 setIterator();
200 lastCC=previousCC();
201 // Set reorderStart after the last code point with cc<=1 if there is one.
202 if(lastCC>1) {
203 while(previousCC()>1) {}
204 }
205 reorderStart=codePointLimit;
206 }
207 return TRUE;
208 }
209
equals(const UChar * otherStart,const UChar * otherLimit) const210 UBool ReorderingBuffer::equals(const UChar *otherStart, const UChar *otherLimit) const {
211 int32_t length=(int32_t)(limit-start);
212 return
213 length==(int32_t)(otherLimit-otherStart) &&
214 0==u_memcmp(start, otherStart, length);
215 }
216
equals(const uint8_t * otherStart,const uint8_t * otherLimit) const217 UBool ReorderingBuffer::equals(const uint8_t *otherStart, const uint8_t *otherLimit) const {
218 U_ASSERT((otherLimit - otherStart) <= INT32_MAX); // ensured by caller
219 int32_t length = (int32_t)(limit - start);
220 int32_t otherLength = (int32_t)(otherLimit - otherStart);
221 // For equal strings, UTF-8 is at least as long as UTF-16, and at most three times as long.
222 if (otherLength < length || (otherLength / 3) > length) {
223 return FALSE;
224 }
225 // Compare valid strings from between normalization boundaries.
226 // (Invalid sequences are normalization-inert.)
227 for (int32_t i = 0, j = 0;;) {
228 if (i >= length) {
229 return j >= otherLength;
230 } else if (j >= otherLength) {
231 return FALSE;
232 }
233 // Not at the end of either string yet.
234 UChar32 c, other;
235 U16_NEXT_UNSAFE(start, i, c);
236 U8_NEXT_UNSAFE(otherStart, j, other);
237 if (c != other) {
238 return FALSE;
239 }
240 }
241 }
242
appendSupplementary(UChar32 c,uint8_t cc,UErrorCode & errorCode)243 UBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode) {
244 if(remainingCapacity<2 && !resize(2, errorCode)) {
245 return FALSE;
246 }
247 if(lastCC<=cc || cc==0) {
248 limit[0]=U16_LEAD(c);
249 limit[1]=U16_TRAIL(c);
250 limit+=2;
251 lastCC=cc;
252 if(cc<=1) {
253 reorderStart=limit;
254 }
255 } else {
256 insert(c, cc);
257 }
258 remainingCapacity-=2;
259 return TRUE;
260 }
261
append(const UChar * s,int32_t length,UBool isNFD,uint8_t leadCC,uint8_t trailCC,UErrorCode & errorCode)262 UBool ReorderingBuffer::append(const UChar *s, int32_t length, UBool isNFD,
263 uint8_t leadCC, uint8_t trailCC,
264 UErrorCode &errorCode) {
265 if(length==0) {
266 return TRUE;
267 }
268 if(remainingCapacity<length && !resize(length, errorCode)) {
269 return FALSE;
270 }
271 remainingCapacity-=length;
272 if(lastCC<=leadCC || leadCC==0) {
273 if(trailCC<=1) {
274 reorderStart=limit+length;
275 } else if(leadCC<=1) {
276 reorderStart=limit+1; // Ok if not a code point boundary.
277 }
278 const UChar *sLimit=s+length;
279 do { *limit++=*s++; } while(s!=sLimit);
280 lastCC=trailCC;
281 } else {
282 int32_t i=0;
283 UChar32 c;
284 U16_NEXT(s, i, length, c);
285 insert(c, leadCC); // insert first code point
286 while(i<length) {
287 U16_NEXT(s, i, length, c);
288 if(i<length) {
289 if (isNFD) {
290 leadCC = Normalizer2Impl::getCCFromYesOrMaybe(impl.getRawNorm16(c));
291 } else {
292 leadCC = impl.getCC(impl.getNorm16(c));
293 }
294 } else {
295 leadCC=trailCC;
296 }
297 append(c, leadCC, errorCode);
298 }
299 }
300 return TRUE;
301 }
302
appendZeroCC(UChar32 c,UErrorCode & errorCode)303 UBool ReorderingBuffer::appendZeroCC(UChar32 c, UErrorCode &errorCode) {
304 int32_t cpLength=U16_LENGTH(c);
305 if(remainingCapacity<cpLength && !resize(cpLength, errorCode)) {
306 return FALSE;
307 }
308 remainingCapacity-=cpLength;
309 if(cpLength==1) {
310 *limit++=(UChar)c;
311 } else {
312 limit[0]=U16_LEAD(c);
313 limit[1]=U16_TRAIL(c);
314 limit+=2;
315 }
316 lastCC=0;
317 reorderStart=limit;
318 return TRUE;
319 }
320
appendZeroCC(const UChar * s,const UChar * sLimit,UErrorCode & errorCode)321 UBool ReorderingBuffer::appendZeroCC(const UChar *s, const UChar *sLimit, UErrorCode &errorCode) {
322 if(s==sLimit) {
323 return TRUE;
324 }
325 int32_t length=(int32_t)(sLimit-s);
326 if(remainingCapacity<length && !resize(length, errorCode)) {
327 return FALSE;
328 }
329 u_memcpy(limit, s, length);
330 limit+=length;
331 remainingCapacity-=length;
332 lastCC=0;
333 reorderStart=limit;
334 return TRUE;
335 }
336
remove()337 void ReorderingBuffer::remove() {
338 reorderStart=limit=start;
339 remainingCapacity=str.getCapacity();
340 lastCC=0;
341 }
342
removeSuffix(int32_t suffixLength)343 void ReorderingBuffer::removeSuffix(int32_t suffixLength) {
344 if(suffixLength<(limit-start)) {
345 limit-=suffixLength;
346 remainingCapacity+=suffixLength;
347 } else {
348 limit=start;
349 remainingCapacity=str.getCapacity();
350 }
351 lastCC=0;
352 reorderStart=limit;
353 }
354
resize(int32_t appendLength,UErrorCode & errorCode)355 UBool ReorderingBuffer::resize(int32_t appendLength, UErrorCode &errorCode) {
356 int32_t reorderStartIndex=(int32_t)(reorderStart-start);
357 int32_t length=(int32_t)(limit-start);
358 str.releaseBuffer(length);
359 int32_t newCapacity=length+appendLength;
360 int32_t doubleCapacity=2*str.getCapacity();
361 if(newCapacity<doubleCapacity) {
362 newCapacity=doubleCapacity;
363 }
364 if(newCapacity<256) {
365 newCapacity=256;
366 }
367 start=str.getBuffer(newCapacity);
368 if(start==NULL) {
369 // getBuffer() already did str.setToBogus()
370 errorCode=U_MEMORY_ALLOCATION_ERROR;
371 return FALSE;
372 }
373 reorderStart=start+reorderStartIndex;
374 limit=start+length;
375 remainingCapacity=str.getCapacity()-length;
376 return TRUE;
377 }
378
skipPrevious()379 void ReorderingBuffer::skipPrevious() {
380 codePointLimit=codePointStart;
381 UChar c=*--codePointStart;
382 if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(*(codePointStart-1))) {
383 --codePointStart;
384 }
385 }
386
previousCC()387 uint8_t ReorderingBuffer::previousCC() {
388 codePointLimit=codePointStart;
389 if(reorderStart>=codePointStart) {
390 return 0;
391 }
392 UChar32 c=*--codePointStart;
393 UChar c2;
394 if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(c2=*(codePointStart-1))) {
395 --codePointStart;
396 c=U16_GET_SUPPLEMENTARY(c2, c);
397 }
398 return impl.getCCFromYesOrMaybeCP(c);
399 }
400
401 // Inserts c somewhere before the last character.
402 // Requires 0<cc<lastCC which implies reorderStart<limit.
insert(UChar32 c,uint8_t cc)403 void ReorderingBuffer::insert(UChar32 c, uint8_t cc) {
404 for(setIterator(), skipPrevious(); previousCC()>cc;) {}
405 // insert c at codePointLimit, after the character with prevCC<=cc
406 UChar *q=limit;
407 UChar *r=limit+=U16_LENGTH(c);
408 do {
409 *--r=*--q;
410 } while(codePointLimit!=q);
411 writeCodePoint(q, c);
412 if(cc<=1) {
413 reorderStart=r;
414 }
415 }
416
417 // Normalizer2Impl --------------------------------------------------------- ***
418
// Data holder declared here and defined elsewhere in this file.
// NOTE(review): presumably built lazily for canonical-iterator support — confirm
// against ensureCanonIterData().
struct CanonIterData : public UMemory {
    CanonIterData(UErrorCode &errorCode);
    ~CanonIterData();
    void addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode);
    UMutableCPTrie *mutableTrie;  // NOTE(review): presumably the build-time form of the trie — confirm
    UCPTrie *trie;
    UVector canonStartSets;  // contains UnicodeSet *
};
427
// Deletes the CanonIterData object held in fCanonIterData (deleting nullptr is a no-op).
Normalizer2Impl::~Normalizer2Impl() {
    delete fCanonIterData;
}
431
432 void
init(const int32_t * inIndexes,const UCPTrie * inTrie,const uint16_t * inExtraData,const uint8_t * inSmallFCD)433 Normalizer2Impl::init(const int32_t *inIndexes, const UCPTrie *inTrie,
434 const uint16_t *inExtraData, const uint8_t *inSmallFCD) {
435 minDecompNoCP = static_cast<UChar>(inIndexes[IX_MIN_DECOMP_NO_CP]);
436 minCompNoMaybeCP = static_cast<UChar>(inIndexes[IX_MIN_COMP_NO_MAYBE_CP]);
437 minLcccCP = static_cast<UChar>(inIndexes[IX_MIN_LCCC_CP]);
438
439 minYesNo = static_cast<uint16_t>(inIndexes[IX_MIN_YES_NO]);
440 minYesNoMappingsOnly = static_cast<uint16_t>(inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY]);
441 minNoNo = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO]);
442 minNoNoCompBoundaryBefore = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]);
443 minNoNoCompNoMaybeCC = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC]);
444 minNoNoEmpty = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO_EMPTY]);
445 limitNoNo = static_cast<uint16_t>(inIndexes[IX_LIMIT_NO_NO]);
446 minMaybeYes = static_cast<uint16_t>(inIndexes[IX_MIN_MAYBE_YES]);
447 U_ASSERT((minMaybeYes & 7) == 0); // 8-aligned for noNoDelta bit fields
448 centerNoNoDelta = (minMaybeYes >> DELTA_SHIFT) - MAX_DELTA - 1;
449
450 normTrie=inTrie;
451
452 maybeYesCompositions=inExtraData;
453 extraData=maybeYesCompositions+((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT);
454
455 smallFCD=inSmallFCD;
456 }
457
458 U_CDECL_BEGIN
459
460 static uint32_t U_CALLCONV
segmentStarterMapper(const void *,uint32_t value)461 segmentStarterMapper(const void * /*context*/, uint32_t value) {
462 return value&CANON_NOT_SEGMENT_STARTER;
463 }
464
465 U_CDECL_END
466
467 void
addLcccChars(UnicodeSet & set) const468 Normalizer2Impl::addLcccChars(UnicodeSet &set) const {
469 UChar32 start = 0, end;
470 uint32_t norm16;
471 while ((end = ucptrie_getRange(normTrie, start, UCPMAP_RANGE_FIXED_LEAD_SURROGATES, INERT,
472 nullptr, nullptr, &norm16)) >= 0) {
473 if (norm16 > Normalizer2Impl::MIN_NORMAL_MAYBE_YES &&
474 norm16 != Normalizer2Impl::JAMO_VT) {
475 set.add(start, end);
476 } else if (minNoNoCompNoMaybeCC <= norm16 && norm16 < limitNoNo) {
477 uint16_t fcd16 = getFCD16(start);
478 if (fcd16 > 0xff) { set.add(start, end); }
479 }
480 start = end + 1;
481 }
482 }
483
484 void
addPropertyStarts(const USetAdder * sa,UErrorCode &) const485 Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const {
486 // Add the start code point of each same-value range of the trie.
487 UChar32 start = 0, end;
488 uint32_t value;
489 while ((end = ucptrie_getRange(normTrie, start, UCPMAP_RANGE_FIXED_LEAD_SURROGATES, INERT,
490 nullptr, nullptr, &value)) >= 0) {
491 sa->add(sa->set, start);
492 if (start != end && isAlgorithmicNoNo((uint16_t)value) &&
493 (value & Normalizer2Impl::DELTA_TCCC_MASK) > Normalizer2Impl::DELTA_TCCC_1) {
494 // Range of code points with same-norm16-value algorithmic decompositions.
495 // They might have different non-zero FCD16 values.
496 uint16_t prevFCD16 = getFCD16(start);
497 while (++start <= end) {
498 uint16_t fcd16 = getFCD16(start);
499 if (fcd16 != prevFCD16) {
500 sa->add(sa->set, start);
501 prevFCD16 = fcd16;
502 }
503 }
504 }
505 start = end + 1;
506 }
507
508 /* add Hangul LV syllables and LV+1 because of skippables */
509 for(UChar c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) {
510 sa->add(sa->set, c);
511 sa->add(sa->set, c+1);
512 }
513 sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */
514 }
515
516 void
addCanonIterPropertyStarts(const USetAdder * sa,UErrorCode & errorCode) const517 Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const {
518 // Add the start code point of each same-value range of the canonical iterator data trie.
519 if (!ensureCanonIterData(errorCode)) { return; }
520 // Currently only used for the SEGMENT_STARTER property.
521 UChar32 start = 0, end;
522 uint32_t value;
523 while ((end = ucptrie_getRange(fCanonIterData->trie, start, UCPMAP_RANGE_NORMAL, 0,
524 segmentStarterMapper, nullptr, &value)) >= 0) {
525 sa->add(sa->set, start);
526 start = end + 1;
527 }
528 }
529
530 const UChar *
copyLowPrefixFromNulTerminated(const UChar * src,UChar32 minNeedDataCP,ReorderingBuffer * buffer,UErrorCode & errorCode) const531 Normalizer2Impl::copyLowPrefixFromNulTerminated(const UChar *src,
532 UChar32 minNeedDataCP,
533 ReorderingBuffer *buffer,
534 UErrorCode &errorCode) const {
535 // Make some effort to support NUL-terminated strings reasonably.
536 // Take the part of the fast quick check loop that does not look up
537 // data and check the first part of the string.
538 // After this prefix, determine the string length to simplify the rest
539 // of the code.
540 const UChar *prevSrc=src;
541 UChar c;
542 while((c=*src++)<minNeedDataCP && c!=0) {}
543 // Back out the last character for full processing.
544 // Copy this prefix.
545 if(--src!=prevSrc) {
546 if(buffer!=NULL) {
547 buffer->appendZeroCC(prevSrc, src, errorCode);
548 }
549 }
550 return src;
551 }
552
553 UnicodeString &
decompose(const UnicodeString & src,UnicodeString & dest,UErrorCode & errorCode) const554 Normalizer2Impl::decompose(const UnicodeString &src, UnicodeString &dest,
555 UErrorCode &errorCode) const {
556 if(U_FAILURE(errorCode)) {
557 dest.setToBogus();
558 return dest;
559 }
560 const UChar *sArray=src.getBuffer();
561 if(&dest==&src || sArray==NULL) {
562 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
563 dest.setToBogus();
564 return dest;
565 }
566 decompose(sArray, sArray+src.length(), dest, src.length(), errorCode);
567 return dest;
568 }
569
570 void
decompose(const UChar * src,const UChar * limit,UnicodeString & dest,int32_t destLengthEstimate,UErrorCode & errorCode) const571 Normalizer2Impl::decompose(const UChar *src, const UChar *limit,
572 UnicodeString &dest,
573 int32_t destLengthEstimate,
574 UErrorCode &errorCode) const {
575 if(destLengthEstimate<0 && limit!=NULL) {
576 destLengthEstimate=(int32_t)(limit-src);
577 }
578 dest.remove();
579 ReorderingBuffer buffer(*this, dest);
580 if(buffer.init(destLengthEstimate, errorCode)) {
581 decompose(src, limit, &buffer, errorCode);
582 }
583 }
584
585 // Dual functionality:
586 // buffer!=NULL: normalize
587 // buffer==NULL: isNormalized/spanQuickCheckYes
588 const UChar *
decompose(const UChar * src,const UChar * limit,ReorderingBuffer * buffer,UErrorCode & errorCode) const589 Normalizer2Impl::decompose(const UChar *src, const UChar *limit,
590 ReorderingBuffer *buffer,
591 UErrorCode &errorCode) const {
592 UChar32 minNoCP=minDecompNoCP;
593 if(limit==NULL) {
594 src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode);
595 if(U_FAILURE(errorCode)) {
596 return src;
597 }
598 limit=u_strchr(src, 0);
599 }
600
601 const UChar *prevSrc;
602 UChar32 c=0;
603 uint16_t norm16=0;
604
605 // only for quick check
606 const UChar *prevBoundary=src;
607 uint8_t prevCC=0;
608
609 for(;;) {
610 // count code units below the minimum or with irrelevant data for the quick check
611 for(prevSrc=src; src!=limit;) {
612 if( (c=*src)<minNoCP ||
613 isMostDecompYesAndZeroCC(norm16=UCPTRIE_FAST_BMP_GET(normTrie, UCPTRIE_16, c))
614 ) {
615 ++src;
616 } else if(!U16_IS_LEAD(c)) {
617 break;
618 } else {
619 UChar c2;
620 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
621 c=U16_GET_SUPPLEMENTARY(c, c2);
622 norm16=UCPTRIE_FAST_SUPP_GET(normTrie, UCPTRIE_16, c);
623 if(isMostDecompYesAndZeroCC(norm16)) {
624 src+=2;
625 } else {
626 break;
627 }
628 } else {
629 ++src; // unpaired lead surrogate: inert
630 }
631 }
632 }
633 // copy these code units all at once
634 if(src!=prevSrc) {
635 if(buffer!=NULL) {
636 if(!buffer->appendZeroCC(prevSrc, src, errorCode)) {
637 break;
638 }
639 } else {
640 prevCC=0;
641 prevBoundary=src;
642 }
643 }
644 if(src==limit) {
645 break;
646 }
647
648 // Check one above-minimum, relevant code point.
649 src+=U16_LENGTH(c);
650 if(buffer!=NULL) {
651 if(!decompose(c, norm16, *buffer, errorCode)) {
652 break;
653 }
654 } else {
655 if(isDecompYes(norm16)) {
656 uint8_t cc=getCCFromYesOrMaybe(norm16);
657 if(prevCC<=cc || cc==0) {
658 prevCC=cc;
659 if(cc<=1) {
660 prevBoundary=src;
661 }
662 continue;
663 }
664 }
665 return prevBoundary; // "no" or cc out of order
666 }
667 }
668 return src;
669 }
670
671 // Decompose a short piece of text which is likely to contain characters that
672 // fail the quick check loop and/or where the quick check loop's overhead
673 // is unlikely to be amortized.
674 // Called by the compose() and makeFCD() implementations.
675 const UChar *
decomposeShort(const UChar * src,const UChar * limit,UBool stopAtCompBoundary,UBool onlyContiguous,ReorderingBuffer & buffer,UErrorCode & errorCode) const676 Normalizer2Impl::decomposeShort(const UChar *src, const UChar *limit,
677 UBool stopAtCompBoundary, UBool onlyContiguous,
678 ReorderingBuffer &buffer, UErrorCode &errorCode) const {
679 if (U_FAILURE(errorCode)) {
680 return nullptr;
681 }
682 while(src<limit) {
683 if (stopAtCompBoundary && *src < minCompNoMaybeCP) {
684 return src;
685 }
686 const UChar *prevSrc = src;
687 UChar32 c;
688 uint16_t norm16;
689 UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, src, limit, c, norm16);
690 if (stopAtCompBoundary && norm16HasCompBoundaryBefore(norm16)) {
691 return prevSrc;
692 }
693 if(!decompose(c, norm16, buffer, errorCode)) {
694 return nullptr;
695 }
696 if (stopAtCompBoundary && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
697 return src;
698 }
699 }
700 return src;
701 }
702
decompose(UChar32 c,uint16_t norm16,ReorderingBuffer & buffer,UErrorCode & errorCode) const703 UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16,
704 ReorderingBuffer &buffer,
705 UErrorCode &errorCode) const {
706 // get the decomposition and the lead and trail cc's
707 if (norm16 >= limitNoNo) {
708 if (isMaybeOrNonZeroCC(norm16)) {
709 return buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode);
710 }
711 // Maps to an isCompYesAndZeroCC.
712 c=mapAlgorithmic(c, norm16);
713 norm16=getRawNorm16(c);
714 }
715 if (norm16 < minYesNo) {
716 // c does not decompose
717 return buffer.append(c, 0, errorCode);
718 } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
719 // Hangul syllable: decompose algorithmically
720 UChar jamos[3];
721 return buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode);
722 }
723 // c decomposes, get everything from the variable-length extra data
724 const uint16_t *mapping=getMapping(norm16);
725 uint16_t firstUnit=*mapping;
726 int32_t length=firstUnit&MAPPING_LENGTH_MASK;
727 uint8_t leadCC, trailCC;
728 trailCC=(uint8_t)(firstUnit>>8);
729 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
730 leadCC=(uint8_t)(*(mapping-1)>>8);
731 } else {
732 leadCC=0;
733 }
734 return buffer.append((const UChar *)mapping+1, length, TRUE, leadCC, trailCC, errorCode);
735 }
736
737 // Dual functionality:
738 // sink != nullptr: normalize
739 // sink == nullptr: isNormalized/spanQuickCheckYes
740 const uint8_t *
decomposeUTF8(uint32_t options,const uint8_t * src,const uint8_t * limit,ByteSink * sink,Edits * edits,UErrorCode & errorCode) const741 Normalizer2Impl::decomposeUTF8(uint32_t options,
742 const uint8_t *src, const uint8_t *limit,
743 ByteSink *sink, Edits *edits, UErrorCode &errorCode) const {
744 U_ASSERT(limit != nullptr);
745 UnicodeString s16;
746 uint8_t minNoLead = leadByteForCP(minDecompNoCP);
747
748 const uint8_t *prevBoundary = src;
749 // only for quick check
750 uint8_t prevCC = 0;
751
752 for (;;) {
753 // Fast path: Scan over a sequence of characters below the minimum "no" code point,
754 // or with (decompYes && ccc==0) properties.
755 const uint8_t *fastStart = src;
756 const uint8_t *prevSrc;
757 uint16_t norm16 = 0;
758
759 for (;;) {
760 if (src == limit) {
761 if (prevBoundary != limit && sink != nullptr) {
762 ByteSinkUtil::appendUnchanged(prevBoundary, limit,
763 *sink, options, edits, errorCode);
764 }
765 return src;
766 }
767 if (*src < minNoLead) {
768 ++src;
769 } else {
770 prevSrc = src;
771 UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, norm16);
772 if (!isMostDecompYesAndZeroCC(norm16)) {
773 break;
774 }
775 }
776 }
777 // isMostDecompYesAndZeroCC(norm16) is false, that is, norm16>=minYesNo,
778 // and the current character at [prevSrc..src[ is not a common case with cc=0
779 // (MIN_NORMAL_MAYBE_YES or JAMO_VT).
780 // It could still be a maybeYes with cc=0.
781 if (prevSrc != fastStart) {
782 // The fast path looped over yes/0 characters before the current one.
783 if (sink != nullptr &&
784 !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
785 *sink, options, edits, errorCode)) {
786 break;
787 }
788 prevBoundary = prevSrc;
789 prevCC = 0;
790 }
791
792 // Medium-fast path: Quick check.
793 if (isMaybeOrNonZeroCC(norm16)) {
794 // Does not decompose.
795 uint8_t cc = getCCFromYesOrMaybe(norm16);
796 if (prevCC <= cc || cc == 0) {
797 prevCC = cc;
798 if (cc <= 1) {
799 if (sink != nullptr &&
800 !ByteSinkUtil::appendUnchanged(prevBoundary, src,
801 *sink, options, edits, errorCode)) {
802 break;
803 }
804 prevBoundary = src;
805 }
806 continue;
807 }
808 }
809 if (sink == nullptr) {
810 return prevBoundary; // quick check: "no" or cc out of order
811 }
812
813 // Slow path
814 // Decompose up to and including the current character.
815 if (prevBoundary != prevSrc && norm16HasDecompBoundaryBefore(norm16)) {
816 if (!ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
817 *sink, options, edits, errorCode)) {
818 break;
819 }
820 prevBoundary = prevSrc;
821 }
822 ReorderingBuffer buffer(*this, s16, errorCode);
823 if (U_FAILURE(errorCode)) {
824 break;
825 }
826 decomposeShort(prevBoundary, src, STOP_AT_LIMIT, FALSE /* onlyContiguous */,
827 buffer, errorCode);
828 // Decompose until the next boundary.
829 if (buffer.getLastCC() > 1) {
830 src = decomposeShort(src, limit, STOP_AT_DECOMP_BOUNDARY, FALSE /* onlyContiguous */,
831 buffer, errorCode);
832 }
833 if (U_FAILURE(errorCode)) {
834 break;
835 }
836 if ((src - prevSrc) > INT32_MAX) { // guard before buffer.equals()
837 errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
838 break;
839 }
840 // We already know there was a change if the original character decomposed;
841 // otherwise compare.
842 if (isMaybeOrNonZeroCC(norm16) && buffer.equals(prevBoundary, src)) {
843 if (!ByteSinkUtil::appendUnchanged(prevBoundary, src,
844 *sink, options, edits, errorCode)) {
845 break;
846 }
847 } else {
848 if (!ByteSinkUtil::appendChange(prevBoundary, src, buffer.getStart(), buffer.length(),
849 *sink, edits, errorCode)) {
850 break;
851 }
852 }
853 prevBoundary = src;
854 prevCC = 0;
855 }
856 return src;
857 }
858
859 const uint8_t *
decomposeShort(const uint8_t * src,const uint8_t * limit,StopAt stopAt,UBool onlyContiguous,ReorderingBuffer & buffer,UErrorCode & errorCode) const860 Normalizer2Impl::decomposeShort(const uint8_t *src, const uint8_t *limit,
861 StopAt stopAt, UBool onlyContiguous,
862 ReorderingBuffer &buffer, UErrorCode &errorCode) const {
863 if (U_FAILURE(errorCode)) {
864 return nullptr;
865 }
866 while (src < limit) {
867 const uint8_t *prevSrc = src;
868 uint16_t norm16;
869 UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, norm16);
870 // Get the decomposition and the lead and trail cc's.
871 UChar32 c = U_SENTINEL;
872 if (norm16 >= limitNoNo) {
873 if (isMaybeOrNonZeroCC(norm16)) {
874 // No comp boundaries around this character.
875 uint8_t cc = getCCFromYesOrMaybe(norm16);
876 if (cc == 0 && stopAt == STOP_AT_DECOMP_BOUNDARY) {
877 return prevSrc;
878 }
879 c = codePointFromValidUTF8(prevSrc, src);
880 if (!buffer.append(c, cc, errorCode)) {
881 return nullptr;
882 }
883 if (stopAt == STOP_AT_DECOMP_BOUNDARY && buffer.getLastCC() <= 1) {
884 return src;
885 }
886 continue;
887 }
888 // Maps to an isCompYesAndZeroCC.
889 if (stopAt != STOP_AT_LIMIT) {
890 return prevSrc;
891 }
892 c = codePointFromValidUTF8(prevSrc, src);
893 c = mapAlgorithmic(c, norm16);
894 norm16 = getRawNorm16(c);
895 } else if (stopAt != STOP_AT_LIMIT && norm16 < minNoNoCompNoMaybeCC) {
896 return prevSrc;
897 }
898 // norm16!=INERT guarantees that [prevSrc, src[ is valid UTF-8.
899 // We do not see invalid UTF-8 here because
900 // its norm16==INERT is normalization-inert,
901 // so it gets copied unchanged in the fast path,
902 // and we stop the slow path where invalid UTF-8 begins.
903 // c >= 0 is the result of an algorithmic mapping.
904 U_ASSERT(c >= 0 || norm16 != INERT);
905 if (norm16 < minYesNo) {
906 if (c < 0) {
907 c = codePointFromValidUTF8(prevSrc, src);
908 }
909 // does not decompose
910 if (!buffer.append(c, 0, errorCode)) {
911 return nullptr;
912 }
913 } else if (isHangulLV(norm16) || isHangulLVT(norm16)) {
914 // Hangul syllable: decompose algorithmically
915 if (c < 0) {
916 c = codePointFromValidUTF8(prevSrc, src);
917 }
918 char16_t jamos[3];
919 if (!buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode)) {
920 return nullptr;
921 }
922 } else {
923 // The character decomposes, get everything from the variable-length extra data.
924 const uint16_t *mapping = getMapping(norm16);
925 uint16_t firstUnit = *mapping;
926 int32_t length = firstUnit & MAPPING_LENGTH_MASK;
927 uint8_t trailCC = (uint8_t)(firstUnit >> 8);
928 uint8_t leadCC;
929 if (firstUnit & MAPPING_HAS_CCC_LCCC_WORD) {
930 leadCC = (uint8_t)(*(mapping-1) >> 8);
931 } else {
932 leadCC = 0;
933 }
934 if (leadCC == 0 && stopAt == STOP_AT_DECOMP_BOUNDARY) {
935 return prevSrc;
936 }
937 if (!buffer.append((const char16_t *)mapping+1, length, TRUE, leadCC, trailCC, errorCode)) {
938 return nullptr;
939 }
940 }
941 if ((stopAt == STOP_AT_COMP_BOUNDARY && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) ||
942 (stopAt == STOP_AT_DECOMP_BOUNDARY && buffer.getLastCC() <= 1)) {
943 return src;
944 }
945 }
946 return src;
947 }
948
949 const UChar *
getDecomposition(UChar32 c,UChar buffer[4],int32_t & length) const950 Normalizer2Impl::getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const {
951 uint16_t norm16;
952 if(c<minDecompNoCP || isMaybeOrNonZeroCC(norm16=getNorm16(c))) {
953 // c does not decompose
954 return nullptr;
955 }
956 const UChar *decomp = nullptr;
957 if(isDecompNoAlgorithmic(norm16)) {
958 // Maps to an isCompYesAndZeroCC.
959 c=mapAlgorithmic(c, norm16);
960 decomp=buffer;
961 length=0;
962 U16_APPEND_UNSAFE(buffer, length, c);
963 // The mapping might decompose further.
964 norm16 = getRawNorm16(c);
965 }
966 if (norm16 < minYesNo) {
967 return decomp;
968 } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
969 // Hangul syllable: decompose algorithmically
970 length=Hangul::decompose(c, buffer);
971 return buffer;
972 }
973 // c decomposes, get everything from the variable-length extra data
974 const uint16_t *mapping=getMapping(norm16);
975 length=*mapping&MAPPING_LENGTH_MASK;
976 return (const UChar *)mapping+1;
977 }
978
979 // The capacity of the buffer must be 30=MAPPING_LENGTH_MASK-1
980 // so that a raw mapping fits that consists of one unit ("rm0")
981 // plus all but the first two code units of the normal mapping.
982 // The maximum length of a normal mapping is 31=MAPPING_LENGTH_MASK.
983 const UChar *
getRawDecomposition(UChar32 c,UChar buffer[30],int32_t & length) const984 Normalizer2Impl::getRawDecomposition(UChar32 c, UChar buffer[30], int32_t &length) const {
985 uint16_t norm16;
986 if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
987 // c does not decompose
988 return NULL;
989 } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
990 // Hangul syllable: decompose algorithmically
991 Hangul::getRawDecomposition(c, buffer);
992 length=2;
993 return buffer;
994 } else if(isDecompNoAlgorithmic(norm16)) {
995 c=mapAlgorithmic(c, norm16);
996 length=0;
997 U16_APPEND_UNSAFE(buffer, length, c);
998 return buffer;
999 }
1000 // c decomposes, get everything from the variable-length extra data
1001 const uint16_t *mapping=getMapping(norm16);
1002 uint16_t firstUnit=*mapping;
1003 int32_t mLength=firstUnit&MAPPING_LENGTH_MASK; // length of normal mapping
1004 if(firstUnit&MAPPING_HAS_RAW_MAPPING) {
1005 // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word.
1006 // Bit 7=MAPPING_HAS_CCC_LCCC_WORD
1007 const uint16_t *rawMapping=mapping-((firstUnit>>7)&1)-1;
1008 uint16_t rm0=*rawMapping;
1009 if(rm0<=MAPPING_LENGTH_MASK) {
1010 length=rm0;
1011 return (const UChar *)rawMapping-rm0;
1012 } else {
1013 // Copy the normal mapping and replace its first two code units with rm0.
1014 buffer[0]=(UChar)rm0;
1015 u_memcpy(buffer+1, (const UChar *)mapping+1+2, mLength-2);
1016 length=mLength-1;
1017 return buffer;
1018 }
1019 } else {
1020 length=mLength;
1021 return (const UChar *)mapping+1;
1022 }
1023 }
1024
decomposeAndAppend(const UChar * src,const UChar * limit,UBool doDecompose,UnicodeString & safeMiddle,ReorderingBuffer & buffer,UErrorCode & errorCode) const1025 void Normalizer2Impl::decomposeAndAppend(const UChar *src, const UChar *limit,
1026 UBool doDecompose,
1027 UnicodeString &safeMiddle,
1028 ReorderingBuffer &buffer,
1029 UErrorCode &errorCode) const {
1030 buffer.copyReorderableSuffixTo(safeMiddle);
1031 if(doDecompose) {
1032 decompose(src, limit, &buffer, errorCode);
1033 return;
1034 }
1035 // Just merge the strings at the boundary.
1036 bool isFirst = true;
1037 uint8_t firstCC = 0, prevCC = 0, cc;
1038 const UChar *p = src;
1039 while (p != limit) {
1040 const UChar *codePointStart = p;
1041 UChar32 c;
1042 uint16_t norm16;
1043 UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, p, limit, c, norm16);
1044 if ((cc = getCC(norm16)) == 0) {
1045 p = codePointStart;
1046 break;
1047 }
1048 if (isFirst) {
1049 firstCC = cc;
1050 isFirst = false;
1051 }
1052 prevCC = cc;
1053 }
1054 if(limit==NULL) { // appendZeroCC() needs limit!=NULL
1055 limit=u_strchr(p, 0);
1056 }
1057
1058 if (buffer.append(src, (int32_t)(p - src), FALSE, firstCC, prevCC, errorCode)) {
1059 buffer.appendZeroCC(p, limit, errorCode);
1060 }
1061 }
1062
hasDecompBoundaryBefore(UChar32 c) const1063 UBool Normalizer2Impl::hasDecompBoundaryBefore(UChar32 c) const {
1064 return c < minLcccCP || (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) ||
1065 norm16HasDecompBoundaryBefore(getNorm16(c));
1066 }
1067
norm16HasDecompBoundaryBefore(uint16_t norm16) const1068 UBool Normalizer2Impl::norm16HasDecompBoundaryBefore(uint16_t norm16) const {
1069 if (norm16 < minNoNoCompNoMaybeCC) {
1070 return TRUE;
1071 }
1072 if (norm16 >= limitNoNo) {
1073 return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT;
1074 }
1075 // c decomposes, get everything from the variable-length extra data
1076 const uint16_t *mapping=getMapping(norm16);
1077 uint16_t firstUnit=*mapping;
1078 // TRUE if leadCC==0 (hasFCDBoundaryBefore())
1079 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0;
1080 }
1081
hasDecompBoundaryAfter(UChar32 c) const1082 UBool Normalizer2Impl::hasDecompBoundaryAfter(UChar32 c) const {
1083 if (c < minDecompNoCP) {
1084 return TRUE;
1085 }
1086 if (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) {
1087 return TRUE;
1088 }
1089 return norm16HasDecompBoundaryAfter(getNorm16(c));
1090 }
1091
norm16HasDecompBoundaryAfter(uint16_t norm16) const1092 UBool Normalizer2Impl::norm16HasDecompBoundaryAfter(uint16_t norm16) const {
1093 if(norm16 <= minYesNo || isHangulLVT(norm16)) {
1094 return TRUE;
1095 }
1096 if (norm16 >= limitNoNo) {
1097 if (isMaybeOrNonZeroCC(norm16)) {
1098 return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT;
1099 }
1100 // Maps to an isCompYesAndZeroCC.
1101 return (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1;
1102 }
1103 // c decomposes, get everything from the variable-length extra data
1104 const uint16_t *mapping=getMapping(norm16);
1105 uint16_t firstUnit=*mapping;
1106 // decomp after-boundary: same as hasFCDBoundaryAfter(),
1107 // fcd16<=1 || trailCC==0
1108 if(firstUnit>0x1ff) {
1109 return FALSE; // trailCC>1
1110 }
1111 if(firstUnit<=0xff) {
1112 return TRUE; // trailCC==0
1113 }
1114 // if(trailCC==1) test leadCC==0, same as checking for before-boundary
1115 // TRUE if leadCC==0 (hasFCDBoundaryBefore())
1116 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0;
1117 }
1118
1119 /*
1120 * Finds the recomposition result for
1121 * a forward-combining "lead" character,
1122 * specified with a pointer to its compositions list,
1123 * and a backward-combining "trail" character.
1124 *
1125 * If the lead and trail characters combine, then this function returns
1126 * the following "compositeAndFwd" value:
1127 * Bits 21..1 composite character
1128 * Bit 0 set if the composite is a forward-combining starter
1129 * otherwise it returns -1.
1130 *
1131 * The compositions list has (trail, compositeAndFwd) pair entries,
1132 * encoded as either pairs or triples of 16-bit units.
1133 * The last entry has the high bit of its first unit set.
1134 *
1135 * The list is sorted by ascending trail characters (there are no duplicates).
1136 * A linear search is used.
1137 *
1138 * See normalizer2impl.h for a more detailed description
1139 * of the compositions list format.
1140 */
combine(const uint16_t * list,UChar32 trail)1141 int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) {
1142 uint16_t key1, firstUnit;
1143 if(trail<COMP_1_TRAIL_LIMIT) {
1144 // trail character is 0..33FF
1145 // result entry may have 2 or 3 units
1146 key1=(uint16_t)(trail<<1);
1147 while(key1>(firstUnit=*list)) {
1148 list+=2+(firstUnit&COMP_1_TRIPLE);
1149 }
1150 if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
1151 if(firstUnit&COMP_1_TRIPLE) {
1152 return ((int32_t)list[1]<<16)|list[2];
1153 } else {
1154 return list[1];
1155 }
1156 }
1157 } else {
1158 // trail character is 3400..10FFFF
1159 // result entry has 3 units
1160 key1=(uint16_t)(COMP_1_TRAIL_LIMIT+
1161 (((trail>>COMP_1_TRAIL_SHIFT))&
1162 ~COMP_1_TRIPLE));
1163 uint16_t key2=(uint16_t)(trail<<COMP_2_TRAIL_SHIFT);
1164 uint16_t secondUnit;
1165 for(;;) {
1166 if(key1>(firstUnit=*list)) {
1167 list+=2+(firstUnit&COMP_1_TRIPLE);
1168 } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
1169 if(key2>(secondUnit=list[1])) {
1170 if(firstUnit&COMP_1_LAST_TUPLE) {
1171 break;
1172 } else {
1173 list+=3;
1174 }
1175 } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
1176 return ((int32_t)(secondUnit&~COMP_2_TRAIL_MASK)<<16)|list[2];
1177 } else {
1178 break;
1179 }
1180 } else {
1181 break;
1182 }
1183 }
1184 }
1185 return -1;
1186 }
1187
1188 /**
1189 * @param list some character's compositions list
1190 * @param set recursively receives the composites from these compositions
1191 */
addComposites(const uint16_t * list,UnicodeSet & set) const1192 void Normalizer2Impl::addComposites(const uint16_t *list, UnicodeSet &set) const {
1193 uint16_t firstUnit;
1194 int32_t compositeAndFwd;
1195 do {
1196 firstUnit=*list;
1197 if((firstUnit&COMP_1_TRIPLE)==0) {
1198 compositeAndFwd=list[1];
1199 list+=2;
1200 } else {
1201 compositeAndFwd=(((int32_t)list[1]&~COMP_2_TRAIL_MASK)<<16)|list[2];
1202 list+=3;
1203 }
1204 UChar32 composite=compositeAndFwd>>1;
1205 if((compositeAndFwd&1)!=0) {
1206 addComposites(getCompositionsListForComposite(getRawNorm16(composite)), set);
1207 }
1208 set.add(composite);
1209 } while((firstUnit&COMP_1_LAST_TUPLE)==0);
1210 }
1211
/*
 * Recomposes the buffer text starting at recomposeStartIndex
 * (which is in NFD - decomposed and canonically ordered),
 * and truncates the buffer contents.
 *
 * The text is modified in place: a starter is overwritten with the composite,
 * the combining mark is removed by moving the following text over it,
 * and at the end the buffer's reordering limit is set to the new end of the text.
 *
 * Note that recomposition never lengthens the text:
 * Any character consists of either one or two code units;
 * a composition may contain at most one more code unit than the original starter,
 * while the combining mark that is removed has at least one code unit.
 *
 * @param buffer              the reordering buffer whose tail is recomposed
 * @param recomposeStartIndex code unit index in the buffer where recomposition starts
 * @param onlyContiguous      if true (FCC), a non-combining character with ccc!=0
 *                            blocks composition of the starter with later characters
 */
void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
                                UBool onlyContiguous) const {
    UChar *p=buffer.getStart()+recomposeStartIndex;
    UChar *limit=buffer.getLimit();
    if(p==limit) {
        // Nothing to recompose.
        return;
    }

    // starter: start of the most recent forward-combining starter
    // pRemove: start of the code units to be removed
    // q, r: write and read pointers for moving text within the buffer
    UChar *starter, *pRemove, *q, *r;
    const uint16_t *compositionsList;
    UChar32 c, compositeAndFwd;
    uint16_t norm16;
    uint8_t cc, prevCC;
    UBool starterIsSupplementary;

    // Some of the following variables are not used until we have a forward-combining starter
    // and are only initialized now to avoid compiler warnings.
    compositionsList=NULL;  // used as indicator for whether we have a forward-combining starter
    starter=NULL;
    starterIsSupplementary=FALSE;
    prevCC=0;

    for(;;) {
        // Read the next code point c and its norm16; p moves past c.
        UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, p, limit, c, norm16);
        cc=getCCFromYesOrMaybe(norm16);
        if( // this character combines backward and
            isMaybe(norm16) &&
            // we have seen a starter that combines forward and
            compositionsList!=NULL &&
            // the backward-combining character is not blocked
            (prevCC<cc || prevCC==0)
        ) {
            if(isJamoVT(norm16)) {
                // c is a Jamo V/T, see if we can compose it with the previous character.
                if(c<Hangul::JAMO_T_BASE) {
                    // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
                    // The unsigned subtraction makes any starter below JAMO_L_BASE
                    // wrap around and fail the range test.
                    UChar prev=(UChar)(*starter-Hangul::JAMO_L_BASE);
                    if(prev<Hangul::JAMO_L_COUNT) {
                        pRemove=p-1;
                        UChar syllable=(UChar)
                            (Hangul::HANGUL_BASE+
                             (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
                             Hangul::JAMO_T_COUNT);
                        UChar t;
                        if(p!=limit && (t=(UChar)(*p-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
                            ++p;
                            syllable+=t;  // The next character was a Jamo T.
                        }
                        *starter=syllable;
                        // remove the Jamo V/T
                        q=pRemove;
                        r=p;
                        while(r<limit) {
                            *q++=*r++;
                        }
                        limit=q;
                        p=pRemove;
                    }
                }
                /*
                 * No "else" for Jamo T:
                 * Since the input is in NFD, there are no Hangul LV syllables that
                 * a Jamo T could combine with.
                 * All Jamo Ts are combined above when handling Jamo Vs.
                 */
                if(p==limit) {
                    break;
                }
                // Nothing further is composed with this starter.
                compositionsList=NULL;
                continue;
            } else if((compositeAndFwd=combine(compositionsList, c))>=0) {
                // The starter and the combining mark (c) do combine.
                UChar32 composite=compositeAndFwd>>1;

                // Replace the starter with the composite, remove the combining mark.
                pRemove=p-U16_LENGTH(c);  // pRemove & p: start & limit of the combining mark
                if(starterIsSupplementary) {
                    if(U_IS_SUPPLEMENTARY(composite)) {
                        // both are supplementary
                        starter[0]=U16_LEAD(composite);
                        starter[1]=U16_TRAIL(composite);
                    } else {
                        *starter=(UChar)composite;
                        // The composite is shorter than the starter,
                        // move the intermediate characters forward one.
                        starterIsSupplementary=FALSE;
                        q=starter+1;
                        r=q+1;
                        while(r<pRemove) {
                            *q++=*r++;
                        }
                        // One more code unit before the mark is now free.
                        --pRemove;
                    }
                } else if(U_IS_SUPPLEMENTARY(composite)) {
                    // The composite is longer than the starter,
                    // move the intermediate characters back one.
                    starterIsSupplementary=TRUE;
                    ++starter;  // temporarily increment for the loop boundary
                    q=pRemove;
                    r=++pRemove;
                    while(starter<q) {
                        *--r=*--q;
                    }
                    *starter=U16_TRAIL(composite);
                    *--starter=U16_LEAD(composite);  // undo the temporary increment
                } else {
                    // both are on the BMP
                    *starter=(UChar)composite;
                }

                /* remove the combining mark by moving the following text over it */
                // pRemove==p is possible when the longer composite
                // used up the single code unit of the combining mark.
                if(pRemove<p) {
                    q=pRemove;
                    r=p;
                    while(r<limit) {
                        *q++=*r++;
                    }
                    limit=q;
                    p=pRemove;
                }
                // Keep prevCC because we removed the combining mark.

                if(p==limit) {
                    break;
                }
                // Is the composite a starter that combines forward?
                if(compositeAndFwd&1) {
                    compositionsList=
                        getCompositionsListForComposite(getRawNorm16(composite));
                } else {
                    compositionsList=NULL;
                }

                // We combined; continue with looking for compositions.
                continue;
            }
        }

        // no combination this time
        prevCC=cc;
        if(p==limit) {
            break;
        }

        // If c did not combine, then check if it is a starter.
        if(cc==0) {
            // Found a new starter.
            if((compositionsList=getCompositionsListForDecompYes(norm16))!=NULL) {
                // It may combine with something, prepare for it.
                // p is already past c, so step back by the length of c.
                if(U_IS_BMP(c)) {
                    starterIsSupplementary=FALSE;
                    starter=p-1;
                } else {
                    starterIsSupplementary=TRUE;
                    starter=p-2;
                }
            }
        } else if(onlyContiguous) {
            // FCC: no discontiguous compositions; any intervening character blocks.
            compositionsList=NULL;
        }
    }
    // Truncate the buffer to the recomposed text.
    buffer.setReorderingLimit(limit);
}
1386
1387 UChar32
composePair(UChar32 a,UChar32 b) const1388 Normalizer2Impl::composePair(UChar32 a, UChar32 b) const {
1389 uint16_t norm16=getNorm16(a); // maps an out-of-range 'a' to inert norm16
1390 const uint16_t *list;
1391 if(isInert(norm16)) {
1392 return U_SENTINEL;
1393 } else if(norm16<minYesNoMappingsOnly) {
1394 // a combines forward.
1395 if(isJamoL(norm16)) {
1396 b-=Hangul::JAMO_V_BASE;
1397 if(0<=b && b<Hangul::JAMO_V_COUNT) {
1398 return
1399 (Hangul::HANGUL_BASE+
1400 ((a-Hangul::JAMO_L_BASE)*Hangul::JAMO_V_COUNT+b)*
1401 Hangul::JAMO_T_COUNT);
1402 } else {
1403 return U_SENTINEL;
1404 }
1405 } else if(isHangulLV(norm16)) {
1406 b-=Hangul::JAMO_T_BASE;
1407 if(0<b && b<Hangul::JAMO_T_COUNT) { // not b==0!
1408 return a+b;
1409 } else {
1410 return U_SENTINEL;
1411 }
1412 } else {
1413 // 'a' has a compositions list in extraData
1414 list=getMapping(norm16);
1415 if(norm16>minYesNo) { // composite 'a' has both mapping & compositions list
1416 list+= // mapping pointer
1417 1+ // +1 to skip the first unit with the mapping length
1418 (*list&MAPPING_LENGTH_MASK); // + mapping length
1419 }
1420 }
1421 } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) {
1422 return U_SENTINEL;
1423 } else {
1424 list=getCompositionsListForMaybe(norm16);
1425 }
1426 if(b<0 || 0x10ffff<b) { // combine(list, b) requires a valid code point b
1427 return U_SENTINEL;
1428 }
1429 #if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC
1430 return combine(list, b)>>1;
1431 #else
1432 int32_t compositeAndFwd=combine(list, b);
1433 return compositeAndFwd>=0 ? compositeAndFwd>>1 : U_SENTINEL;
1434 #endif
1435 }
1436
// Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
// doCompose: normalize
// !doCompose: isNormalized (buffer must be empty and initialized)
//
// Composes (or checks) the UTF-16 text [src, limit[.
//
// src, limit:     the input text; limit may be NULL for NUL-terminated input
// onlyContiguous: FCC-style contiguous composition
// doCompose:      if true, the normalized text is appended to buffer;
//                 if false, the function only tests whether the text is normalized
//                 and uses buffer as scratch space on the slow path
// buffer:         destination / scratch reordering buffer
// errorCode:      ICU error code
//
// Returns FALSE when !doCompose and the text is found not to be normalized,
// or when the initial low-prefix copy fails; otherwise TRUE.
// Note that a failed buffer append leaves the main loop and still returns TRUE;
// callers presumably have to check errorCode as well -- TODO confirm.
//
// Each iteration of the main loop has three stages:
// a fast path that skips characters needing no work,
// a medium-fast path for characters that can be handled without looking at
// surrounding text, and a slow path that decomposes and recomposes
// the text between the nearest boundaries.
UBool
Normalizer2Impl::compose(const UChar *src, const UChar *limit,
                         UBool onlyContiguous,
                         UBool doCompose,
                         ReorderingBuffer &buffer,
                         UErrorCode &errorCode) const {
    // [prevBoundary, src[ is text that is known to need no change
    // but has not been appended to the buffer yet.
    const UChar *prevBoundary=src;
    UChar32 minNoMaybeCP=minCompNoMaybeCP;
    if(limit==NULL) {
        src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP,
                                           doCompose ? &buffer : NULL,
                                           errorCode);
        if(U_FAILURE(errorCode)) {
            return FALSE;
        }
        limit=u_strchr(src, 0);
        if (prevBoundary != src) {
            if (hasCompBoundaryAfter(*(src-1), onlyContiguous)) {
                prevBoundary = src;
            } else {
                // The last prefix character may interact with what follows:
                // take it back out of the buffer and re-scan it below.
                buffer.removeSuffix(1);
                prevBoundary = --src;
            }
        }
    }

    for (;;) {
        // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
        // or with (compYes && ccc==0) properties.
        const UChar *prevSrc;  // start of the current character c
        UChar32 c = 0;
        uint16_t norm16 = 0;
        for (;;) {
            if (src == limit) {
                // End of input: flush the pending unchanged text.
                if (prevBoundary != limit && doCompose) {
                    buffer.appendZeroCC(prevBoundary, limit, errorCode);
                }
                return TRUE;
            }
            if( (c=*src)<minNoMaybeCP ||
                isCompYesAndZeroCC(norm16=UCPTRIE_FAST_BMP_GET(normTrie, UCPTRIE_16, c))
            ) {
                ++src;
            } else {
                prevSrc = src++;
                if(!U16_IS_LEAD(c)) {
                    break;
                } else {
                    // Lead surrogate: look up the supplementary code point if a trail follows.
                    UChar c2;
                    if(src!=limit && U16_IS_TRAIL(c2=*src)) {
                        ++src;
                        c=U16_GET_SUPPLEMENTARY(c, c2);
                        norm16=UCPTRIE_FAST_SUPP_GET(normTrie, UCPTRIE_16, c);
                        if(!isCompYesAndZeroCC(norm16)) {
                            break;
                        }
                    }
                    // An unpaired lead surrogate is simply skipped.
                }
            }
        }
        // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
        // The current character is either a "noNo" (has a mapping)
        // or a "maybeYes" (combines backward)
        // or a "yesYes" with ccc!=0.
        // It is not a Hangul syllable or Jamo L because those have "yes" properties.

        // Medium-fast path: Handle cases that do not require full decomposition and recomposition.
        if (!isMaybeOrNonZeroCC(norm16)) {  // minNoNo <= norm16 < minMaybeYes
            if (!doCompose) {
                // A character with a mapping: the text is not normalized.
                return FALSE;
            }
            // Fast path for mapping a character that is immediately surrounded by boundaries.
            // In this case, we need not decompose around the current character.
            if (isDecompNoAlgorithmic(norm16)) {
                // Maps to a single isCompYesAndZeroCC character
                // which also implies hasCompBoundaryBefore.
                if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
                        hasCompBoundaryBefore(src, limit)) {
                    if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
                        break;
                    }
                    if(!buffer.append(mapAlgorithmic(c, norm16), 0, errorCode)) {
                        break;
                    }
                    prevBoundary = src;
                    continue;
                }
            } else if (norm16 < minNoNoCompBoundaryBefore) {
                // The mapping is comp-normalized which also implies hasCompBoundaryBefore.
                if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
                        hasCompBoundaryBefore(src, limit)) {
                    if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
                        break;
                    }
                    // The first unit of the mapping holds its length; the text follows.
                    const UChar *mapping = reinterpret_cast<const UChar *>(getMapping(norm16));
                    int32_t length = *mapping++ & MAPPING_LENGTH_MASK;
                    if(!buffer.appendZeroCC(mapping, mapping + length, errorCode)) {
                        break;
                    }
                    prevBoundary = src;
                    continue;
                }
            } else if (norm16 >= minNoNoEmpty) {
                // The current character maps to nothing.
                // Simply omit it from the output if there is a boundary before _or_ after it.
                // The character itself implies no boundaries.
                if (hasCompBoundaryBefore(src, limit) ||
                        hasCompBoundaryAfter(prevBoundary, prevSrc, onlyContiguous)) {
                    if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
                        break;
                    }
                    prevBoundary = src;
                    continue;
                }
            }
            // Other "noNo" type, or need to examine more text around this character:
            // Fall through to the slow path.
        } else if (isJamoVT(norm16) && prevBoundary != prevSrc) {
            UChar prev=*(prevSrc-1);
            if(c<Hangul::JAMO_T_BASE) {
                // The current character is a Jamo Vowel,
                // compose with previous Jamo L and following Jamo T.
                UChar l = (UChar)(prev-Hangul::JAMO_L_BASE);
                if(l<Hangul::JAMO_L_COUNT) {
                    if (!doCompose) {
                        return FALSE;
                    }
                    // t: Jamo T index to add to the syllable, 0 for none,
                    // or -1 if the slow path is needed.
                    int32_t t;
                    if (src != limit &&
                            0 < (t = ((int32_t)*src - Hangul::JAMO_T_BASE)) &&
                            t < Hangul::JAMO_T_COUNT) {
                        // The next character is a Jamo T.
                        ++src;
                    } else if (hasCompBoundaryBefore(src, limit)) {
                        // No Jamo T follows, not even via decomposition.
                        t = 0;
                    } else {
                        t = -1;
                    }
                    if (t >= 0) {
                        UChar32 syllable = Hangul::HANGUL_BASE +
                            (l*Hangul::JAMO_V_COUNT + (c-Hangul::JAMO_V_BASE)) *
                            Hangul::JAMO_T_COUNT + t;
                        --prevSrc;  // Replace the Jamo L as well.
                        if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
                            break;
                        }
                        if(!buffer.appendBMP((UChar)syllable, 0, errorCode)) {
                            break;
                        }
                        prevBoundary = src;
                        continue;
                    }
                    // If we see L+V+x where x!=T then we drop to the slow path,
                    // decompose and recompose.
                    // This is to deal with NFKC finding normal L and V but a
                    // compatibility variant of a T.
                    // We need to either fully compose that combination here
                    // (which would complicate the code and may not work with strange custom data)
                    // or use the slow path.
                }
            } else if (Hangul::isHangulLV(prev)) {
                // The current character is a Jamo Trailing consonant,
                // compose with previous Hangul LV that does not contain a Jamo T.
                if (!doCompose) {
                    return FALSE;
                }
                UChar32 syllable = prev + c - Hangul::JAMO_T_BASE;
                --prevSrc;  // Replace the Hangul LV as well.
                if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
                    break;
                }
                if(!buffer.appendBMP((UChar)syllable, 0, errorCode)) {
                    break;
                }
                prevBoundary = src;
                continue;
            }
            // No matching context, or may need to decompose surrounding text first:
            // Fall through to the slow path.
        } else if (norm16 > JAMO_VT) {  // norm16 >= MIN_YES_YES_WITH_CC
            // One or more combining marks that do not combine-back:
            // Check for canonical order, copy unchanged if ok and
            // if followed by a character with a boundary-before.
            uint8_t cc = getCCFromNormalYesOrMaybe(norm16);  // cc!=0
            if (onlyContiguous /* FCC */ && getPreviousTrailCC(prevBoundary, prevSrc) > cc) {
                // Fails FCD test, need to decompose and contiguously recompose.
                if (!doCompose) {
                    return FALSE;
                }
            } else {
                // If !onlyContiguous (not FCC), then we ignore the tccc of
                // the previous character which passed the quick check "yes && ccc==0" test.
                const UChar *nextSrc;
                uint16_t n16;
                // Scan over further non-combining marks while they are in canonical order.
                for (;;) {
                    if (src == limit) {
                        if (doCompose) {
                            buffer.appendZeroCC(prevBoundary, limit, errorCode);
                        }
                        return TRUE;
                    }
                    uint8_t prevCC = cc;
                    nextSrc = src;
                    UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, nextSrc, limit, c, n16);
                    if (n16 >= MIN_YES_YES_WITH_CC) {
                        cc = getCCFromNormalYesOrMaybe(n16);
                        if (prevCC > cc) {
                            // Out of canonical order.
                            if (!doCompose) {
                                return FALSE;
                            }
                            break;
                        }
                    } else {
                        break;
                    }
                    src = nextSrc;
                }
                // src is after the last in-order combining mark.
                // If there is a boundary here, then we continue with no change.
                if (norm16HasCompBoundaryBefore(n16)) {
                    if (isCompYesAndZeroCC(n16)) {
                        // The following character needs no work either: skip it right away.
                        src = nextSrc;
                    }
                    continue;
                }
                // Use the slow path. There is no boundary in [prevSrc, src[.
            }
        }

        // Slow path: Find the nearest boundaries around the current character,
        // decompose and recompose.
        if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) {
            // Include the preceding character unless there is a boundary after it.
            const UChar *p = prevSrc;
            UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, prevBoundary, p, c, norm16);
            if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
                prevSrc = p;
            }
        }
        if (doCompose && prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
            break;
        }
        int32_t recomposeStartIndex=buffer.length();
        // We know there is not a boundary here.
        decomposeShort(prevSrc, src, FALSE /* !stopAtCompBoundary */, onlyContiguous,
                       buffer, errorCode);
        // Decompose until the next boundary.
        src = decomposeShort(src, limit, TRUE /* stopAtCompBoundary */, onlyContiguous,
                             buffer, errorCode);
        if (U_FAILURE(errorCode)) {
            break;
        }
        if ((src - prevSrc) > INT32_MAX) {  // guard before buffer.equals()
            errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
            return TRUE;
        }
        recompose(buffer, recomposeStartIndex, onlyContiguous);
        if(!doCompose) {
            // isNormalized: the recomposed text must equal the input segment.
            if(!buffer.equals(prevSrc, src)) {
                return FALSE;
            }
            // The buffer was only scratch space; empty it for the next segment.
            buffer.remove();
        }
        prevBoundary=src;
    }
    // Reached via break after a failed buffer operation.
    return TRUE;
}
1707
// Very similar to compose(): Make the same changes in both places if relevant.
// pQCResult==NULL: spanQuickCheckYes
// pQCResult!=NULL: quickCheck (*pQCResult must be UNORM_YES)
//
// Scans the UTF-16 text [src, limit[ without modifying or copying it.
//
// src, limit:     the input text; limit may be NULL for NUL-terminated input
// onlyContiguous: FCC-style check which also considers the trail ccc of the
//                 character before a combining mark
// pQCResult:      if not NULL, receives UNORM_MAYBE when a "maybe" character is seen
//                 (scanning then continues) and UNORM_NO when the scan stops at a
//                 character that fails the check
//
// Returns limit if the scan reaches the end of the text;
// otherwise the last boundary before the character where the scan stopped.
// With pQCResult==NULL the scan also stops at the first "maybe" character.
const UChar *
Normalizer2Impl::composeQuickCheck(const UChar *src, const UChar *limit,
                                   UBool onlyContiguous,
                                   UNormalizationCheckResult *pQCResult) const {
    const UChar *prevBoundary=src;
    UChar32 minNoMaybeCP=minCompNoMaybeCP;
    if(limit==NULL) {
        // No buffer is passed, so the call is only used to skip the low prefix;
        // the local errorCode is not inspected afterwards.
        UErrorCode errorCode=U_ZERO_ERROR;
        src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, NULL, errorCode);
        limit=u_strchr(src, 0);
        if (prevBoundary != src) {
            if (hasCompBoundaryAfter(*(src-1), onlyContiguous)) {
                prevBoundary = src;
            } else {
                // Re-scan the last prefix character together with what follows.
                prevBoundary = --src;
            }
        }
    }

    for(;;) {
        // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
        // or with (compYes && ccc==0) properties.
        const UChar *prevSrc;  // start of the current character c
        UChar32 c = 0;
        uint16_t norm16 = 0;
        for (;;) {
            if(src==limit) {
                return src;
            }
            if( (c=*src)<minNoMaybeCP ||
                isCompYesAndZeroCC(norm16=UCPTRIE_FAST_BMP_GET(normTrie, UCPTRIE_16, c))
            ) {
                ++src;
            } else {
                prevSrc = src++;
                if(!U16_IS_LEAD(c)) {
                    break;
                } else {
                    // Lead surrogate: look up the supplementary code point if a trail follows.
                    UChar c2;
                    if(src!=limit && U16_IS_TRAIL(c2=*src)) {
                        ++src;
                        c=U16_GET_SUPPLEMENTARY(c, c2);
                        norm16=UCPTRIE_FAST_SUPP_GET(normTrie, UCPTRIE_16, c);
                        if(!isCompYesAndZeroCC(norm16)) {
                            break;
                        }
                    }
                }
            }
        }
        // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
        // The current character is either a "noNo" (has a mapping)
        // or a "maybeYes" (combines backward)
        // or a "yesYes" with ccc!=0.
        // It is not a Hangul syllable or Jamo L because those have "yes" properties.

        // Move prevBoundary to the nearest boundary at or before the current character.
        // prevNorm16 stays INERT unless the preceding character has no boundary after it.
        uint16_t prevNorm16 = INERT;
        if (prevBoundary != prevSrc) {
            if (norm16HasCompBoundaryBefore(norm16)) {
                prevBoundary = prevSrc;
            } else {
                const UChar *p = prevSrc;
                uint16_t n16;
                UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, prevBoundary, p, c, n16);
                if (norm16HasCompBoundaryAfter(n16, onlyContiguous)) {
                    prevBoundary = prevSrc;
                } else {
                    prevBoundary = p;
                    prevNorm16 = n16;
                }
            }
        }

        if(isMaybeOrNonZeroCC(norm16)) {
            uint8_t cc=getCCFromYesOrMaybe(norm16);
            if (onlyContiguous /* FCC */ && cc != 0 &&
                    getTrailCCFromCompYesAndZeroCC(prevNorm16) > cc) {
                // The [prevBoundary..prevSrc[ character
                // passed the quick check "yes && ccc==0" test
                // but is out of canonical order with the current combining mark.
                // Falls through to the "no" result below.
            } else {
                // If !onlyContiguous (not FCC), then we ignore the tccc of
                // the previous character which passed the quick check "yes && ccc==0" test.
                const UChar *nextSrc;
                // Scan over further "maybe" or ccc!=0 characters while they are in order.
                for (;;) {
                    if (norm16 < MIN_YES_YES_WITH_CC) {
                        // A "maybe" character: record it, or stop the span here.
                        if (pQCResult != nullptr) {
                            *pQCResult = UNORM_MAYBE;
                        } else {
                            return prevBoundary;
                        }
                    }
                    if (src == limit) {
                        return src;
                    }
                    uint8_t prevCC = cc;
                    nextSrc = src;
                    UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, nextSrc, limit, c, norm16);
                    if (isMaybeOrNonZeroCC(norm16)) {
                        cc = getCCFromYesOrMaybe(norm16);
                        if (!(prevCC <= cc || cc == 0)) {
                            // Out of canonical order.
                            break;
                        }
                    } else {
                        break;
                    }
                    src = nextSrc;
                }
                // src is after the last in-order combining mark.
                if (isCompYesAndZeroCC(norm16)) {
                    // The following character passes as well: continue scanning after it.
                    prevBoundary = src;
                    src = nextSrc;
                    continue;
                }
            }
        }
        // The current character (or the one that ended the scan above) fails the check.
        if(pQCResult!=NULL) {
            *pQCResult=UNORM_NO;
        }
        return prevBoundary;
    }
}
1833
composeAndAppend(const UChar * src,const UChar * limit,UBool doCompose,UBool onlyContiguous,UnicodeString & safeMiddle,ReorderingBuffer & buffer,UErrorCode & errorCode) const1834 void Normalizer2Impl::composeAndAppend(const UChar *src, const UChar *limit,
1835 UBool doCompose,
1836 UBool onlyContiguous,
1837 UnicodeString &safeMiddle,
1838 ReorderingBuffer &buffer,
1839 UErrorCode &errorCode) const {
1840 if(!buffer.isEmpty()) {
1841 const UChar *firstStarterInSrc=findNextCompBoundary(src, limit, onlyContiguous);
1842 if(src!=firstStarterInSrc) {
1843 const UChar *lastStarterInDest=findPreviousCompBoundary(buffer.getStart(),
1844 buffer.getLimit(), onlyContiguous);
1845 int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastStarterInDest);
1846 UnicodeString middle(lastStarterInDest, destSuffixLength);
1847 buffer.removeSuffix(destSuffixLength);
1848 safeMiddle=middle;
1849 middle.append(src, (int32_t)(firstStarterInSrc-src));
1850 const UChar *middleStart=middle.getBuffer();
1851 compose(middleStart, middleStart+middle.length(), onlyContiguous,
1852 TRUE, buffer, errorCode);
1853 if(U_FAILURE(errorCode)) {
1854 return;
1855 }
1856 src=firstStarterInSrc;
1857 }
1858 }
1859 if(doCompose) {
1860 compose(src, limit, onlyContiguous, TRUE, buffer, errorCode);
1861 } else {
1862 if(limit==NULL) { // appendZeroCC() needs limit!=NULL
1863 limit=u_strchr(src, 0);
1864 }
1865 buffer.appendZeroCC(src, limit, errorCode);
1866 }
1867 }
1868
/**
 * Composes the UTF-8 text in [src, limit[, or only checks it when no sink is given.
 *
 * With sink!=nullptr, the result is written to the sink: stretches that need no
 * change go through ByteSinkUtil::appendUnchanged(), replacements through
 * ByteSinkUtil::appendChange()/appendCodePoint(); the optional edits object
 * is passed along to those calls.
 * With sink==nullptr, nothing is written and the function returns FALSE as soon as
 * it finds text that would have to be changed.
 *
 * @param options passed through to ByteSinkUtil::appendUnchanged()
 * @param onlyContiguous passed to the boundary tests and to recompose();
 *        the code below labels the TRUE case "FCC"
 * @param src start of the input
 * @param limit end of the input; must not be nullptr (asserted)
 * @param sink output sink, or nullptr for check-only mode
 * @param edits optional edits recorder, may be nullptr
 * @param errorCode in/out error code
 * @return FALSE only in check-only mode when a change would be needed;
 *         TRUE otherwise, including when the loop stops early because
 *         an append failed or errorCode was set, so callers must check errorCode
 */
UBool
Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous,
                             const uint8_t *src, const uint8_t *limit,
                             ByteSink *sink, Edits *edits, UErrorCode &errorCode) const {
    U_ASSERT(limit != nullptr);
    // UTF-16 scratch string used by the ReorderingBuffer on the slow path.
    UnicodeString s16;
    uint8_t minNoMaybeLead = leadByteForCP(minCompNoMaybeCP);
    // Start of the input that has not yet been written to the sink.
    const uint8_t *prevBoundary = src;

    for (;;) {
        // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
        // or with (compYes && ccc==0) properties.
        const uint8_t *prevSrc;
        uint16_t norm16 = 0;
        for (;;) {
            if (src == limit) {
                if (prevBoundary != limit && sink != nullptr) {
                    ByteSinkUtil::appendUnchanged(prevBoundary, limit,
                                                  *sink, options, edits, errorCode);
                }
                return TRUE;
            }
            if (*src < minNoMaybeLead) {
                // Byte below the lead byte for minCompNoMaybeCP: skip it without a trie lookup.
                ++src;
            } else {
                prevSrc = src;
                UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, norm16);
                if (!isCompYesAndZeroCC(norm16)) {
                    break;
                }
            }
        }
        // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
        // The current character is either a "noNo" (has a mapping)
        // or a "maybeYes" (combines backward)
        // or a "yesYes" with ccc!=0.
        // It is not a Hangul syllable or Jamo L because those have "yes" properties.

        // Medium-fast path: Handle cases that do not require full decomposition and recomposition.
        if (!isMaybeOrNonZeroCC(norm16)) {  // minNoNo <= norm16 < minMaybeYes
            if (sink == nullptr) {
                return FALSE;
            }
            // Fast path for mapping a character that is immediately surrounded by boundaries.
            // In this case, we need not decompose around the current character.
            if (isDecompNoAlgorithmic(norm16)) {
                // Maps to a single isCompYesAndZeroCC character
                // which also implies hasCompBoundaryBefore.
                if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
                        hasCompBoundaryBefore(src, limit)) {
                    if (prevBoundary != prevSrc &&
                            !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
                                                           *sink, options, edits, errorCode)) {
                        break;
                    }
                    appendCodePointDelta(prevSrc, src, getAlgorithmicDelta(norm16), *sink, edits);
                    prevBoundary = src;
                    continue;
                }
            } else if (norm16 < minNoNoCompBoundaryBefore) {
                // The mapping is comp-normalized which also implies hasCompBoundaryBefore.
                if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
                        hasCompBoundaryBefore(src, limit)) {
                    if (prevBoundary != prevSrc &&
                            !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
                                                           *sink, options, edits, errorCode)) {
                        break;
                    }
                    // The first unit holds the mapping length; the mapping text follows it.
                    const uint16_t *mapping = getMapping(norm16);
                    int32_t length = *mapping++ & MAPPING_LENGTH_MASK;
                    if (!ByteSinkUtil::appendChange(prevSrc, src, (const UChar *)mapping, length,
                                                    *sink, edits, errorCode)) {
                        break;
                    }
                    prevBoundary = src;
                    continue;
                }
            } else if (norm16 >= minNoNoEmpty) {
                // The current character maps to nothing.
                // Simply omit it from the output if there is a boundary before _or_ after it.
                // The character itself implies no boundaries.
                if (hasCompBoundaryBefore(src, limit) ||
                        hasCompBoundaryAfter(prevBoundary, prevSrc, onlyContiguous)) {
                    if (prevBoundary != prevSrc &&
                            !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
                                                           *sink, options, edits, errorCode)) {
                        break;
                    }
                    if (edits != nullptr) {
                        edits->addReplace((int32_t)(src - prevSrc), 0);
                    }
                    prevBoundary = src;
                    continue;
                }
            }
            // Other "noNo" type, or need to examine more text around this character:
            // Fall through to the slow path.
        } else if (isJamoVT(norm16)) {
            // Jamo L: E1 84 80..92
            // Jamo V: E1 85 A1..B5
            // Jamo T: E1 86 A8..E1 87 82
            U_ASSERT((src - prevSrc) == 3 && *prevSrc == 0xe1);
            UChar32 prev = previousHangulOrJamo(prevBoundary, prevSrc);
            if (prevSrc[1] == 0x85) {
                // The current character is a Jamo Vowel,
                // compose with previous Jamo L and following Jamo T.
                UChar32 l = prev - Hangul::JAMO_L_BASE;
                if ((uint32_t)l < Hangul::JAMO_L_COUNT) {
                    if (sink == nullptr) {
                        return FALSE;
                    }
                    int32_t t = getJamoTMinusBase(src, limit);
                    if (t >= 0) {
                        // The next character is a Jamo T.
                        src += 3;
                    } else if (hasCompBoundaryBefore(src, limit)) {
                        // No Jamo T follows, not even via decomposition.
                        t = 0;
                    }
                    if (t >= 0) {
                        UChar32 syllable = Hangul::HANGUL_BASE +
                            (l*Hangul::JAMO_V_COUNT + (prevSrc[2]-0xa1)) *
                            Hangul::JAMO_T_COUNT + t;
                        prevSrc -= 3;  // Replace the Jamo L as well.
                        if (prevBoundary != prevSrc &&
                                !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
                                                               *sink, options, edits, errorCode)) {
                            break;
                        }
                        ByteSinkUtil::appendCodePoint(prevSrc, src, syllable, *sink, edits);
                        prevBoundary = src;
                        continue;
                    }
                    // If we see L+V+x where x!=T then we drop to the slow path,
                    // decompose and recompose.
                    // This is to deal with NFKC finding normal L and V but a
                    // compatibility variant of a T.
                    // We need to either fully compose that combination here
                    // (which would complicate the code and may not work with strange custom data)
                    // or use the slow path.
                }
            } else if (Hangul::isHangulLV(prev)) {
                // The current character is a Jamo Trailing consonant,
                // compose with previous Hangul LV that does not contain a Jamo T.
                if (sink == nullptr) {
                    return FALSE;
                }
                UChar32 syllable = prev + getJamoTMinusBase(prevSrc, src);
                prevSrc -= 3;  // Replace the Hangul LV as well.
                if (prevBoundary != prevSrc &&
                        !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
                                                       *sink, options, edits, errorCode)) {
                    break;
                }
                ByteSinkUtil::appendCodePoint(prevSrc, src, syllable, *sink, edits);
                prevBoundary = src;
                continue;
            }
            // No matching context, or may need to decompose surrounding text first:
            // Fall through to the slow path.
        } else if (norm16 > JAMO_VT) {  // norm16 >= MIN_YES_YES_WITH_CC
            // One or more combining marks that do not combine-back:
            // Check for canonical order, copy unchanged if ok and
            // if followed by a character with a boundary-before.
            uint8_t cc = getCCFromNormalYesOrMaybe(norm16);  // cc!=0
            if (onlyContiguous /* FCC */ && getPreviousTrailCC(prevBoundary, prevSrc) > cc) {
                // Fails FCD test, need to decompose and contiguously recompose.
                if (sink == nullptr) {
                    return FALSE;
                }
            } else {
                // If !onlyContiguous (not FCC), then we ignore the tccc of
                // the previous character which passed the quick check "yes && ccc==0" test.
                const uint8_t *nextSrc;
                uint16_t n16;
                for (;;) {
                    if (src == limit) {
                        if (sink != nullptr) {
                            ByteSinkUtil::appendUnchanged(prevBoundary, limit,
                                                          *sink, options, edits, errorCode);
                        }
                        return TRUE;
                    }
                    uint8_t prevCC = cc;
                    // Look ahead with nextSrc; src only advances over marks that are in order.
                    nextSrc = src;
                    UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, nextSrc, limit, n16);
                    if (n16 >= MIN_YES_YES_WITH_CC) {
                        cc = getCCFromNormalYesOrMaybe(n16);
                        if (prevCC > cc) {
                            if (sink == nullptr) {
                                return FALSE;
                            }
                            break;
                        }
                    } else {
                        break;
                    }
                    src = nextSrc;
                }
                // src is after the last in-order combining mark.
                // If there is a boundary here, then we continue with no change.
                if (norm16HasCompBoundaryBefore(n16)) {
                    if (isCompYesAndZeroCC(n16)) {
                        src = nextSrc;
                    }
                    continue;
                }
                // Use the slow path. There is no boundary in [prevSrc, src[.
            }
        }

        // Slow path: Find the nearest boundaries around the current character,
        // decompose and recompose.
        if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) {
            // Include the preceding character unless it has a boundary after it.
            const uint8_t *p = prevSrc;
            UCPTRIE_FAST_U8_PREV(normTrie, UCPTRIE_16, prevBoundary, p, norm16);
            if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
                prevSrc = p;
            }
        }
        ReorderingBuffer buffer(*this, s16, errorCode);
        if (U_FAILURE(errorCode)) {
            break;
        }
        // We know there is not a boundary here.
        decomposeShort(prevSrc, src, STOP_AT_LIMIT, onlyContiguous,
                       buffer, errorCode);
        // Decompose until the next boundary.
        src = decomposeShort(src, limit, STOP_AT_COMP_BOUNDARY, onlyContiguous,
                             buffer, errorCode);
        if (U_FAILURE(errorCode)) {
            break;
        }
        if ((src - prevSrc) > INT32_MAX) {  // guard before buffer.equals()
            errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
            return TRUE;
        }
        recompose(buffer, 0, onlyContiguous);
        // Only emit a change if the recomposed text differs from the input piece.
        if (!buffer.equals(prevSrc, src)) {
            if (sink == nullptr) {
                return FALSE;
            }
            if (prevBoundary != prevSrc &&
                    !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
                                                   *sink, options, edits, errorCode)) {
                break;
            }
            if (!ByteSinkUtil::appendChange(prevSrc, src, buffer.getStart(), buffer.length(),
                                            *sink, edits, errorCode)) {
                break;
            }
            prevBoundary = src;
        }
    }
    // Reached only via break: an append call failed or errorCode indicates a failure.
    return TRUE;
}
2125
hasCompBoundaryBefore(const UChar * src,const UChar * limit) const2126 UBool Normalizer2Impl::hasCompBoundaryBefore(const UChar *src, const UChar *limit) const {
2127 if (src == limit || *src < minCompNoMaybeCP) {
2128 return TRUE;
2129 }
2130 UChar32 c;
2131 uint16_t norm16;
2132 UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, src, limit, c, norm16);
2133 return norm16HasCompBoundaryBefore(norm16);
2134 }
2135
hasCompBoundaryBefore(const uint8_t * src,const uint8_t * limit) const2136 UBool Normalizer2Impl::hasCompBoundaryBefore(const uint8_t *src, const uint8_t *limit) const {
2137 if (src == limit) {
2138 return TRUE;
2139 }
2140 uint16_t norm16;
2141 UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, norm16);
2142 return norm16HasCompBoundaryBefore(norm16);
2143 }
2144
hasCompBoundaryAfter(const UChar * start,const UChar * p,UBool onlyContiguous) const2145 UBool Normalizer2Impl::hasCompBoundaryAfter(const UChar *start, const UChar *p,
2146 UBool onlyContiguous) const {
2147 if (start == p) {
2148 return TRUE;
2149 }
2150 UChar32 c;
2151 uint16_t norm16;
2152 UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, start, p, c, norm16);
2153 return norm16HasCompBoundaryAfter(norm16, onlyContiguous);
2154 }
2155
hasCompBoundaryAfter(const uint8_t * start,const uint8_t * p,UBool onlyContiguous) const2156 UBool Normalizer2Impl::hasCompBoundaryAfter(const uint8_t *start, const uint8_t *p,
2157 UBool onlyContiguous) const {
2158 if (start == p) {
2159 return TRUE;
2160 }
2161 uint16_t norm16;
2162 UCPTRIE_FAST_U8_PREV(normTrie, UCPTRIE_16, start, p, norm16);
2163 return norm16HasCompBoundaryAfter(norm16, onlyContiguous);
2164 }
2165
/**
 * Walks backward from p toward start and returns the nearest composition boundary:
 * the position after a code point that has a boundary after it,
 * or the position before a code point that has a boundary before it,
 * or start if neither is found.
 */
const UChar *Normalizer2Impl::findPreviousCompBoundary(const UChar *start, const UChar *p,
                                                       UBool onlyContiguous) const {
    while (p != start) {
        const UChar *const afterCodePoint = p;
        UChar32 cp;
        uint16_t value;
        UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, start, p, cp, value);
        if (norm16HasCompBoundaryAfter(value, onlyContiguous)) {
            // The boundary is behind the code point that was just read.
            p = afterCodePoint;
            break;
        }
        if (hasCompBoundaryBefore(cp, value)) {
            // The boundary is in front of the code point that was just read.
            break;
        }
    }
    return p;
}
2182
/**
 * Walks forward from p toward limit and returns the nearest composition boundary:
 * the position before a code point that has a boundary before it,
 * or the position after a code point that has a boundary after it,
 * or limit if neither is found.
 */
const UChar *Normalizer2Impl::findNextCompBoundary(const UChar *p, const UChar *limit,
                                                   UBool onlyContiguous) const {
    while (p != limit) {
        const UChar *const beforeCodePoint = p;
        UChar32 cp;
        uint16_t value;
        UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, p, limit, cp, value);
        if (hasCompBoundaryBefore(cp, value)) {
            // The boundary is in front of the code point that was just read.
            p = beforeCodePoint;
            break;
        }
        if (norm16HasCompBoundaryAfter(value, onlyContiguous)) {
            // The boundary is behind the code point that was just read.
            break;
        }
    }
    return p;
}
2199
getPreviousTrailCC(const UChar * start,const UChar * p) const2200 uint8_t Normalizer2Impl::getPreviousTrailCC(const UChar *start, const UChar *p) const {
2201 if (start == p) {
2202 return 0;
2203 }
2204 int32_t i = (int32_t)(p - start);
2205 UChar32 c;
2206 U16_PREV(start, 0, i, c);
2207 return (uint8_t)getFCD16(c);
2208 }
2209
/**
 * Returns the low byte of the FCD16 value of the UTF-8 encoded code point that ends at p,
 * or 0 if the range [start, p[ is empty.
 */
uint8_t Normalizer2Impl::getPreviousTrailCC(const uint8_t *start, const uint8_t *p) const {
    if (p != start) {
        int32_t index = static_cast<int32_t>(p - start);
        UChar32 prevCodePoint;
        U8_PREV(start, 0, index, prevCodePoint);
        return static_cast<uint8_t>(getFCD16(prevCodePoint));
    }
    return 0;
}
2219
2220 // Note: normalizer2impl.cpp r30982 (2011-nov-27)
2221 // still had getFCDTrie() which built and cached an FCD trie.
2222 // That provided faster access to FCD data than getFCD16FromNormData()
2223 // but required synchronization and consumed some 10kB of heap memory
2224 // in any process that uses FCD (e.g., via collation).
2225 // minDecompNoCP etc. and smallFCD[] are intended to help with any loss of performance,
2226 // at least for ASCII & CJK.
2227
2228 // Ticket 20907 - The optimizer in MSVC/Visual Studio versions below 16.4 has trouble with this
2229 // function on Windows ARM64. As a work-around, we disable optimizations for this function.
2230 // This work-around could/should be removed once the following versions of Visual Studio are no
2231 // longer supported: All versions of VS2017, and versions of VS2019 below 16.4.
2232 #if (defined(_MSC_VER) && (defined(_M_ARM64)) && (_MSC_VER < 1924))
2233 #pragma optimize( "", off )
2234 #endif
2235 // Gets the FCD value from the regular normalization data.
getFCD16FromNormData(UChar32 c) const2236 uint16_t Normalizer2Impl::getFCD16FromNormData(UChar32 c) const {
2237 uint16_t norm16=getNorm16(c);
2238 if (norm16 >= limitNoNo) {
2239 if(norm16>=MIN_NORMAL_MAYBE_YES) {
2240 // combining mark
2241 norm16=getCCFromNormalYesOrMaybe(norm16);
2242 return norm16|(norm16<<8);
2243 } else if(norm16>=minMaybeYes) {
2244 return 0;
2245 } else { // isDecompNoAlgorithmic(norm16)
2246 uint16_t deltaTrailCC = norm16 & DELTA_TCCC_MASK;
2247 if (deltaTrailCC <= DELTA_TCCC_1) {
2248 return deltaTrailCC >> OFFSET_SHIFT;
2249 }
2250 // Maps to an isCompYesAndZeroCC.
2251 c=mapAlgorithmic(c, norm16);
2252 norm16=getRawNorm16(c);
2253 }
2254 }
2255 if(norm16<=minYesNo || isHangulLVT(norm16)) {
2256 // no decomposition or Hangul syllable, all zeros
2257 return 0;
2258 }
2259 // c decomposes, get everything from the variable-length extra data
2260 const uint16_t *mapping=getMapping(norm16);
2261 uint16_t firstUnit=*mapping;
2262 norm16=firstUnit>>8; // tccc
2263 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
2264 norm16|=*(mapping-1)&0xff00; // lccc
2265 }
2266 return norm16;
2267 }
2268 #if (defined(_MSC_VER) && (defined(_M_ARM64)) && (_MSC_VER < 1924))
2269 #pragma optimize( "", on )
2270 #endif
2271
// Dual functionality:
// buffer!=NULL: normalize
// buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
//
// Scans the UTF-16 text in [src, limit[ and checks that each character's
// lead combining class is not lower than the previous character's trail combining class.
// limit may be NULL for NUL-terminated input; it is then computed with u_strchr().
// With a buffer, text in proper order is appended via appendZeroCC() and
// out-of-order pieces are decomposed into the buffer via decomposeShort().
// Without a buffer, the function returns the last safe boundary (prevBoundary)
// at the first out-of-order character.
// Otherwise it returns the position where scanning stopped: limit on success,
// or an earlier position if an append or decomposeShort() failed (see errorCode).
const UChar *
Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit,
                         ReorderingBuffer *buffer,
                         UErrorCode &errorCode) const {
    // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
    // Similar to the prevBoundary in the compose() implementation.
    const UChar *prevBoundary=src;
    // >=0: FCD16 value of the previous character.
    // <0: bitwise complement of a previous code point below minLcccCP
    //     whose FCD16 lookup has been deferred.
    int32_t prevFCD16=0;
    if(limit==NULL) {
        src=copyLowPrefixFromNulTerminated(src, minLcccCP, buffer, errorCode);
        if(U_FAILURE(errorCode)) {
            return src;
        }
        if(prevBoundary<src) {
            prevBoundary=src;
            // We know that the previous character's lccc==0.
            // Fetching the fcd16 value was deferred for this below-U+0300 code point.
            prevFCD16=getFCD16(*(src-1));
            if(prevFCD16>1) {
                --prevBoundary;
            }
        }
        limit=u_strchr(src, 0);
    }

    // Note: In this function we use buffer->appendZeroCC() because we track
    // the lead and trail combining classes here, rather than leaving it to
    // the ReorderingBuffer.
    // The exception is the call to decomposeShort() which uses the buffer
    // in the normal way.

    const UChar *prevSrc;
    UChar32 c=0;
    uint16_t fcd16=0;

    for(;;) {
        // count code units with lccc==0
        for(prevSrc=src; src!=limit;) {
            if((c=*src)<minLcccCP) {
                // Defer the FCD16 lookup: remember the code point as its complement (negative).
                prevFCD16=~c;
                ++src;
            } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
                prevFCD16=0;
                ++src;
            } else {
                if(U16_IS_LEAD(c)) {
                    UChar c2;
                    if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
                        c=U16_GET_SUPPLEMENTARY(c, c2);
                    }
                }
                // fcd16<=0xff means that the high byte (lccc) is 0.
                if((fcd16=getFCD16FromNormData(c))<=0xff) {
                    prevFCD16=fcd16;
                    src+=U16_LENGTH(c);
                } else {
                    break;
                }
            }
        }
        // copy these code units all at once
        if(src!=prevSrc) {
            if(buffer!=NULL && !buffer->appendZeroCC(prevSrc, src, errorCode)) {
                break;
            }
            if(src==limit) {
                break;
            }
            prevBoundary=src;
            // We know that the previous character's lccc==0.
            if(prevFCD16<0) {
                // Fetching the fcd16 value was deferred for this below-minLcccCP code point.
                UChar32 prev=~prevFCD16;
                if(prev<minDecompNoCP) {
                    prevFCD16=0;
                } else {
                    prevFCD16=getFCD16FromNormData(prev);
                    if(prevFCD16>1) {
                        --prevBoundary;
                    }
                }
            } else {
                const UChar *p=src-1;
                if(U16_IS_TRAIL(*p) && prevSrc<p && U16_IS_LEAD(*(p-1))) {
                    --p;
                    // Need to fetch the previous character's FCD value because
                    // prevFCD16 was just for the trail surrogate code point.
                    prevFCD16=getFCD16FromNormData(U16_GET_SUPPLEMENTARY(p[0], p[1]));
                    // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
                }
                if(prevFCD16>1) {
                    prevBoundary=p;
                }
            }
            // The start of the current character (c).
            prevSrc=src;
        } else if(src==limit) {
            break;
        }

        src+=U16_LENGTH(c);
        // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
        // Check for proper order, and decompose locally if necessary.
        if((prevFCD16&0xff)<=(fcd16>>8)) {
            // proper order: prev tccc <= current lccc
            if((fcd16&0xff)<=1) {
                prevBoundary=src;
            }
            if(buffer!=NULL && !buffer->appendZeroCC(c, errorCode)) {
                break;
            }
            prevFCD16=fcd16;
            continue;
        } else if(buffer==NULL) {
            return prevBoundary;  // quick check "no"
        } else {
            /*
             * Back out the part of the source that we copied or appended
             * already but is now going to be decomposed.
             * prevSrc is set to after what was copied/appended.
             */
            buffer->removeSuffix((int32_t)(prevSrc-prevBoundary));
            /*
             * Find the part of the source that needs to be decomposed,
             * up to the next safe boundary.
             */
            src=findNextFCDBoundary(src, limit);
            /*
             * The source text does not fulfill the conditions for FCD.
             * Decompose and reorder a limited piece of the text.
             */
            decomposeShort(prevBoundary, src, FALSE, FALSE, *buffer, errorCode);
            if (U_FAILURE(errorCode)) {
                break;
            }
            prevBoundary=src;
            prevFCD16=0;
        }
    }
    return src;
}
2415
makeFCDAndAppend(const UChar * src,const UChar * limit,UBool doMakeFCD,UnicodeString & safeMiddle,ReorderingBuffer & buffer,UErrorCode & errorCode) const2416 void Normalizer2Impl::makeFCDAndAppend(const UChar *src, const UChar *limit,
2417 UBool doMakeFCD,
2418 UnicodeString &safeMiddle,
2419 ReorderingBuffer &buffer,
2420 UErrorCode &errorCode) const {
2421 if(!buffer.isEmpty()) {
2422 const UChar *firstBoundaryInSrc=findNextFCDBoundary(src, limit);
2423 if(src!=firstBoundaryInSrc) {
2424 const UChar *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStart(),
2425 buffer.getLimit());
2426 int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastBoundaryInDest);
2427 UnicodeString middle(lastBoundaryInDest, destSuffixLength);
2428 buffer.removeSuffix(destSuffixLength);
2429 safeMiddle=middle;
2430 middle.append(src, (int32_t)(firstBoundaryInSrc-src));
2431 const UChar *middleStart=middle.getBuffer();
2432 makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode);
2433 if(U_FAILURE(errorCode)) {
2434 return;
2435 }
2436 src=firstBoundaryInSrc;
2437 }
2438 }
2439 if(doMakeFCD) {
2440 makeFCD(src, limit, &buffer, errorCode);
2441 } else {
2442 if(limit==NULL) { // appendZeroCC() needs limit!=NULL
2443 limit=u_strchr(src, 0);
2444 }
2445 buffer.appendZeroCC(src, limit, errorCode);
2446 }
2447 }
2448
/**
 * Walks backward from p toward start and returns the nearest FCD boundary:
 * the position after a code point that is below minDecompNoCP or has a decomposition boundary after it,
 * or the position before a code point that has a decomposition boundary before it.
 * If the loop runs out of text, the current position is returned.
 */
const UChar *Normalizer2Impl::findPreviousFCDBoundary(const UChar *start, const UChar *p) const {
    while(start<p) {
        const UChar *const afterCodePoint=p;
        UChar32 cp;
        uint16_t value;
        UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, start, p, cp, value);
        if(cp<minDecompNoCP || norm16HasDecompBoundaryAfter(value)) {
            // The boundary is behind the code point that was just read.
            p=afterCodePoint;
            break;
        }
        if(norm16HasDecompBoundaryBefore(value)) {
            // The boundary is in front of the code point that was just read.
            break;
        }
    }
    return p;
}
2464
/**
 * Walks forward from p toward limit and returns the nearest FCD boundary:
 * the position before a code point that is below minLcccCP or has a decomposition boundary before it,
 * or the position after a code point that has a decomposition boundary after it.
 * If the loop runs out of text, the current position is returned.
 */
const UChar *Normalizer2Impl::findNextFCDBoundary(const UChar *p, const UChar *limit) const {
    while(p<limit) {
        const UChar *const beforeCodePoint=p;
        UChar32 cp;
        uint16_t value;
        UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, p, limit, cp, value);
        if(cp<minLcccCP || norm16HasDecompBoundaryBefore(value)) {
            // The boundary is in front of the code point that was just read.
            p=beforeCodePoint;
            break;
        }
        if(norm16HasDecompBoundaryAfter(value)) {
            // The boundary is behind the code point that was just read.
            break;
        }
    }
    return p;
}
2480
2481 // CanonicalIterator data -------------------------------------------------- ***
2482
CanonIterData(UErrorCode & errorCode)2483 CanonIterData::CanonIterData(UErrorCode &errorCode) :
2484 mutableTrie(umutablecptrie_open(0, 0, &errorCode)), trie(nullptr),
2485 canonStartSets(uprv_deleteUObject, NULL, errorCode) {}
2486
// Releases both tries. The start sets are released by the canonStartSets member,
// which was constructed with uprv_deleteUObject as its element deleter.
CanonIterData::~CanonIterData() {
    umutablecptrie_close(mutableTrie);
    ucptrie_close(trie);
}
2491
addToStartSet(UChar32 origin,UChar32 decompLead,UErrorCode & errorCode)2492 void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode) {
2493 uint32_t canonValue = umutablecptrie_get(mutableTrie, decompLead);
2494 if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) {
2495 // origin is the first character whose decomposition starts with
2496 // the character for which we are setting the value.
2497 umutablecptrie_set(mutableTrie, decompLead, canonValue|origin, &errorCode);
2498 } else {
2499 // origin is not the first character, or it is U+0000.
2500 UnicodeSet *set;
2501 if((canonValue&CANON_HAS_SET)==0) {
2502 set=new UnicodeSet;
2503 if(set==NULL) {
2504 errorCode=U_MEMORY_ALLOCATION_ERROR;
2505 return;
2506 }
2507 UChar32 firstOrigin=(UChar32)(canonValue&CANON_VALUE_MASK);
2508 canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|(uint32_t)canonStartSets.size();
2509 umutablecptrie_set(mutableTrie, decompLead, canonValue, &errorCode);
2510 canonStartSets.addElement(set, errorCode);
2511 if(firstOrigin!=0) {
2512 set->add(firstOrigin);
2513 }
2514 } else {
2515 set=(UnicodeSet *)canonStartSets[(int32_t)(canonValue&CANON_VALUE_MASK)];
2516 }
2517 set->add(origin);
2518 }
2519 }
2520
// C++ class for friend access to private Normalizer2Impl members.
// Its only member, doInit(), is called from the initCanonIterData() callback
// to build impl->fCanonIterData.
class InitCanonIterData {
public:
    // Creates and fills impl->fCanonIterData; on failure errorCode is set
    // and impl->fCanonIterData is reset to NULL.
    static void doInit(Normalizer2Impl *impl, UErrorCode &errorCode);
};
2526
2527 U_CDECL_BEGIN
2528
// UInitOnce instantiation function for CanonIterData
// Passed to umtx_initOnce() by Normalizer2Impl::ensureCanonIterData();
// forwards to InitCanonIterData::doInit().
static void U_CALLCONV
initCanonIterData(Normalizer2Impl *impl, UErrorCode &errorCode) {
    InitCanonIterData::doInit(impl, errorCode);
}
2534
2535 U_CDECL_END
2536
doInit(Normalizer2Impl * impl,UErrorCode & errorCode)2537 void InitCanonIterData::doInit(Normalizer2Impl *impl, UErrorCode &errorCode) {
2538 U_ASSERT(impl->fCanonIterData == NULL);
2539 impl->fCanonIterData = new CanonIterData(errorCode);
2540 if (impl->fCanonIterData == NULL) {
2541 errorCode=U_MEMORY_ALLOCATION_ERROR;
2542 }
2543 if (U_SUCCESS(errorCode)) {
2544 UChar32 start = 0, end;
2545 uint32_t value;
2546 while ((end = ucptrie_getRange(impl->normTrie, start,
2547 UCPMAP_RANGE_FIXED_LEAD_SURROGATES, Normalizer2Impl::INERT,
2548 nullptr, nullptr, &value)) >= 0) {
2549 // Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters.
2550 if (value != Normalizer2Impl::INERT) {
2551 impl->makeCanonIterDataFromNorm16(start, end, value, *impl->fCanonIterData, errorCode);
2552 }
2553 start = end + 1;
2554 }
2555 #ifdef UCPTRIE_DEBUG
2556 umutablecptrie_setName(impl->fCanonIterData->mutableTrie, "CanonIterData");
2557 #endif
2558 impl->fCanonIterData->trie = umutablecptrie_buildImmutable(
2559 impl->fCanonIterData->mutableTrie, UCPTRIE_TYPE_SMALL, UCPTRIE_VALUE_BITS_32, &errorCode);
2560 umutablecptrie_close(impl->fCanonIterData->mutableTrie);
2561 impl->fCanonIterData->mutableTrie = nullptr;
2562 }
2563 if (U_FAILURE(errorCode)) {
2564 delete impl->fCanonIterData;
2565 impl->fCanonIterData = NULL;
2566 }
2567 }
2568
/**
 * Adds CanonicalIterator data to newData for the code points start..end,
 * which all share the same norm16 value.
 *
 * Returns without changes for inert values and for minYesNo<=norm16<minNoNo.
 * Otherwise, for each code point c in the range, flags are ORed into
 * c's value in newData.mutableTrie, and for a one-way decomposition
 * c is registered via addToStartSet() under the first code point of its mapping.
 *
 * Errors are reported through errorCode by umutablecptrie_set() and addToStartSet();
 * the loop itself does not stop when errorCode indicates a failure.
 */
void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, const uint16_t norm16,
                                                  CanonIterData &newData,
                                                  UErrorCode &errorCode) const {
    if(isInert(norm16) || (minYesNo<=norm16 && norm16<minNoNo)) {
        // Inert, or 2-way mapping (including Hangul syllable).
        // We do not write a canonStartSet for any yesNo character.
        // Composites from 2-way mappings are added at runtime from the
        // starter's compositions list, and the other characters in
        // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are
        // "maybe" characters.
        return;
    }
    for(UChar32 c=start; c<=end; ++c) {
        uint32_t oldValue = umutablecptrie_get(newData.mutableTrie, c);
        // Collect flag changes here; the trie is written once at the end of the iteration.
        uint32_t newValue=oldValue;
        if(isMaybeOrNonZeroCC(norm16)) {
            // not a segment starter if it occurs in a decomposition or has cc!=0
            newValue|=CANON_NOT_SEGMENT_STARTER;
            if(norm16<MIN_NORMAL_MAYBE_YES) {
                newValue|=CANON_HAS_COMPOSITIONS;
            }
        } else if(norm16<minYesNo) {
            newValue|=CANON_HAS_COMPOSITIONS;
        } else {
            // c has a one-way decomposition
            UChar32 c2=c;
            // Do not modify the whole-range norm16 value.
            uint16_t norm16_2=norm16;
            if (isDecompNoAlgorithmic(norm16_2)) {
                // Maps to an isCompYesAndZeroCC.
                c2 = mapAlgorithmic(c2, norm16_2);
                norm16_2 = getRawNorm16(c2);
                // No compatibility mappings for the CanonicalIterator.
                U_ASSERT(!(isHangulLV(norm16_2) || isHangulLVT(norm16_2)));
            }
            if (norm16_2 > minYesNo) {
                // c decomposes, get everything from the variable-length extra data
                const uint16_t *mapping=getMapping(norm16_2);
                uint16_t firstUnit=*mapping;
                int32_t length=firstUnit&MAPPING_LENGTH_MASK;
                if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
                    // c==c2 means that no algorithmic mapping was applied above.
                    if(c==c2 && (*(mapping-1)&0xff)!=0) {
                        newValue|=CANON_NOT_SEGMENT_STARTER;  // original c has cc!=0
                    }
                }
                // Skip empty mappings (no characters in the decomposition).
                if(length!=0) {
                    ++mapping;  // skip over the firstUnit
                    // add c to first code point's start set
                    int32_t i=0;
                    U16_NEXT_UNSAFE(mapping, i, c2);
                    newData.addToStartSet(c, c2, errorCode);
                    // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a
                    // one-way mapping. A 2-way mapping is possible here after
                    // intermediate algorithmic mapping.
                    if(norm16_2>=minNoNo) {
                        while(i<length) {
                            U16_NEXT_UNSAFE(mapping, i, c2);
                            uint32_t c2Value = umutablecptrie_get(newData.mutableTrie, c2);
                            if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {
                                umutablecptrie_set(newData.mutableTrie, c2,
                                                   c2Value|CANON_NOT_SEGMENT_STARTER, &errorCode);
                            }
                        }
                    }
                }
            } else {
                // c decomposed to c2 algorithmically; c has cc==0
                newData.addToStartSet(c, c2, errorCode);
            }
        }
        if(newValue!=oldValue) {
            umutablecptrie_set(newData.mutableTrie, c, newValue, &errorCode);
        }
    }
}
2645
ensureCanonIterData(UErrorCode & errorCode) const2646 UBool Normalizer2Impl::ensureCanonIterData(UErrorCode &errorCode) const {
2647 // Logically const: Synchronized instantiation.
2648 Normalizer2Impl *me=const_cast<Normalizer2Impl *>(this);
2649 umtx_initOnce(me->fCanonIterDataInitOnce, &initCanonIterData, me, errorCode);
2650 return U_SUCCESS(errorCode);
2651 }
2652
getCanonValue(UChar32 c) const2653 int32_t Normalizer2Impl::getCanonValue(UChar32 c) const {
2654 return (int32_t)ucptrie_get(fCanonIterData->trie, c);
2655 }
2656
getCanonStartSet(int32_t n) const2657 const UnicodeSet &Normalizer2Impl::getCanonStartSet(int32_t n) const {
2658 return *(const UnicodeSet *)fCanonIterData->canonStartSets[n];
2659 }
2660
isCanonSegmentStarter(UChar32 c) const2661 UBool Normalizer2Impl::isCanonSegmentStarter(UChar32 c) const {
2662 return getCanonValue(c)>=0;
2663 }
2664
getCanonStartSet(UChar32 c,UnicodeSet & set) const2665 UBool Normalizer2Impl::getCanonStartSet(UChar32 c, UnicodeSet &set) const {
2666 int32_t canonValue=getCanonValue(c)&~CANON_NOT_SEGMENT_STARTER;
2667 if(canonValue==0) {
2668 return FALSE;
2669 }
2670 set.clear();
2671 int32_t value=canonValue&CANON_VALUE_MASK;
2672 if((canonValue&CANON_HAS_SET)!=0) {
2673 set.addAll(getCanonStartSet(value));
2674 } else if(value!=0) {
2675 set.add(value);
2676 }
2677 if((canonValue&CANON_HAS_COMPOSITIONS)!=0) {
2678 uint16_t norm16=getRawNorm16(c);
2679 if(norm16==JAMO_L) {
2680 UChar32 syllable=
2681 (UChar32)(Hangul::HANGUL_BASE+(c-Hangul::JAMO_L_BASE)*Hangul::JAMO_VT_COUNT);
2682 set.add(syllable, syllable+Hangul::JAMO_VT_COUNT-1);
2683 } else {
2684 addComposites(getCompositionsList(norm16), set);
2685 }
2686 }
2687 return TRUE;
2688 }
2689
2690 U_NAMESPACE_END
2691
2692 // Normalizer2 data swapping ----------------------------------------------- ***
2693
2694 U_NAMESPACE_USE
2695
2696 U_CAPI int32_t U_EXPORT2
unorm2_swap(const UDataSwapper * ds,const void * inData,int32_t length,void * outData,UErrorCode * pErrorCode)2697 unorm2_swap(const UDataSwapper *ds,
2698 const void *inData, int32_t length, void *outData,
2699 UErrorCode *pErrorCode) {
2700 const UDataInfo *pInfo;
2701 int32_t headerSize;
2702
2703 const uint8_t *inBytes;
2704 uint8_t *outBytes;
2705
2706 const int32_t *inIndexes;
2707 int32_t indexes[Normalizer2Impl::IX_TOTAL_SIZE+1];
2708
2709 int32_t i, offset, nextOffset, size;
2710
2711 /* udata_swapDataHeader checks the arguments */
2712 headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
2713 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
2714 return 0;
2715 }
2716
2717 /* check data format and format version */
2718 pInfo=(const UDataInfo *)((const char *)inData+4);
2719 uint8_t formatVersion0=pInfo->formatVersion[0];
2720 if(!(
2721 pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */
2722 pInfo->dataFormat[1]==0x72 &&
2723 pInfo->dataFormat[2]==0x6d &&
2724 pInfo->dataFormat[3]==0x32 &&
2725 (1<=formatVersion0 && formatVersion0<=4)
2726 )) {
2727 udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n",
2728 pInfo->dataFormat[0], pInfo->dataFormat[1],
2729 pInfo->dataFormat[2], pInfo->dataFormat[3],
2730 pInfo->formatVersion[0]);
2731 *pErrorCode=U_UNSUPPORTED_ERROR;
2732 return 0;
2733 }
2734
2735 inBytes=(const uint8_t *)inData+headerSize;
2736 outBytes=(uint8_t *)outData+headerSize;
2737
2738 inIndexes=(const int32_t *)inBytes;
2739 int32_t minIndexesLength;
2740 if(formatVersion0==1) {
2741 minIndexesLength=Normalizer2Impl::IX_MIN_MAYBE_YES+1;
2742 } else if(formatVersion0==2) {
2743 minIndexesLength=Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY+1;
2744 } else {
2745 minIndexesLength=Normalizer2Impl::IX_MIN_LCCC_CP+1;
2746 }
2747
2748 if(length>=0) {
2749 length-=headerSize;
2750 if(length<minIndexesLength*4) {
2751 udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n",
2752 length);
2753 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2754 return 0;
2755 }
2756 }
2757
2758 /* read the first few indexes */
2759 for(i=0; i<UPRV_LENGTHOF(indexes); ++i) {
2760 indexes[i]=udata_readInt32(ds, inIndexes[i]);
2761 }
2762
2763 /* get the total length of the data */
2764 size=indexes[Normalizer2Impl::IX_TOTAL_SIZE];
2765
2766 if(length>=0) {
2767 if(length<size) {
2768 udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for all of Normalizer2 data\n",
2769 length);
2770 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2771 return 0;
2772 }
2773
2774 /* copy the data for inaccessible bytes */
2775 if(inBytes!=outBytes) {
2776 uprv_memcpy(outBytes, inBytes, size);
2777 }
2778
2779 offset=0;
2780
2781 /* swap the int32_t indexes[] */
2782 nextOffset=indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET];
2783 ds->swapArray32(ds, inBytes, nextOffset-offset, outBytes, pErrorCode);
2784 offset=nextOffset;
2785
2786 /* swap the trie */
2787 nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET];
2788 utrie_swapAnyVersion(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
2789 offset=nextOffset;
2790
2791 /* swap the uint16_t extraData[] */
2792 nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET];
2793 ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
2794 offset=nextOffset;
2795
2796 /* no need to swap the uint8_t smallFCD[] (new in formatVersion 2) */
2797 nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET+1];
2798 offset=nextOffset;
2799
2800 U_ASSERT(offset==size);
2801 }
2802
2803 return headerSize+size;
2804 }
2805
2806 #endif // !UCONFIG_NO_NORMALIZATION
2807