1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2012-2014, International Business Machines
6 * Corporation and others.  All Rights Reserved.
7 *******************************************************************************
8 * uitercollationiterator.cpp
9 *
10 * created on: 2012sep23 (from utf16collationiterator.cpp)
11 * created by: Markus W. Scherer
12 */
13 
14 #include "unicode/utypes.h"
15 
16 #if !UCONFIG_NO_COLLATION
17 
18 #include "unicode/uiter.h"
19 #include "charstr.h"
20 #include "cmemory.h"
21 #include "collation.h"
22 #include "collationdata.h"
23 #include "collationfcd.h"
24 #include "collationiterator.h"
25 #include "normalizer2impl.h"
26 #include "uassert.h"
27 #include "uitercollationiterator.h"
28 
29 U_NAMESPACE_BEGIN
30 
~UIterCollationIterator()31 UIterCollationIterator::~UIterCollationIterator() {}
32 
33 void
resetToOffset(int32_t newOffset)34 UIterCollationIterator::resetToOffset(int32_t newOffset) {
35     reset();
36     iter.move(&iter, newOffset, UITER_START);
37 }
38 
39 int32_t
getOffset() const40 UIterCollationIterator::getOffset() const {
41     return iter.getIndex(&iter, UITER_CURRENT);
42 }
43 
44 uint32_t
handleNextCE32(UChar32 & c,UErrorCode &)45 UIterCollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /*errorCode*/) {
46     c = iter.next(&iter);
47     if(c < 0) {
48         return Collation::FALLBACK_CE32;
49     }
50     return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
51 }
52 
53 UChar
handleGetTrailSurrogate()54 UIterCollationIterator::handleGetTrailSurrogate() {
55     UChar32 trail = iter.next(&iter);
56     if(!U16_IS_TRAIL(trail) && trail >= 0) { iter.previous(&iter); }
57     return (UChar)trail;
58 }
59 
60 UChar32
nextCodePoint(UErrorCode &)61 UIterCollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) {
62     return uiter_next32(&iter);
63 }
64 
65 UChar32
previousCodePoint(UErrorCode &)66 UIterCollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) {
67     return uiter_previous32(&iter);
68 }
69 
70 void
forwardNumCodePoints(int32_t num,UErrorCode &)71 UIterCollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
72     while(num > 0 && (uiter_next32(&iter)) >= 0) {
73         --num;
74     }
75 }
76 
77 void
backwardNumCodePoints(int32_t num,UErrorCode &)78 UIterCollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
79     while(num > 0 && (uiter_previous32(&iter)) >= 0) {
80         --num;
81     }
82 }
83 
84 // FCDUIterCollationIterator ----------------------------------------------- ***
85 
~FCDUIterCollationIterator()86 FCDUIterCollationIterator::~FCDUIterCollationIterator() {}
87 
88 void
resetToOffset(int32_t newOffset)89 FCDUIterCollationIterator::resetToOffset(int32_t newOffset) {
90     UIterCollationIterator::resetToOffset(newOffset);
91     start = newOffset;
92     state = ITER_CHECK_FWD;
93 }
94 
95 int32_t
getOffset() const96 FCDUIterCollationIterator::getOffset() const {
97     if(state <= ITER_CHECK_BWD) {
98         return iter.getIndex(&iter, UITER_CURRENT);
99     } else if(state == ITER_IN_FCD_SEGMENT) {
100         return pos;
101     } else if(pos == 0) {
102         return start;
103     } else {
104         return limit;
105     }
106 }
107 
108 uint32_t
handleNextCE32(UChar32 & c,UErrorCode & errorCode)109 FCDUIterCollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) {
110     for(;;) {
111         if(state == ITER_CHECK_FWD) {
112             c = iter.next(&iter);
113             if(c < 0) {
114                 return Collation::FALLBACK_CE32;
115             }
116             if(CollationFCD::hasTccc(c)) {
117                 if(CollationFCD::maybeTibetanCompositeVowel(c) ||
118                         CollationFCD::hasLccc(iter.current(&iter))) {
119                     iter.previous(&iter);
120                     if(!nextSegment(errorCode)) {
121                         c = U_SENTINEL;
122                         return Collation::FALLBACK_CE32;
123                     }
124                     continue;
125                 }
126             }
127             break;
128         } else if(state == ITER_IN_FCD_SEGMENT && pos != limit) {
129             c = iter.next(&iter);
130             ++pos;
131             U_ASSERT(c >= 0);
132             break;
133         } else if(state >= IN_NORM_ITER_AT_LIMIT && pos != normalized.length()) {
134             c = normalized[pos++];
135             break;
136         } else {
137             switchToForward();
138         }
139     }
140     return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
141 }
142 
143 UChar
handleGetTrailSurrogate()144 FCDUIterCollationIterator::handleGetTrailSurrogate() {
145     if(state <= ITER_IN_FCD_SEGMENT) {
146         UChar32 trail = iter.next(&iter);
147         if(U16_IS_TRAIL(trail)) {
148             if(state == ITER_IN_FCD_SEGMENT) { ++pos; }
149         } else if(trail >= 0) {
150             iter.previous(&iter);
151         }
152         return (UChar)trail;
153     } else {
154         U_ASSERT(pos < normalized.length());
155         UChar trail;
156         if(U16_IS_TRAIL(trail = normalized[pos])) { ++pos; }
157         return trail;
158     }
159 }
160 
161 UChar32
nextCodePoint(UErrorCode & errorCode)162 FCDUIterCollationIterator::nextCodePoint(UErrorCode &errorCode) {
163     UChar32 c;
164     for(;;) {
165         if(state == ITER_CHECK_FWD) {
166             c = iter.next(&iter);
167             if(c < 0) {
168                 return c;
169             }
170             if(CollationFCD::hasTccc(c)) {
171                 if(CollationFCD::maybeTibetanCompositeVowel(c) ||
172                         CollationFCD::hasLccc(iter.current(&iter))) {
173                     iter.previous(&iter);
174                     if(!nextSegment(errorCode)) {
175                         return U_SENTINEL;
176                     }
177                     continue;
178                 }
179             }
180             if(U16_IS_LEAD(c)) {
181                 UChar32 trail = iter.next(&iter);
182                 if(U16_IS_TRAIL(trail)) {
183                     return U16_GET_SUPPLEMENTARY(c, trail);
184                 } else if(trail >= 0) {
185                     iter.previous(&iter);
186                 }
187             }
188             return c;
189         } else if(state == ITER_IN_FCD_SEGMENT && pos != limit) {
190             c = uiter_next32(&iter);
191             pos += U16_LENGTH(c);
192             U_ASSERT(c >= 0);
193             return c;
194         } else if(state >= IN_NORM_ITER_AT_LIMIT && pos != normalized.length()) {
195             c = normalized.char32At(pos);
196             pos += U16_LENGTH(c);
197             return c;
198         } else {
199             switchToForward();
200         }
201     }
202 }
203 
204 UChar32
previousCodePoint(UErrorCode & errorCode)205 FCDUIterCollationIterator::previousCodePoint(UErrorCode &errorCode) {
206     UChar32 c;
207     for(;;) {
208         if(state == ITER_CHECK_BWD) {
209             c = iter.previous(&iter);
210             if(c < 0) {
211                 start = pos = 0;
212                 state = ITER_IN_FCD_SEGMENT;
213                 return U_SENTINEL;
214             }
215             if(CollationFCD::hasLccc(c)) {
216                 UChar32 prev = U_SENTINEL;
217                 if(CollationFCD::maybeTibetanCompositeVowel(c) ||
218                         CollationFCD::hasTccc(prev = iter.previous(&iter))) {
219                     iter.next(&iter);
220                     if(prev >= 0) {
221                         iter.next(&iter);
222                     }
223                     if(!previousSegment(errorCode)) {
224                         return U_SENTINEL;
225                     }
226                     continue;
227                 }
228                 // hasLccc(trail)=true for all trail surrogates
229                 if(U16_IS_TRAIL(c)) {
230                     if(prev < 0) {
231                         prev = iter.previous(&iter);
232                     }
233                     if(U16_IS_LEAD(prev)) {
234                         return U16_GET_SUPPLEMENTARY(prev, c);
235                     }
236                 }
237                 if(prev >= 0) {
238                     iter.next(&iter);
239                 }
240             }
241             return c;
242         } else if(state == ITER_IN_FCD_SEGMENT && pos != start) {
243             c = uiter_previous32(&iter);
244             pos -= U16_LENGTH(c);
245             U_ASSERT(c >= 0);
246             return c;
247         } else if(state >= IN_NORM_ITER_AT_LIMIT && pos != 0) {
248             c = normalized.char32At(pos - 1);
249             pos -= U16_LENGTH(c);
250             return c;
251         } else {
252             switchToBackward();
253         }
254     }
255 }
256 
257 void
forwardNumCodePoints(int32_t num,UErrorCode & errorCode)258 FCDUIterCollationIterator::forwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
259     // Specify the class to avoid a virtual-function indirection.
260     // In Java, we would declare this class final.
261     while(num > 0 && FCDUIterCollationIterator::nextCodePoint(errorCode) >= 0) {
262         --num;
263     }
264 }
265 
266 void
backwardNumCodePoints(int32_t num,UErrorCode & errorCode)267 FCDUIterCollationIterator::backwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
268     // Specify the class to avoid a virtual-function indirection.
269     // In Java, we would declare this class final.
270     while(num > 0 && FCDUIterCollationIterator::previousCodePoint(errorCode) >= 0) {
271         --num;
272     }
273 }
274 
275 void
switchToForward()276 FCDUIterCollationIterator::switchToForward() {
277     U_ASSERT(state == ITER_CHECK_BWD ||
278              (state == ITER_IN_FCD_SEGMENT && pos == limit) ||
279              (state >= IN_NORM_ITER_AT_LIMIT && pos == normalized.length()));
280     if(state == ITER_CHECK_BWD) {
281         // Turn around from backward checking.
282         start = pos = iter.getIndex(&iter, UITER_CURRENT);
283         if(pos == limit) {
284             state = ITER_CHECK_FWD;  // Check forward.
285         } else {  // pos < limit
286             state = ITER_IN_FCD_SEGMENT;  // Stay in FCD segment.
287         }
288     } else {
289         // Reached the end of the FCD segment.
290         if(state == ITER_IN_FCD_SEGMENT) {
291             // The input text segment is FCD, extend it forward.
292         } else {
293             // The input text segment needed to be normalized.
294             // Switch to checking forward from it.
295             if(state == IN_NORM_ITER_AT_START) {
296                 iter.move(&iter, limit - start, UITER_CURRENT);
297             }
298             start = limit;
299         }
300         state = ITER_CHECK_FWD;
301     }
302 }
303 
304 UBool
nextSegment(UErrorCode & errorCode)305 FCDUIterCollationIterator::nextSegment(UErrorCode &errorCode) {
306     if(U_FAILURE(errorCode)) { return FALSE; }
307     U_ASSERT(state == ITER_CHECK_FWD);
308     // The input text [start..(iter index)[ passes the FCD check.
309     pos = iter.getIndex(&iter, UITER_CURRENT);
310     // Collect the characters being checked, in case they need to be normalized.
311     UnicodeString s;
312     uint8_t prevCC = 0;
313     for(;;) {
314         // Fetch the next character and its fcd16 value.
315         UChar32 c = uiter_next32(&iter);
316         if(c < 0) { break; }
317         uint16_t fcd16 = nfcImpl.getFCD16(c);
318         uint8_t leadCC = (uint8_t)(fcd16 >> 8);
319         if(leadCC == 0 && !s.isEmpty()) {
320             // FCD boundary before this character.
321             uiter_previous32(&iter);
322             break;
323         }
324         s.append(c);
325         if(leadCC != 0 && (prevCC > leadCC || CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
326             // Fails FCD check. Find the next FCD boundary and normalize.
327             for(;;) {
328                 c = uiter_next32(&iter);
329                 if(c < 0) { break; }
330                 if(nfcImpl.getFCD16(c) <= 0xff) {
331                     uiter_previous32(&iter);
332                     break;
333                 }
334                 s.append(c);
335             }
336             if(!normalize(s, errorCode)) { return FALSE; }
337             start = pos;
338             limit = pos + s.length();
339             state = IN_NORM_ITER_AT_LIMIT;
340             pos = 0;
341             return TRUE;
342         }
343         prevCC = (uint8_t)fcd16;
344         if(prevCC == 0) {
345             // FCD boundary after the last character.
346             break;
347         }
348     }
349     limit = pos + s.length();
350     U_ASSERT(pos != limit);
351     iter.move(&iter, -s.length(), UITER_CURRENT);
352     state = ITER_IN_FCD_SEGMENT;
353     return TRUE;
354 }
355 
356 void
switchToBackward()357 FCDUIterCollationIterator::switchToBackward() {
358     U_ASSERT(state == ITER_CHECK_FWD ||
359              (state == ITER_IN_FCD_SEGMENT && pos == start) ||
360              (state >= IN_NORM_ITER_AT_LIMIT && pos == 0));
361     if(state == ITER_CHECK_FWD) {
362         // Turn around from forward checking.
363         limit = pos = iter.getIndex(&iter, UITER_CURRENT);
364         if(pos == start) {
365             state = ITER_CHECK_BWD;  // Check backward.
366         } else {  // pos > start
367             state = ITER_IN_FCD_SEGMENT;  // Stay in FCD segment.
368         }
369     } else {
370         // Reached the start of the FCD segment.
371         if(state == ITER_IN_FCD_SEGMENT) {
372             // The input text segment is FCD, extend it backward.
373         } else {
374             // The input text segment needed to be normalized.
375             // Switch to checking backward from it.
376             if(state == IN_NORM_ITER_AT_LIMIT) {
377                 iter.move(&iter, start - limit, UITER_CURRENT);
378             }
379             limit = start;
380         }
381         state = ITER_CHECK_BWD;
382     }
383 }
384 
385 UBool
previousSegment(UErrorCode & errorCode)386 FCDUIterCollationIterator::previousSegment(UErrorCode &errorCode) {
387     if(U_FAILURE(errorCode)) { return FALSE; }
388     U_ASSERT(state == ITER_CHECK_BWD);
389     // The input text [(iter index)..limit[ passes the FCD check.
390     pos = iter.getIndex(&iter, UITER_CURRENT);
391     // Collect the characters being checked, in case they need to be normalized.
392     UnicodeString s;
393     uint8_t nextCC = 0;
394     for(;;) {
395         // Fetch the previous character and its fcd16 value.
396         UChar32 c = uiter_previous32(&iter);
397         if(c < 0) { break; }
398         uint16_t fcd16 = nfcImpl.getFCD16(c);
399         uint8_t trailCC = (uint8_t)fcd16;
400         if(trailCC == 0 && !s.isEmpty()) {
401             // FCD boundary after this character.
402             uiter_next32(&iter);
403             break;
404         }
405         s.append(c);
406         if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) ||
407                             CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
408             // Fails FCD check. Find the previous FCD boundary and normalize.
409             while(fcd16 > 0xff) {
410                 c = uiter_previous32(&iter);
411                 if(c < 0) { break; }
412                 fcd16 = nfcImpl.getFCD16(c);
413                 if(fcd16 == 0) {
414                     (void)uiter_next32(&iter);
415                     break;
416                 }
417                 s.append(c);
418             }
419             s.reverse();
420             if(!normalize(s, errorCode)) { return FALSE; }
421             limit = pos;
422             start = pos - s.length();
423             state = IN_NORM_ITER_AT_START;
424             pos = normalized.length();
425             return TRUE;
426         }
427         nextCC = (uint8_t)(fcd16 >> 8);
428         if(nextCC == 0) {
429             // FCD boundary before the following character.
430             break;
431         }
432     }
433     start = pos - s.length();
434     U_ASSERT(pos != start);
435     iter.move(&iter, s.length(), UITER_CURRENT);
436     state = ITER_IN_FCD_SEGMENT;
437     return TRUE;
438 }
439 
440 UBool
normalize(const UnicodeString & s,UErrorCode & errorCode)441 FCDUIterCollationIterator::normalize(const UnicodeString &s, UErrorCode &errorCode) {
442     // NFD without argument checking.
443     U_ASSERT(U_SUCCESS(errorCode));
444     nfcImpl.decompose(s, normalized, errorCode);
445     return U_SUCCESS(errorCode);
446 }
447 
448 U_NAMESPACE_END
449 
450 #endif  // !UCONFIG_NO_COLLATION
451