1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 1996-2014, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 *******************************************************************************
8 */
9 
10 /*
11 * File coleitr.cpp
12 *
13 * Created by: Helena Shih
14 *
15 * Modification History:
16 *
17 *  Date      Name        Description
18 *
19 *  6/23/97   helena      Adding comments to make code more readable.
20 * 08/03/98   erm         Synched with 1.2 version of CollationElementIterator.java
21 * 12/10/99   aliu        Ported Thai collation support from Java.
22 * 01/25/01   swquek      Modified to a C++ wrapper calling C APIs (ucoliter.h)
23 * 02/19/01   swquek      Removed CollationElementIterator() since it is
24 *                        private constructor and no calls are made to it
25 * 2012-2014  markus      Rewritten in C++ again.
26 */
27 
28 #include "unicode/utypes.h"
29 
30 #if !UCONFIG_NO_COLLATION
31 
32 #include "unicode/chariter.h"
33 #include "unicode/coleitr.h"
34 #include "unicode/tblcoll.h"
35 #include "unicode/ustring.h"
36 #include "cmemory.h"
37 #include "collation.h"
38 #include "collationdata.h"
39 #include "collationiterator.h"
40 #include "collationsets.h"
41 #include "collationtailoring.h"
42 #include "uassert.h"
43 #include "uhash.h"
44 #include "utf16collationiterator.h"
45 #include "uvectr32.h"
46 
47 /* Constants --------------------------------------------------------------- */
48 
49 U_NAMESPACE_BEGIN
50 
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollationElementIterator)51 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollationElementIterator)
52 
53 /* CollationElementIterator public constructor/destructor ------------------ */
54 
55 CollationElementIterator::CollationElementIterator(
56                                          const CollationElementIterator& other)
57         : UObject(other), iter_(NULL), rbc_(NULL), otherHalf_(0), dir_(0), offsets_(NULL) {
58     *this = other;
59 }
60 
~CollationElementIterator()61 CollationElementIterator::~CollationElementIterator()
62 {
63     delete iter_;
64     delete offsets_;
65 }
66 
67 /* CollationElementIterator public methods --------------------------------- */
68 
69 namespace {
70 
getFirstHalf(uint32_t p,uint32_t lower32)71 uint32_t getFirstHalf(uint32_t p, uint32_t lower32) {
72     return (p & 0xffff0000) | ((lower32 >> 16) & 0xff00) | ((lower32 >> 8) & 0xff);
73 }
getSecondHalf(uint32_t p,uint32_t lower32)74 uint32_t getSecondHalf(uint32_t p, uint32_t lower32) {
75     return (p << 16) | ((lower32 >> 8) & 0xff00) | (lower32 & 0x3f);
76 }
ceNeedsTwoParts(int64_t ce)77 UBool ceNeedsTwoParts(int64_t ce) {
78     return (ce & INT64_C(0xffff00ff003f)) != 0;
79 }
80 
81 }  // namespace
82 
getOffset() const83 int32_t CollationElementIterator::getOffset() const
84 {
85     if (dir_ < 0 && offsets_ != NULL && !offsets_->isEmpty()) {
86         // CollationIterator::previousCE() decrements the CEs length
87         // while it pops CEs from its internal buffer.
88         int32_t i = iter_->getCEsLength();
89         if (otherHalf_ != 0) {
90             // Return the trailing CE offset while we are in the middle of a 64-bit CE.
91             ++i;
92         }
93         U_ASSERT(i < offsets_->size());
94         return offsets_->elementAti(i);
95     }
96     return iter_->getOffset();
97 }
98 
99 /**
100 * Get the ordering priority of the next character in the string.
101 * @return the next character's ordering. Returns NULLORDER if an error has
102 *         occured or if the end of string has been reached
103 */
next(UErrorCode & status)104 int32_t CollationElementIterator::next(UErrorCode& status)
105 {
106     if (U_FAILURE(status)) { return NULLORDER; }
107     if (dir_ > 1) {
108         // Continue forward iteration. Test this first.
109         if (otherHalf_ != 0) {
110             uint32_t oh = otherHalf_;
111             otherHalf_ = 0;
112             return oh;
113         }
114     } else if (dir_ == 1) {
115         // next() after setOffset()
116         dir_ = 2;
117     } else if (dir_ == 0) {
118         // The iter_ is already reset to the start of the text.
119         dir_ = 2;
120     } else /* dir_ < 0 */ {
121         // illegal change of direction
122         status = U_INVALID_STATE_ERROR;
123         return NULLORDER;
124     }
125     // No need to keep all CEs in the buffer when we iterate.
126     iter_->clearCEsIfNoneRemaining();
127     int64_t ce = iter_->nextCE(status);
128     if (ce == Collation::NO_CE) { return NULLORDER; }
129     // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
130     uint32_t p = (uint32_t)(ce >> 32);
131     uint32_t lower32 = (uint32_t)ce;
132     uint32_t firstHalf = getFirstHalf(p, lower32);
133     uint32_t secondHalf = getSecondHalf(p, lower32);
134     if (secondHalf != 0) {
135         otherHalf_ = secondHalf | 0xc0;  // continuation CE
136     }
137     return firstHalf;
138 }
139 
operator !=(const CollationElementIterator & other) const140 UBool CollationElementIterator::operator!=(
141                                   const CollationElementIterator& other) const
142 {
143     return !(*this == other);
144 }
145 
operator ==(const CollationElementIterator & that) const146 UBool CollationElementIterator::operator==(
147                                     const CollationElementIterator& that) const
148 {
149     if (this == &that) {
150         return TRUE;
151     }
152 
153     return
154         (rbc_ == that.rbc_ || *rbc_ == *that.rbc_) &&
155         otherHalf_ == that.otherHalf_ &&
156         normalizeDir() == that.normalizeDir() &&
157         string_ == that.string_ &&
158         *iter_ == *that.iter_;
159 }
160 
161 /**
162 * Get the ordering priority of the previous collation element in the string.
163 * @param status the error code status.
164 * @return the previous element's ordering. Returns NULLORDER if an error has
165 *         occured or if the start of string has been reached.
166 */
previous(UErrorCode & status)167 int32_t CollationElementIterator::previous(UErrorCode& status)
168 {
169     if (U_FAILURE(status)) { return NULLORDER; }
170     if (dir_ < 0) {
171         // Continue backwards iteration. Test this first.
172         if (otherHalf_ != 0) {
173             uint32_t oh = otherHalf_;
174             otherHalf_ = 0;
175             return oh;
176         }
177     } else if (dir_ == 0) {
178         iter_->resetToOffset(string_.length());
179         dir_ = -1;
180     } else if (dir_ == 1) {
181         // previous() after setOffset()
182         dir_ = -1;
183     } else /* dir_ > 1 */ {
184         // illegal change of direction
185         status = U_INVALID_STATE_ERROR;
186         return NULLORDER;
187     }
188     if (offsets_ == NULL) {
189         offsets_ = new UVector32(status);
190         if (offsets_ == NULL) {
191             status = U_MEMORY_ALLOCATION_ERROR;
192             return NULLORDER;
193         }
194     }
195     // If we already have expansion CEs, then we also have offsets.
196     // Otherwise remember the trailing offset in case we need to
197     // write offsets for an artificial expansion.
198     int32_t limitOffset = iter_->getCEsLength() == 0 ? iter_->getOffset() : 0;
199     int64_t ce = iter_->previousCE(*offsets_, status);
200     if (ce == Collation::NO_CE) { return NULLORDER; }
201     // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
202     uint32_t p = (uint32_t)(ce >> 32);
203     uint32_t lower32 = (uint32_t)ce;
204     uint32_t firstHalf = getFirstHalf(p, lower32);
205     uint32_t secondHalf = getSecondHalf(p, lower32);
206     if (secondHalf != 0) {
207         if (offsets_->isEmpty()) {
208             // When we convert a single 64-bit CE into two 32-bit CEs,
209             // we need to make this artificial expansion behave like a normal expansion.
210             // See CollationIterator::previousCE().
211             offsets_->addElement(iter_->getOffset(), status);
212             offsets_->addElement(limitOffset, status);
213         }
214         otherHalf_ = firstHalf;
215         return secondHalf | 0xc0;  // continuation CE
216     }
217     return firstHalf;
218 }
219 
220 /**
221 * Resets the cursor to the beginning of the string.
222 */
reset()223 void CollationElementIterator::reset()
224 {
225     iter_ ->resetToOffset(0);
226     otherHalf_ = 0;
227     dir_ = 0;
228 }
229 
setOffset(int32_t newOffset,UErrorCode & status)230 void CollationElementIterator::setOffset(int32_t newOffset,
231                                          UErrorCode& status)
232 {
233     if (U_FAILURE(status)) { return; }
234     if (0 < newOffset && newOffset < string_.length()) {
235         int32_t offset = newOffset;
236         do {
237             UChar c = string_.charAt(offset);
238             if (!rbc_->isUnsafe(c) ||
239                     (U16_IS_LEAD(c) && !rbc_->isUnsafe(string_.char32At(offset)))) {
240                 break;
241             }
242             // Back up to before this unsafe character.
243             --offset;
244         } while (offset > 0);
245         if (offset < newOffset) {
246             // We might have backed up more than necessary.
247             // For example, contractions "ch" and "cu" make both 'h' and 'u' unsafe,
248             // but for text "chu" setOffset(2) should remain at 2
249             // although we initially back up to offset 0.
250             // Find the last safe offset no greater than newOffset by iterating forward.
251             int32_t lastSafeOffset = offset;
252             do {
253                 iter_->resetToOffset(lastSafeOffset);
254                 do {
255                     iter_->nextCE(status);
256                     if (U_FAILURE(status)) { return; }
257                 } while ((offset = iter_->getOffset()) == lastSafeOffset);
258                 if (offset <= newOffset) {
259                     lastSafeOffset = offset;
260                 }
261             } while (offset < newOffset);
262             newOffset = lastSafeOffset;
263         }
264     }
265     iter_->resetToOffset(newOffset);
266     otherHalf_ = 0;
267     dir_ = 1;
268 }
269 
270 /**
271 * Sets the source to the new source string.
272 */
setText(const UnicodeString & source,UErrorCode & status)273 void CollationElementIterator::setText(const UnicodeString& source,
274                                        UErrorCode& status)
275 {
276     if (U_FAILURE(status)) {
277         return;
278     }
279 
280     string_ = source;
281     const UChar *s = string_.getBuffer();
282     CollationIterator *newIter;
283     UBool numeric = rbc_->settings->isNumeric();
284     if (rbc_->settings->dontCheckFCD()) {
285         newIter = new UTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length());
286     } else {
287         newIter = new FCDUTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length());
288     }
289     if (newIter == NULL) {
290         status = U_MEMORY_ALLOCATION_ERROR;
291         return;
292     }
293     delete iter_;
294     iter_ = newIter;
295     otherHalf_ = 0;
296     dir_ = 0;
297 }
298 
299 // Sets the source to the new character iterator.
setText(CharacterIterator & source,UErrorCode & status)300 void CollationElementIterator::setText(CharacterIterator& source,
301                                        UErrorCode& status)
302 {
303     if (U_FAILURE(status))
304         return;
305 
306     source.getText(string_);
307     setText(string_, status);
308 }
309 
strengthOrder(int32_t order) const310 int32_t CollationElementIterator::strengthOrder(int32_t order) const
311 {
312     UColAttributeValue s = (UColAttributeValue)rbc_->settings->getStrength();
313     // Mask off the unwanted differences.
314     if (s == UCOL_PRIMARY) {
315         order &= 0xffff0000;
316     }
317     else if (s == UCOL_SECONDARY) {
318         order &= 0xffffff00;
319     }
320 
321     return order;
322 }
323 
324 /* CollationElementIterator private constructors/destructors --------------- */
325 
326 /**
327 * This is the "real" constructor for this class; it constructs an iterator
328 * over the source text using the specified collator
329 */
CollationElementIterator(const UnicodeString & source,const RuleBasedCollator * coll,UErrorCode & status)330 CollationElementIterator::CollationElementIterator(
331                                                const UnicodeString &source,
332                                                const RuleBasedCollator *coll,
333                                                UErrorCode &status)
334         : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) {
335     setText(source, status);
336 }
337 
338 /**
339 * This is the "real" constructor for this class; it constructs an iterator over
340 * the source text using the specified collator
341 */
CollationElementIterator(const CharacterIterator & source,const RuleBasedCollator * coll,UErrorCode & status)342 CollationElementIterator::CollationElementIterator(
343                                            const CharacterIterator &source,
344                                            const RuleBasedCollator *coll,
345                                            UErrorCode &status)
346         : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) {
347     // We only call source.getText() which should be const anyway.
348     setText(const_cast<CharacterIterator &>(source), status);
349 }
350 
351 /* CollationElementIterator private methods -------------------------------- */
352 
operator =(const CollationElementIterator & other)353 const CollationElementIterator& CollationElementIterator::operator=(
354                                          const CollationElementIterator& other)
355 {
356     if (this == &other) {
357         return *this;
358     }
359 
360     CollationIterator *newIter;
361     const FCDUTF16CollationIterator *otherFCDIter =
362             dynamic_cast<const FCDUTF16CollationIterator *>(other.iter_);
363     if(otherFCDIter != NULL) {
364         newIter = new FCDUTF16CollationIterator(*otherFCDIter, string_.getBuffer());
365     } else {
366         const UTF16CollationIterator *otherIter =
367                 dynamic_cast<const UTF16CollationIterator *>(other.iter_);
368         if(otherIter != NULL) {
369             newIter = new UTF16CollationIterator(*otherIter, string_.getBuffer());
370         } else {
371             newIter = NULL;
372         }
373     }
374     if(newIter != NULL) {
375         delete iter_;
376         iter_ = newIter;
377         rbc_ = other.rbc_;
378         otherHalf_ = other.otherHalf_;
379         dir_ = other.dir_;
380 
381         string_ = other.string_;
382     }
383     if(other.dir_ < 0 && other.offsets_ != NULL && !other.offsets_->isEmpty()) {
384         UErrorCode errorCode = U_ZERO_ERROR;
385         if(offsets_ == NULL) {
386             offsets_ = new UVector32(other.offsets_->size(), errorCode);
387         }
388         if(offsets_ != NULL) {
389             offsets_->assign(*other.offsets_, errorCode);
390         }
391     }
392     return *this;
393 }
394 
395 namespace {
396 
397 class MaxExpSink : public ContractionsAndExpansions::CESink {
398 public:
MaxExpSink(UHashtable * h,UErrorCode & ec)399     MaxExpSink(UHashtable *h, UErrorCode &ec) : maxExpansions(h), errorCode(ec) {}
400     virtual ~MaxExpSink();
handleCE(int64_t)401     virtual void handleCE(int64_t /*ce*/) {}
handleExpansion(const int64_t ces[],int32_t length)402     virtual void handleExpansion(const int64_t ces[], int32_t length) {
403         if (length <= 1) {
404             // We do not need to add single CEs into the map.
405             return;
406         }
407         int32_t count = 0;  // number of CE "halves"
408         for (int32_t i = 0; i < length; ++i) {
409             count += ceNeedsTwoParts(ces[i]) ? 2 : 1;
410         }
411         // last "half" of the last CE
412         int64_t ce = ces[length - 1];
413         uint32_t p = (uint32_t)(ce >> 32);
414         uint32_t lower32 = (uint32_t)ce;
415         uint32_t lastHalf = getSecondHalf(p, lower32);
416         if (lastHalf == 0) {
417             lastHalf = getFirstHalf(p, lower32);
418             U_ASSERT(lastHalf != 0);
419         } else {
420             lastHalf |= 0xc0;  // old-style continuation CE
421         }
422         if (count > uhash_igeti(maxExpansions, (int32_t)lastHalf)) {
423             uhash_iputi(maxExpansions, (int32_t)lastHalf, count, &errorCode);
424         }
425     }
426 
427 private:
428     UHashtable *maxExpansions;
429     UErrorCode &errorCode;
430 };
431 
~MaxExpSink()432 MaxExpSink::~MaxExpSink() {}
433 
434 }  // namespace
435 
436 UHashtable *
computeMaxExpansions(const CollationData * data,UErrorCode & errorCode)437 CollationElementIterator::computeMaxExpansions(const CollationData *data, UErrorCode &errorCode) {
438     if (U_FAILURE(errorCode)) { return NULL; }
439     UHashtable *maxExpansions = uhash_open(uhash_hashLong, uhash_compareLong,
440                                            uhash_compareLong, &errorCode);
441     if (U_FAILURE(errorCode)) { return NULL; }
442     MaxExpSink sink(maxExpansions, errorCode);
443     ContractionsAndExpansions(NULL, NULL, &sink, TRUE).forData(data, errorCode);
444     if (U_FAILURE(errorCode)) {
445         uhash_close(maxExpansions);
446         return NULL;
447     }
448     return maxExpansions;
449 }
450 
451 int32_t
getMaxExpansion(int32_t order) const452 CollationElementIterator::getMaxExpansion(int32_t order) const {
453     return getMaxExpansion(rbc_->tailoring->maxExpansions, order);
454 }
455 
456 int32_t
getMaxExpansion(const UHashtable * maxExpansions,int32_t order)457 CollationElementIterator::getMaxExpansion(const UHashtable *maxExpansions, int32_t order) {
458     if (order == 0) { return 1; }
459     int32_t max;
460     if(maxExpansions != NULL && (max = uhash_igeti(maxExpansions, order)) != 0) {
461         return max;
462     }
463     if ((order & 0xc0) == 0xc0) {
464         // old-style continuation CE
465         return 2;
466     } else {
467         return 1;
468     }
469 }
470 
471 U_NAMESPACE_END
472 
473 #endif /* #if !UCONFIG_NO_COLLATION */
474