1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 *   Copyright (C) 2001-2016, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 ******************************************************************************
8 *
9 * File ucoleitr.cpp
10 *
11 * Modification History:
12 *
13 * Date        Name        Description
14 * 02/15/2001  synwee      Modified all methods to process its own function
15 *                         instead of calling the equivalent c++ api (coleitr.h)
16 * 2012-2014   markus      Rewritten in C++ again.
17 ******************************************************************************/
18 
19 #include "unicode/utypes.h"
20 
21 #if !UCONFIG_NO_COLLATION
22 
23 #include "unicode/coleitr.h"
24 #include "unicode/tblcoll.h"
25 #include "unicode/ucoleitr.h"
26 #include "unicode/ustring.h"
27 #include "unicode/sortkey.h"
28 #include "unicode/uobject.h"
29 #include "cmemory.h"
30 #include "usrchimp.h"
31 
32 U_NAMESPACE_USE
33 
34 #define BUFFER_LENGTH             100
35 
36 #define DEFAULT_BUFFER_SIZE 16
37 #define BUFFER_GROW 8
38 
39 #define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (size_t)(count) * sizeof (src)[0])
40 
41 #define NEW_ARRAY(type, count) (type *) uprv_malloc((size_t)(count) * sizeof(type))
42 
43 #define DELETE_ARRAY(array) uprv_free((void *) (array))
44 
45 struct RCEI
46 {
47     uint32_t ce;
48     int32_t  low;
49     int32_t  high;
50 };
51 
52 U_NAMESPACE_BEGIN
53 
54 struct RCEBuffer
55 {
56     RCEI    defaultBuffer[DEFAULT_BUFFER_SIZE];
57     RCEI   *buffer;
58     int32_t bufferIndex;
59     int32_t bufferSize;
60 
61     RCEBuffer();
62     ~RCEBuffer();
63 
64     UBool isEmpty() const;
65     void  put(uint32_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode);
66     const RCEI *get();
67 };
68 
RCEBuffer()69 RCEBuffer::RCEBuffer()
70 {
71     buffer = defaultBuffer;
72     bufferIndex = 0;
73     bufferSize = UPRV_LENGTHOF(defaultBuffer);
74 }
75 
~RCEBuffer()76 RCEBuffer::~RCEBuffer()
77 {
78     if (buffer != defaultBuffer) {
79         DELETE_ARRAY(buffer);
80     }
81 }
82 
isEmpty() const83 UBool RCEBuffer::isEmpty() const
84 {
85     return bufferIndex <= 0;
86 }
87 
put(uint32_t ce,int32_t ixLow,int32_t ixHigh,UErrorCode & errorCode)88 void RCEBuffer::put(uint32_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode)
89 {
90     if (U_FAILURE(errorCode)) {
91         return;
92     }
93     if (bufferIndex >= bufferSize) {
94         RCEI *newBuffer = NEW_ARRAY(RCEI, bufferSize + BUFFER_GROW);
95         if (newBuffer == NULL) {
96             errorCode = U_MEMORY_ALLOCATION_ERROR;
97             return;
98         }
99 
100         ARRAY_COPY(newBuffer, buffer, bufferSize);
101 
102         if (buffer != defaultBuffer) {
103             DELETE_ARRAY(buffer);
104         }
105 
106         buffer = newBuffer;
107         bufferSize += BUFFER_GROW;
108     }
109 
110     buffer[bufferIndex].ce   = ce;
111     buffer[bufferIndex].low  = ixLow;
112     buffer[bufferIndex].high = ixHigh;
113 
114     bufferIndex += 1;
115 }
116 
get()117 const RCEI *RCEBuffer::get()
118 {
119     if (bufferIndex > 0) {
120      return &buffer[--bufferIndex];
121     }
122 
123     return NULL;
124 }
125 
PCEBuffer()126 PCEBuffer::PCEBuffer()
127 {
128     buffer = defaultBuffer;
129     bufferIndex = 0;
130     bufferSize = UPRV_LENGTHOF(defaultBuffer);
131 }
132 
~PCEBuffer()133 PCEBuffer::~PCEBuffer()
134 {
135     if (buffer != defaultBuffer) {
136         DELETE_ARRAY(buffer);
137     }
138 }
139 
reset()140 void PCEBuffer::reset()
141 {
142     bufferIndex = 0;
143 }
144 
isEmpty() const145 UBool PCEBuffer::isEmpty() const
146 {
147     return bufferIndex <= 0;
148 }
149 
put(uint64_t ce,int32_t ixLow,int32_t ixHigh,UErrorCode & errorCode)150 void PCEBuffer::put(uint64_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode)
151 {
152     if (U_FAILURE(errorCode)) {
153         return;
154     }
155     if (bufferIndex >= bufferSize) {
156         PCEI *newBuffer = NEW_ARRAY(PCEI, bufferSize + BUFFER_GROW);
157         if (newBuffer == NULL) {
158             errorCode = U_MEMORY_ALLOCATION_ERROR;
159             return;
160         }
161 
162         ARRAY_COPY(newBuffer, buffer, bufferSize);
163 
164         if (buffer != defaultBuffer) {
165             DELETE_ARRAY(buffer);
166         }
167 
168         buffer = newBuffer;
169         bufferSize += BUFFER_GROW;
170     }
171 
172     buffer[bufferIndex].ce   = ce;
173     buffer[bufferIndex].low  = ixLow;
174     buffer[bufferIndex].high = ixHigh;
175 
176     bufferIndex += 1;
177 }
178 
get()179 const PCEI *PCEBuffer::get()
180 {
181     if (bufferIndex > 0) {
182      return &buffer[--bufferIndex];
183     }
184 
185     return NULL;
186 }
187 
UCollationPCE(UCollationElements * elems)188 UCollationPCE::UCollationPCE(UCollationElements *elems) { init(elems); }
189 
UCollationPCE(CollationElementIterator * iter)190 UCollationPCE::UCollationPCE(CollationElementIterator *iter) { init(iter); }
191 
init(UCollationElements * elems)192 void UCollationPCE::init(UCollationElements *elems) {
193     init(CollationElementIterator::fromUCollationElements(elems));
194 }
195 
init(CollationElementIterator * iter)196 void UCollationPCE::init(CollationElementIterator *iter)
197 {
198     cei = iter;
199     init(*iter->rbc_);
200 }
201 
init(const Collator & coll)202 void UCollationPCE::init(const Collator &coll)
203 {
204     UErrorCode status = U_ZERO_ERROR;
205 
206     strength    = coll.getAttribute(UCOL_STRENGTH, status);
207     toShift     = coll.getAttribute(UCOL_ALTERNATE_HANDLING, status) == UCOL_SHIFTED;
208     isShifted   = FALSE;
209     variableTop = coll.getVariableTop(status);
210 }
211 
~UCollationPCE()212 UCollationPCE::~UCollationPCE()
213 {
214     // nothing to do
215 }
216 
processCE(uint32_t ce)217 uint64_t UCollationPCE::processCE(uint32_t ce)
218 {
219     uint64_t primary = 0, secondary = 0, tertiary = 0, quaternary = 0;
220 
221     // This is clean, but somewhat slow...
222     // We could apply the mask to ce and then
223     // just get all three orders...
224     switch(strength) {
225     default:
226         tertiary = ucol_tertiaryOrder(ce);
227         U_FALLTHROUGH;
228 
229     case UCOL_SECONDARY:
230         secondary = ucol_secondaryOrder(ce);
231         U_FALLTHROUGH;
232 
233     case UCOL_PRIMARY:
234         primary = ucol_primaryOrder(ce);
235     }
236 
237     // **** This should probably handle continuations too.  ****
238     // **** That means that we need 24 bits for the primary ****
239     // **** instead of the 16 that we're currently using.   ****
240     // **** So we can lay out the 64 bits as: 24.12.12.16.  ****
241     // **** Another complication with continuations is that ****
242     // **** the *second* CE is marked as a continuation, so ****
243     // **** we always have to peek ahead to know how long   ****
244     // **** the primary is...                               ****
245     if ((toShift && variableTop > ce && primary != 0)
246                 || (isShifted && primary == 0)) {
247 
248         if (primary == 0) {
249             return UCOL_IGNORABLE;
250         }
251 
252         if (strength >= UCOL_QUATERNARY) {
253             quaternary = primary;
254         }
255 
256         primary = secondary = tertiary = 0;
257         isShifted = TRUE;
258     } else {
259         if (strength >= UCOL_QUATERNARY) {
260             quaternary = 0xFFFF;
261         }
262 
263         isShifted = FALSE;
264     }
265 
266     return primary << 48 | secondary << 32 | tertiary << 16 | quaternary;
267 }
268 
269 U_NAMESPACE_END
270 
271 /* public methods ---------------------------------------------------- */
272 
273 U_CAPI UCollationElements* U_EXPORT2
ucol_openElements(const UCollator * coll,const UChar * text,int32_t textLength,UErrorCode * status)274 ucol_openElements(const UCollator  *coll,
275                   const UChar      *text,
276                         int32_t    textLength,
277                         UErrorCode *status)
278 {
279     if (U_FAILURE(*status)) {
280         return NULL;
281     }
282     if (coll == NULL || (text == NULL && textLength != 0)) {
283         *status = U_ILLEGAL_ARGUMENT_ERROR;
284         return NULL;
285     }
286     const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll);
287     if (rbc == NULL) {
288         *status = U_UNSUPPORTED_ERROR;  // coll is a Collator but not a RuleBasedCollator
289         return NULL;
290     }
291 
292     UnicodeString s((UBool)(textLength < 0), text, textLength);
293     CollationElementIterator *cei = rbc->createCollationElementIterator(s);
294     if (cei == NULL) {
295         *status = U_MEMORY_ALLOCATION_ERROR;
296         return NULL;
297     }
298 
299     return cei->toUCollationElements();
300 }
301 
302 
303 U_CAPI void U_EXPORT2
ucol_closeElements(UCollationElements * elems)304 ucol_closeElements(UCollationElements *elems)
305 {
306     delete CollationElementIterator::fromUCollationElements(elems);
307 }
308 
309 U_CAPI void U_EXPORT2
ucol_reset(UCollationElements * elems)310 ucol_reset(UCollationElements *elems)
311 {
312     CollationElementIterator::fromUCollationElements(elems)->reset();
313 }
314 
315 U_CAPI int32_t U_EXPORT2
ucol_next(UCollationElements * elems,UErrorCode * status)316 ucol_next(UCollationElements *elems,
317           UErrorCode         *status)
318 {
319     if (U_FAILURE(*status)) {
320         return UCOL_NULLORDER;
321     }
322 
323     return CollationElementIterator::fromUCollationElements(elems)->next(*status);
324 }
325 
326 U_NAMESPACE_BEGIN
327 
328 int64_t
nextProcessed(int32_t * ixLow,int32_t * ixHigh,UErrorCode * status)329 UCollationPCE::nextProcessed(
330                    int32_t            *ixLow,
331                    int32_t            *ixHigh,
332                    UErrorCode         *status)
333 {
334     int64_t result = UCOL_IGNORABLE;
335     uint32_t low = 0, high = 0;
336 
337     if (U_FAILURE(*status)) {
338         return UCOL_PROCESSED_NULLORDER;
339     }
340 
341     pceBuffer.reset();
342 
343     do {
344         low = cei->getOffset();
345         int32_t ce = cei->next(*status);
346         high = cei->getOffset();
347 
348         if (ce == UCOL_NULLORDER) {
349              result = UCOL_PROCESSED_NULLORDER;
350              break;
351         }
352 
353         result = processCE((uint32_t)ce);
354     } while (result == UCOL_IGNORABLE);
355 
356     if (ixLow != NULL) {
357         *ixLow = low;
358     }
359 
360     if (ixHigh != NULL) {
361         *ixHigh = high;
362     }
363 
364     return result;
365 }
366 
367 U_NAMESPACE_END
368 
369 U_CAPI int32_t U_EXPORT2
ucol_previous(UCollationElements * elems,UErrorCode * status)370 ucol_previous(UCollationElements *elems,
371               UErrorCode         *status)
372 {
373     if(U_FAILURE(*status)) {
374         return UCOL_NULLORDER;
375     }
376     return CollationElementIterator::fromUCollationElements(elems)->previous(*status);
377 }
378 
379 U_NAMESPACE_BEGIN
380 
381 int64_t
previousProcessed(int32_t * ixLow,int32_t * ixHigh,UErrorCode * status)382 UCollationPCE::previousProcessed(
383                    int32_t            *ixLow,
384                    int32_t            *ixHigh,
385                    UErrorCode         *status)
386 {
387     int64_t result = UCOL_IGNORABLE;
388     int32_t  low = 0, high = 0;
389 
390     if (U_FAILURE(*status)) {
391         return UCOL_PROCESSED_NULLORDER;
392     }
393 
394     // pceBuffer.reset();
395 
396     while (pceBuffer.isEmpty()) {
397         // buffer raw CEs up to non-ignorable primary
398         RCEBuffer rceb;
399         int32_t ce;
400 
401         // **** do we need to reset rceb, or will it always be empty at this point ****
402         do {
403             high = cei->getOffset();
404             ce   = cei->previous(*status);
405             low  = cei->getOffset();
406 
407             if (ce == UCOL_NULLORDER) {
408                 if (!rceb.isEmpty()) {
409                     break;
410                 }
411 
412                 goto finish;
413             }
414 
415             rceb.put((uint32_t)ce, low, high, *status);
416         } while (U_SUCCESS(*status) && ((ce & UCOL_PRIMARYORDERMASK) == 0 || isContinuation(ce)));
417 
418         // process the raw CEs
419         while (U_SUCCESS(*status) && !rceb.isEmpty()) {
420             const RCEI *rcei = rceb.get();
421 
422             result = processCE(rcei->ce);
423 
424             if (result != UCOL_IGNORABLE) {
425                 pceBuffer.put(result, rcei->low, rcei->high, *status);
426             }
427         }
428         if (U_FAILURE(*status)) {
429             return UCOL_PROCESSED_NULLORDER;
430         }
431     }
432 
433 finish:
434     if (pceBuffer.isEmpty()) {
435         // **** Is -1 the right value for ixLow, ixHigh? ****
436     	if (ixLow != NULL) {
437     		*ixLow = -1;
438     	}
439 
440     	if (ixHigh != NULL) {
441     		*ixHigh = -1
442     		;
443     	}
444         return UCOL_PROCESSED_NULLORDER;
445     }
446 
447     const PCEI *pcei = pceBuffer.get();
448 
449     if (ixLow != NULL) {
450         *ixLow = pcei->low;
451     }
452 
453     if (ixHigh != NULL) {
454         *ixHigh = pcei->high;
455     }
456 
457     return pcei->ce;
458 }
459 
460 U_NAMESPACE_END
461 
462 U_CAPI int32_t U_EXPORT2
ucol_getMaxExpansion(const UCollationElements * elems,int32_t order)463 ucol_getMaxExpansion(const UCollationElements *elems,
464                            int32_t            order)
465 {
466     return CollationElementIterator::fromUCollationElements(elems)->getMaxExpansion(order);
467 
468     // TODO: The old code masked the order according to strength and then did a binary search.
469     // However this was probably at least partially broken because of the following comment.
470     // Still, it might have found a match when this version may not.
471 
472     // FIXME: with a masked search, there might be more than one hit,
473     // so we need to look forward and backward from the match to find all
474     // of the hits...
475 }
476 
477 U_CAPI void U_EXPORT2
ucol_setText(UCollationElements * elems,const UChar * text,int32_t textLength,UErrorCode * status)478 ucol_setText(      UCollationElements *elems,
479              const UChar              *text,
480                    int32_t            textLength,
481                    UErrorCode         *status)
482 {
483     if (U_FAILURE(*status)) {
484         return;
485     }
486 
487     if ((text == NULL && textLength != 0)) {
488         *status = U_ILLEGAL_ARGUMENT_ERROR;
489         return;
490     }
491     UnicodeString s((UBool)(textLength < 0), text, textLength);
492     return CollationElementIterator::fromUCollationElements(elems)->setText(s, *status);
493 }
494 
495 U_CAPI int32_t U_EXPORT2
ucol_getOffset(const UCollationElements * elems)496 ucol_getOffset(const UCollationElements *elems)
497 {
498     return CollationElementIterator::fromUCollationElements(elems)->getOffset();
499 }
500 
501 U_CAPI void U_EXPORT2
ucol_setOffset(UCollationElements * elems,int32_t offset,UErrorCode * status)502 ucol_setOffset(UCollationElements    *elems,
503                int32_t           offset,
504                UErrorCode            *status)
505 {
506     if (U_FAILURE(*status)) {
507         return;
508     }
509 
510     CollationElementIterator::fromUCollationElements(elems)->setOffset(offset, *status);
511 }
512 
513 U_CAPI int32_t U_EXPORT2
ucol_primaryOrder(int32_t order)514 ucol_primaryOrder (int32_t order)
515 {
516     return (order >> 16) & 0xffff;
517 }
518 
519 U_CAPI int32_t U_EXPORT2
ucol_secondaryOrder(int32_t order)520 ucol_secondaryOrder (int32_t order)
521 {
522     return (order >> 8) & 0xff;
523 }
524 
525 U_CAPI int32_t U_EXPORT2
ucol_tertiaryOrder(int32_t order)526 ucol_tertiaryOrder (int32_t order)
527 {
528     return order & 0xff;
529 }
530 
531 #endif /* #if !UCONFIG_NO_COLLATION */
532