1 /*
2 ******************************************************************************
3 *   Copyright (C) 2001-2014, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 ******************************************************************************
6 *
7 * File ucoleitr.cpp
8 *
9 * Modification History:
10 *
11 * Date        Name        Description
12 * 02/15/2001  synwee      Modified all methods to process its own function
13 *                         instead of calling the equivalent c++ api (coleitr.h)
14 * 2012-2014   markus      Rewritten in C++ again.
15 ******************************************************************************/
16 
17 #include "unicode/utypes.h"
18 
19 #if !UCONFIG_NO_COLLATION
20 
21 #include "unicode/coleitr.h"
22 #include "unicode/tblcoll.h"
23 #include "unicode/ucoleitr.h"
24 #include "unicode/ustring.h"
25 #include "unicode/sortkey.h"
26 #include "unicode/uobject.h"
27 #include "cmemory.h"
28 #include "usrchimp.h"
29 
30 U_NAMESPACE_USE
31 
32 #define BUFFER_LENGTH             100
33 
34 #define DEFAULT_BUFFER_SIZE 16
35 #define BUFFER_GROW 8
36 
37 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
38 
39 #define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0])
40 
41 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
42 
43 #define GROW_ARRAY(array, newSize) uprv_realloc((void *) (array), (newSize) * sizeof (array)[0])
44 
45 #define DELETE_ARRAY(array) uprv_free((void *) (array))
46 
47 struct RCEI
48 {
49     uint32_t ce;
50     int32_t  low;
51     int32_t  high;
52 };
53 
54 U_NAMESPACE_BEGIN
55 
56 struct RCEBuffer
57 {
58     RCEI    defaultBuffer[DEFAULT_BUFFER_SIZE];
59     RCEI   *buffer;
60     int32_t bufferIndex;
61     int32_t bufferSize;
62 
63     RCEBuffer();
64     ~RCEBuffer();
65 
66     UBool empty() const;
67     void  put(uint32_t ce, int32_t ixLow, int32_t ixHigh);
68     const RCEI *get();
69 };
70 
RCEBuffer()71 RCEBuffer::RCEBuffer()
72 {
73     buffer = defaultBuffer;
74     bufferIndex = 0;
75     bufferSize = UPRV_LENGTHOF(defaultBuffer);
76 }
77 
~RCEBuffer()78 RCEBuffer::~RCEBuffer()
79 {
80     if (buffer != defaultBuffer) {
81         DELETE_ARRAY(buffer);
82     }
83 }
84 
empty() const85 UBool RCEBuffer::empty() const
86 {
87     return bufferIndex <= 0;
88 }
89 
put(uint32_t ce,int32_t ixLow,int32_t ixHigh)90 void RCEBuffer::put(uint32_t ce, int32_t ixLow, int32_t ixHigh)
91 {
92     if (bufferIndex >= bufferSize) {
93         RCEI *newBuffer = NEW_ARRAY(RCEI, bufferSize + BUFFER_GROW);
94 
95         ARRAY_COPY(newBuffer, buffer, bufferSize);
96 
97         if (buffer != defaultBuffer) {
98             DELETE_ARRAY(buffer);
99         }
100 
101         buffer = newBuffer;
102         bufferSize += BUFFER_GROW;
103     }
104 
105     buffer[bufferIndex].ce   = ce;
106     buffer[bufferIndex].low  = ixLow;
107     buffer[bufferIndex].high = ixHigh;
108 
109     bufferIndex += 1;
110 }
111 
get()112 const RCEI *RCEBuffer::get()
113 {
114     if (bufferIndex > 0) {
115      return &buffer[--bufferIndex];
116     }
117 
118     return NULL;
119 }
120 
PCEBuffer()121 PCEBuffer::PCEBuffer()
122 {
123     buffer = defaultBuffer;
124     bufferIndex = 0;
125     bufferSize = UPRV_LENGTHOF(defaultBuffer);
126 }
127 
~PCEBuffer()128 PCEBuffer::~PCEBuffer()
129 {
130     if (buffer != defaultBuffer) {
131         DELETE_ARRAY(buffer);
132     }
133 }
134 
reset()135 void PCEBuffer::reset()
136 {
137     bufferIndex = 0;
138 }
139 
empty() const140 UBool PCEBuffer::empty() const
141 {
142     return bufferIndex <= 0;
143 }
144 
put(uint64_t ce,int32_t ixLow,int32_t ixHigh)145 void PCEBuffer::put(uint64_t ce, int32_t ixLow, int32_t ixHigh)
146 {
147     if (bufferIndex >= bufferSize) {
148         PCEI *newBuffer = NEW_ARRAY(PCEI, bufferSize + BUFFER_GROW);
149 
150         ARRAY_COPY(newBuffer, buffer, bufferSize);
151 
152         if (buffer != defaultBuffer) {
153             DELETE_ARRAY(buffer);
154         }
155 
156         buffer = newBuffer;
157         bufferSize += BUFFER_GROW;
158     }
159 
160     buffer[bufferIndex].ce   = ce;
161     buffer[bufferIndex].low  = ixLow;
162     buffer[bufferIndex].high = ixHigh;
163 
164     bufferIndex += 1;
165 }
166 
get()167 const PCEI *PCEBuffer::get()
168 {
169     if (bufferIndex > 0) {
170      return &buffer[--bufferIndex];
171     }
172 
173     return NULL;
174 }
175 
UCollationPCE(UCollationElements * elems)176 UCollationPCE::UCollationPCE(UCollationElements *elems) { init(elems); }
177 
UCollationPCE(CollationElementIterator * iter)178 UCollationPCE::UCollationPCE(CollationElementIterator *iter) { init(iter); }
179 
init(UCollationElements * elems)180 void UCollationPCE::init(UCollationElements *elems) {
181     init(CollationElementIterator::fromUCollationElements(elems));
182 }
183 
init(CollationElementIterator * iter)184 void UCollationPCE::init(CollationElementIterator *iter)
185 {
186     cei = iter;
187     init(*iter->rbc_);
188 }
189 
init(const Collator & coll)190 void UCollationPCE::init(const Collator &coll)
191 {
192     UErrorCode status = U_ZERO_ERROR;
193 
194     strength    = coll.getAttribute(UCOL_STRENGTH, status);
195     toShift     = coll.getAttribute(UCOL_ALTERNATE_HANDLING, status) == UCOL_SHIFTED;
196     isShifted   = FALSE;
197     variableTop = coll.getVariableTop(status);
198 }
199 
~UCollationPCE()200 UCollationPCE::~UCollationPCE()
201 {
202     // nothing to do
203 }
204 
processCE(uint32_t ce)205 uint64_t UCollationPCE::processCE(uint32_t ce)
206 {
207     uint64_t primary = 0, secondary = 0, tertiary = 0, quaternary = 0;
208 
209     // This is clean, but somewhat slow...
210     // We could apply the mask to ce and then
211     // just get all three orders...
212     switch(strength) {
213     default:
214         tertiary = ucol_tertiaryOrder(ce);
215         /* note fall-through */
216 
217     case UCOL_SECONDARY:
218         secondary = ucol_secondaryOrder(ce);
219         /* note fall-through */
220 
221     case UCOL_PRIMARY:
222         primary = ucol_primaryOrder(ce);
223     }
224 
225     // **** This should probably handle continuations too.  ****
226     // **** That means that we need 24 bits for the primary ****
227     // **** instead of the 16 that we're currently using.   ****
228     // **** So we can lay out the 64 bits as: 24.12.12.16.  ****
229     // **** Another complication with continuations is that ****
230     // **** the *second* CE is marked as a continuation, so ****
231     // **** we always have to peek ahead to know how long   ****
232     // **** the primary is...                               ****
233     if ((toShift && variableTop > ce && primary != 0)
234                 || (isShifted && primary == 0)) {
235 
236         if (primary == 0) {
237             return UCOL_IGNORABLE;
238         }
239 
240         if (strength >= UCOL_QUATERNARY) {
241             quaternary = primary;
242         }
243 
244         primary = secondary = tertiary = 0;
245         isShifted = TRUE;
246     } else {
247         if (strength >= UCOL_QUATERNARY) {
248             quaternary = 0xFFFF;
249         }
250 
251         isShifted = FALSE;
252     }
253 
254     return primary << 48 | secondary << 32 | tertiary << 16 | quaternary;
255 }
256 
257 U_NAMESPACE_END
258 
259 /* public methods ---------------------------------------------------- */
260 
261 U_CAPI UCollationElements* U_EXPORT2
ucol_openElements(const UCollator * coll,const UChar * text,int32_t textLength,UErrorCode * status)262 ucol_openElements(const UCollator  *coll,
263                   const UChar      *text,
264                         int32_t    textLength,
265                         UErrorCode *status)
266 {
267     if (U_FAILURE(*status)) {
268         return NULL;
269     }
270     if (coll == NULL || (text == NULL && textLength != 0)) {
271         *status = U_ILLEGAL_ARGUMENT_ERROR;
272         return NULL;
273     }
274     const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll);
275     if (rbc == NULL) {
276         *status = U_UNSUPPORTED_ERROR;  // coll is a Collator but not a RuleBasedCollator
277         return NULL;
278     }
279 
280     UnicodeString s((UBool)(textLength < 0), text, textLength);
281     CollationElementIterator *cei = rbc->createCollationElementIterator(s);
282     if (cei == NULL) {
283         *status = U_MEMORY_ALLOCATION_ERROR;
284         return NULL;
285     }
286 
287     return cei->toUCollationElements();
288 }
289 
290 
291 U_CAPI void U_EXPORT2
ucol_closeElements(UCollationElements * elems)292 ucol_closeElements(UCollationElements *elems)
293 {
294     delete CollationElementIterator::fromUCollationElements(elems);
295 }
296 
297 U_CAPI void U_EXPORT2
ucol_reset(UCollationElements * elems)298 ucol_reset(UCollationElements *elems)
299 {
300     CollationElementIterator::fromUCollationElements(elems)->reset();
301 }
302 
303 U_CAPI int32_t U_EXPORT2
ucol_next(UCollationElements * elems,UErrorCode * status)304 ucol_next(UCollationElements *elems,
305           UErrorCode         *status)
306 {
307     if (U_FAILURE(*status)) {
308         return UCOL_NULLORDER;
309     }
310 
311     return CollationElementIterator::fromUCollationElements(elems)->next(*status);
312 }
313 
314 U_NAMESPACE_BEGIN
315 
316 int64_t
nextProcessed(int32_t * ixLow,int32_t * ixHigh,UErrorCode * status)317 UCollationPCE::nextProcessed(
318                    int32_t            *ixLow,
319                    int32_t            *ixHigh,
320                    UErrorCode         *status)
321 {
322     int64_t result = UCOL_IGNORABLE;
323     uint32_t low = 0, high = 0;
324 
325     if (U_FAILURE(*status)) {
326         return UCOL_PROCESSED_NULLORDER;
327     }
328 
329     pceBuffer.reset();
330 
331     do {
332         low = cei->getOffset();
333         int32_t ce = cei->next(*status);
334         high = cei->getOffset();
335 
336         if (ce == UCOL_NULLORDER) {
337              result = UCOL_PROCESSED_NULLORDER;
338              break;
339         }
340 
341         result = processCE((uint32_t)ce);
342     } while (result == UCOL_IGNORABLE);
343 
344     if (ixLow != NULL) {
345         *ixLow = low;
346     }
347 
348     if (ixHigh != NULL) {
349         *ixHigh = high;
350     }
351 
352     return result;
353 }
354 
355 U_NAMESPACE_END
356 
357 U_CAPI int32_t U_EXPORT2
ucol_previous(UCollationElements * elems,UErrorCode * status)358 ucol_previous(UCollationElements *elems,
359               UErrorCode         *status)
360 {
361     if(U_FAILURE(*status)) {
362         return UCOL_NULLORDER;
363     }
364     return CollationElementIterator::fromUCollationElements(elems)->previous(*status);
365 }
366 
367 U_NAMESPACE_BEGIN
368 
369 int64_t
previousProcessed(int32_t * ixLow,int32_t * ixHigh,UErrorCode * status)370 UCollationPCE::previousProcessed(
371                    int32_t            *ixLow,
372                    int32_t            *ixHigh,
373                    UErrorCode         *status)
374 {
375     int64_t result = UCOL_IGNORABLE;
376     int32_t  low = 0, high = 0;
377 
378     if (U_FAILURE(*status)) {
379         return UCOL_PROCESSED_NULLORDER;
380     }
381 
382     // pceBuffer.reset();
383 
384     while (pceBuffer.empty()) {
385         // buffer raw CEs up to non-ignorable primary
386         RCEBuffer rceb;
387         int32_t ce;
388 
389         // **** do we need to reset rceb, or will it always be empty at this point ****
390         do {
391             high = cei->getOffset();
392             ce   = cei->previous(*status);
393             low  = cei->getOffset();
394 
395             if (ce == UCOL_NULLORDER) {
396                 if (! rceb.empty()) {
397                     break;
398                 }
399 
400                 goto finish;
401             }
402 
403             rceb.put((uint32_t)ce, low, high);
404         } while ((ce & UCOL_PRIMARYORDERMASK) == 0 || isContinuation(ce));
405 
406         // process the raw CEs
407         while (! rceb.empty()) {
408             const RCEI *rcei = rceb.get();
409 
410             result = processCE(rcei->ce);
411 
412             if (result != UCOL_IGNORABLE) {
413                 pceBuffer.put(result, rcei->low, rcei->high);
414             }
415         }
416     }
417 
418 finish:
419     if (pceBuffer.empty()) {
420         // **** Is -1 the right value for ixLow, ixHigh? ****
421     	if (ixLow != NULL) {
422     		*ixLow = -1;
423     	}
424 
425     	if (ixHigh != NULL) {
426     		*ixHigh = -1
427     		;
428     	}
429         return UCOL_PROCESSED_NULLORDER;
430     }
431 
432     const PCEI *pcei = pceBuffer.get();
433 
434     if (ixLow != NULL) {
435         *ixLow = pcei->low;
436     }
437 
438     if (ixHigh != NULL) {
439         *ixHigh = pcei->high;
440     }
441 
442     return pcei->ce;
443 }
444 
445 U_NAMESPACE_END
446 
447 U_CAPI int32_t U_EXPORT2
ucol_getMaxExpansion(const UCollationElements * elems,int32_t order)448 ucol_getMaxExpansion(const UCollationElements *elems,
449                            int32_t            order)
450 {
451     return CollationElementIterator::fromUCollationElements(elems)->getMaxExpansion(order);
452 
453     // TODO: The old code masked the order according to strength and then did a binary search.
454     // However this was probably at least partially broken because of the following comment.
455     // Still, it might have found a match when this version may not.
456 
457     // FIXME: with a masked search, there might be more than one hit,
458     // so we need to look forward and backward from the match to find all
459     // of the hits...
460 }
461 
462 U_CAPI void U_EXPORT2
ucol_setText(UCollationElements * elems,const UChar * text,int32_t textLength,UErrorCode * status)463 ucol_setText(      UCollationElements *elems,
464              const UChar              *text,
465                    int32_t            textLength,
466                    UErrorCode         *status)
467 {
468     if (U_FAILURE(*status)) {
469         return;
470     }
471 
472     if ((text == NULL && textLength != 0)) {
473         *status = U_ILLEGAL_ARGUMENT_ERROR;
474         return;
475     }
476     UnicodeString s((UBool)(textLength < 0), text, textLength);
477     return CollationElementIterator::fromUCollationElements(elems)->setText(s, *status);
478 }
479 
480 U_CAPI int32_t U_EXPORT2
ucol_getOffset(const UCollationElements * elems)481 ucol_getOffset(const UCollationElements *elems)
482 {
483     return CollationElementIterator::fromUCollationElements(elems)->getOffset();
484 }
485 
486 U_CAPI void U_EXPORT2
ucol_setOffset(UCollationElements * elems,int32_t offset,UErrorCode * status)487 ucol_setOffset(UCollationElements    *elems,
488                int32_t           offset,
489                UErrorCode            *status)
490 {
491     if (U_FAILURE(*status)) {
492         return;
493     }
494 
495     CollationElementIterator::fromUCollationElements(elems)->setOffset(offset, *status);
496 }
497 
498 U_CAPI int32_t U_EXPORT2
ucol_primaryOrder(int32_t order)499 ucol_primaryOrder (int32_t order)
500 {
501     return (order >> 16) & 0xffff;
502 }
503 
504 U_CAPI int32_t U_EXPORT2
ucol_secondaryOrder(int32_t order)505 ucol_secondaryOrder (int32_t order)
506 {
507     return (order >> 8) & 0xff;
508 }
509 
510 U_CAPI int32_t U_EXPORT2
ucol_tertiaryOrder(int32_t order)511 ucol_tertiaryOrder (int32_t order)
512 {
513     return order & 0xff;
514 }
515 
516 #endif /* #if !UCONFIG_NO_COLLATION */
517