1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2012-2015, International Business Machines
6 * Corporation and others.  All Rights Reserved.
7 *******************************************************************************
8 * collationdata.cpp
9 *
10 * created on: 2012jul28
11 * created by: Markus W. Scherer
12 */
13 
14 #include "unicode/utypes.h"
15 
16 #if !UCONFIG_NO_COLLATION
17 
18 #include "unicode/ucol.h"
19 #include "unicode/udata.h"
20 #include "unicode/uscript.h"
21 #include "cmemory.h"
22 #include "collation.h"
23 #include "collationdata.h"
24 #include "uassert.h"
25 #include "utrie2.h"
26 #include "uvectr32.h"
27 
28 U_NAMESPACE_BEGIN
29 
30 uint32_t
getIndirectCE32(uint32_t ce32) const31 CollationData::getIndirectCE32(uint32_t ce32) const {
32     U_ASSERT(Collation::isSpecialCE32(ce32));
33     int32_t tag = Collation::tagFromCE32(ce32);
34     if(tag == Collation::DIGIT_TAG) {
35         // Fetch the non-numeric-collation CE32.
36         ce32 = ce32s[Collation::indexFromCE32(ce32)];
37     } else if(tag == Collation::LEAD_SURROGATE_TAG) {
38         ce32 = Collation::UNASSIGNED_CE32;
39     } else if(tag == Collation::U0000_TAG) {
40         // Fetch the normal ce32 for U+0000.
41         ce32 = ce32s[0];
42     }
43     return ce32;
44 }
45 
46 uint32_t
getFinalCE32(uint32_t ce32) const47 CollationData::getFinalCE32(uint32_t ce32) const {
48     if(Collation::isSpecialCE32(ce32)) {
49         ce32 = getIndirectCE32(ce32);
50     }
51     return ce32;
52 }
53 
54 int64_t
getSingleCE(UChar32 c,UErrorCode & errorCode) const55 CollationData::getSingleCE(UChar32 c, UErrorCode &errorCode) const {
56     if(U_FAILURE(errorCode)) { return 0; }
57     // Keep parallel with CollationDataBuilder::getSingleCE().
58     const CollationData *d;
59     uint32_t ce32 = getCE32(c);
60     if(ce32 == Collation::FALLBACK_CE32) {
61         d = base;
62         ce32 = base->getCE32(c);
63     } else {
64         d = this;
65     }
66     while(Collation::isSpecialCE32(ce32)) {
67         switch(Collation::tagFromCE32(ce32)) {
68         case Collation::LATIN_EXPANSION_TAG:
69         case Collation::BUILDER_DATA_TAG:
70         case Collation::PREFIX_TAG:
71         case Collation::CONTRACTION_TAG:
72         case Collation::HANGUL_TAG:
73         case Collation::LEAD_SURROGATE_TAG:
74             errorCode = U_UNSUPPORTED_ERROR;
75             return 0;
76         case Collation::FALLBACK_TAG:
77         case Collation::RESERVED_TAG_3:
78             errorCode = U_INTERNAL_PROGRAM_ERROR;
79             return 0;
80         case Collation::LONG_PRIMARY_TAG:
81             return Collation::ceFromLongPrimaryCE32(ce32);
82         case Collation::LONG_SECONDARY_TAG:
83             return Collation::ceFromLongSecondaryCE32(ce32);
84         case Collation::EXPANSION32_TAG:
85             if(Collation::lengthFromCE32(ce32) == 1) {
86                 ce32 = d->ce32s[Collation::indexFromCE32(ce32)];
87                 break;
88             } else {
89                 errorCode = U_UNSUPPORTED_ERROR;
90                 return 0;
91             }
92         case Collation::EXPANSION_TAG: {
93             if(Collation::lengthFromCE32(ce32) == 1) {
94                 return d->ces[Collation::indexFromCE32(ce32)];
95             } else {
96                 errorCode = U_UNSUPPORTED_ERROR;
97                 return 0;
98             }
99         }
100         case Collation::DIGIT_TAG:
101             // Fetch the non-numeric-collation CE32 and continue.
102             ce32 = d->ce32s[Collation::indexFromCE32(ce32)];
103             break;
104         case Collation::U0000_TAG:
105             U_ASSERT(c == 0);
106             // Fetch the normal ce32 for U+0000 and continue.
107             ce32 = d->ce32s[0];
108             break;
109         case Collation::OFFSET_TAG:
110             return d->getCEFromOffsetCE32(c, ce32);
111         case Collation::IMPLICIT_TAG:
112             return Collation::unassignedCEFromCodePoint(c);
113         }
114     }
115     return Collation::ceFromSimpleCE32(ce32);
116 }
117 
118 uint32_t
getFirstPrimaryForGroup(int32_t script) const119 CollationData::getFirstPrimaryForGroup(int32_t script) const {
120     int32_t index = getScriptIndex(script);
121     return index == 0 ? 0 : (uint32_t)scriptStarts[index] << 16;
122 }
123 
124 uint32_t
getLastPrimaryForGroup(int32_t script) const125 CollationData::getLastPrimaryForGroup(int32_t script) const {
126     int32_t index = getScriptIndex(script);
127     if(index == 0) {
128         return 0;
129     }
130     uint32_t limit = scriptStarts[index + 1];
131     return (limit << 16) - 1;
132 }
133 
134 int32_t
getGroupForPrimary(uint32_t p) const135 CollationData::getGroupForPrimary(uint32_t p) const {
136     p >>= 16;
137     if(p < scriptStarts[1] || scriptStarts[scriptStartsLength - 1] <= p) {
138         return -1;
139     }
140     int32_t index = 1;
141     while(p >= scriptStarts[index + 1]) { ++index; }
142     for(int32_t i = 0; i < numScripts; ++i) {
143         if(scriptsIndex[i] == index) {
144             return i;
145         }
146     }
147     for(int32_t i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
148         if(scriptsIndex[numScripts + i] == index) {
149             return UCOL_REORDER_CODE_FIRST + i;
150         }
151     }
152     return -1;
153 }
154 
155 int32_t
getScriptIndex(int32_t script) const156 CollationData::getScriptIndex(int32_t script) const {
157     if(script < 0) {
158         return 0;
159     } else if(script < numScripts) {
160         return scriptsIndex[script];
161     } else if(script < UCOL_REORDER_CODE_FIRST) {
162         return 0;
163     } else {
164         script -= UCOL_REORDER_CODE_FIRST;
165         if(script < MAX_NUM_SPECIAL_REORDER_CODES) {
166             return scriptsIndex[numScripts + script];
167         } else {
168             return 0;
169         }
170     }
171 }
172 
173 int32_t
getEquivalentScripts(int32_t script,int32_t dest[],int32_t capacity,UErrorCode & errorCode) const174 CollationData::getEquivalentScripts(int32_t script,
175                                     int32_t dest[], int32_t capacity,
176                                     UErrorCode &errorCode) const {
177     if(U_FAILURE(errorCode)) { return 0; }
178     int32_t index = getScriptIndex(script);
179     if(index == 0) { return 0; }
180     if(script >= UCOL_REORDER_CODE_FIRST) {
181         // Special groups have no aliases.
182         if(capacity > 0) {
183             dest[0] = script;
184         } else {
185             errorCode = U_BUFFER_OVERFLOW_ERROR;
186         }
187         return 1;
188     }
189 
190     int32_t length = 0;
191     for(int32_t i = 0; i < numScripts; ++i) {
192         if(scriptsIndex[i] == index) {
193             if(length < capacity) {
194                 dest[length] = i;
195             }
196             ++length;
197         }
198     }
199     if(length > capacity) {
200         errorCode = U_BUFFER_OVERFLOW_ERROR;
201     }
202     return length;
203 }
204 
205 void
makeReorderRanges(const int32_t * reorder,int32_t length,UVector32 & ranges,UErrorCode & errorCode) const206 CollationData::makeReorderRanges(const int32_t *reorder, int32_t length,
207                                  UVector32 &ranges, UErrorCode &errorCode) const {
208     makeReorderRanges(reorder, length, FALSE, ranges, errorCode);
209 }
210 
211 void
makeReorderRanges(const int32_t * reorder,int32_t length,UBool latinMustMove,UVector32 & ranges,UErrorCode & errorCode) const212 CollationData::makeReorderRanges(const int32_t *reorder, int32_t length,
213                                  UBool latinMustMove,
214                                  UVector32 &ranges, UErrorCode &errorCode) const {
215     if(U_FAILURE(errorCode)) { return; }
216     ranges.removeAllElements();
217     if(length == 0 || (length == 1 && reorder[0] == USCRIPT_UNKNOWN)) {
218         return;
219     }
220 
221     // Maps each script-or-group range to a new lead byte.
222     uint8_t table[MAX_NUM_SCRIPT_RANGES];
223     uprv_memset(table, 0, sizeof(table));
224 
225     {
226         // Set "don't care" values for reserved ranges.
227         int32_t index = scriptsIndex[
228                 numScripts + REORDER_RESERVED_BEFORE_LATIN - UCOL_REORDER_CODE_FIRST];
229         if(index != 0) {
230             table[index] = 0xff;
231         }
232         index = scriptsIndex[
233                 numScripts + REORDER_RESERVED_AFTER_LATIN - UCOL_REORDER_CODE_FIRST];
234         if(index != 0) {
235             table[index] = 0xff;
236         }
237     }
238 
239     // Never reorder special low and high primary lead bytes.
240     U_ASSERT(scriptStartsLength >= 2);
241     U_ASSERT(scriptStarts[0] == 0);
242     int32_t lowStart = scriptStarts[1];
243     U_ASSERT(lowStart == ((Collation::MERGE_SEPARATOR_BYTE + 1) << 8));
244     int32_t highLimit = scriptStarts[scriptStartsLength - 1];
245     U_ASSERT(highLimit == (Collation::TRAIL_WEIGHT_BYTE << 8));
246 
247     // Get the set of special reorder codes in the input list.
248     // This supports a fixed number of special reorder codes;
249     // it works for data with codes beyond UCOL_REORDER_CODE_LIMIT.
250     uint32_t specials = 0;
251     for(int32_t i = 0; i < length; ++i) {
252         int32_t reorderCode = reorder[i] - UCOL_REORDER_CODE_FIRST;
253         if(0 <= reorderCode && reorderCode < MAX_NUM_SPECIAL_REORDER_CODES) {
254             specials |= (uint32_t)1 << reorderCode;
255         }
256     }
257 
258     // Start the reordering with the special low reorder codes that do not occur in the input.
259     for(int32_t i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
260         int32_t index = scriptsIndex[numScripts + i];
261         if(index != 0 && (specials & ((uint32_t)1 << i)) == 0) {
262             lowStart = addLowScriptRange(table, index, lowStart);
263         }
264     }
265 
266     // Skip the reserved range before Latin if Latin is the first script,
267     // so that we do not move it unnecessarily.
268     int32_t skippedReserved = 0;
269     if(specials == 0 && reorder[0] == USCRIPT_LATIN && !latinMustMove) {
270         int32_t index = scriptsIndex[USCRIPT_LATIN];
271         U_ASSERT(index != 0);
272         int32_t start = scriptStarts[index];
273         U_ASSERT(lowStart <= start);
274         skippedReserved = start - lowStart;
275         lowStart = start;
276     }
277 
278     // Reorder according to the input scripts, continuing from the bottom of the primary range.
279     int32_t originalLength = length;  // length will be decremented if "others" is in the list.
280     UBool hasReorderToEnd = FALSE;
281     for(int32_t i = 0; i < length;) {
282         int32_t script = reorder[i++];
283         if(script == USCRIPT_UNKNOWN) {
284             // Put the remaining scripts at the top.
285             hasReorderToEnd = TRUE;
286             while(i < length) {
287                 script = reorder[--length];
288                 if(script == USCRIPT_UNKNOWN ||  // Must occur at most once.
289                         script == UCOL_REORDER_CODE_DEFAULT) {
290                     errorCode = U_ILLEGAL_ARGUMENT_ERROR;
291                     return;
292                 }
293                 int32_t index = getScriptIndex(script);
294                 if(index == 0) { continue; }
295                 if(table[index] != 0) {  // Duplicate or equivalent script.
296                     errorCode = U_ILLEGAL_ARGUMENT_ERROR;
297                     return;
298                 }
299                 highLimit = addHighScriptRange(table, index, highLimit);
300             }
301             break;
302         }
303         if(script == UCOL_REORDER_CODE_DEFAULT) {
304             // The default code must be the only one in the list, and that is handled by the caller.
305             // Otherwise it must not be used.
306             errorCode = U_ILLEGAL_ARGUMENT_ERROR;
307             return;
308         }
309         int32_t index = getScriptIndex(script);
310         if(index == 0) { continue; }
311         if(table[index] != 0) {  // Duplicate or equivalent script.
312             errorCode = U_ILLEGAL_ARGUMENT_ERROR;
313             return;
314         }
315         lowStart = addLowScriptRange(table, index, lowStart);
316     }
317 
318     // Put all remaining scripts into the middle.
319     for(int32_t i = 1; i < scriptStartsLength - 1; ++i) {
320         int32_t leadByte = table[i];
321         if(leadByte != 0) { continue; }
322         int32_t start = scriptStarts[i];
323         if(!hasReorderToEnd && start > lowStart) {
324             // No need to move this script.
325             lowStart = start;
326         }
327         lowStart = addLowScriptRange(table, i, lowStart);
328     }
329     if(lowStart > highLimit) {
330         if((lowStart - (skippedReserved & 0xff00)) <= highLimit) {
331             // Try not skipping the before-Latin reserved range.
332             makeReorderRanges(reorder, originalLength, TRUE, ranges, errorCode);
333             return;
334         }
335         // We need more primary lead bytes than available, despite the reserved ranges.
336         errorCode = U_BUFFER_OVERFLOW_ERROR;
337         return;
338     }
339 
340     // Turn lead bytes into a list of (limit, offset) pairs.
341     // Encode each pair in one list element:
342     // Upper 16 bits = limit, lower 16 = signed lead byte offset.
343     int32_t offset = 0;
344     for(int32_t i = 1;; ++i) {
345         int32_t nextOffset = offset;
346         while(i < scriptStartsLength - 1) {
347             int32_t newLeadByte = table[i];
348             if(newLeadByte == 0xff) {
349                 // "Don't care" lead byte for reserved range, continue with current offset.
350             } else {
351                 nextOffset = newLeadByte - (scriptStarts[i] >> 8);
352                 if(nextOffset != offset) { break; }
353             }
354             ++i;
355         }
356         if(offset != 0 || i < scriptStartsLength - 1) {
357             ranges.addElement(((int32_t)scriptStarts[i] << 16) | (offset & 0xffff), errorCode);
358         }
359         if(i == scriptStartsLength - 1) { break; }
360         offset = nextOffset;
361     }
362 }
363 
364 int32_t
addLowScriptRange(uint8_t table[],int32_t index,int32_t lowStart) const365 CollationData::addLowScriptRange(uint8_t table[], int32_t index, int32_t lowStart) const {
366     int32_t start = scriptStarts[index];
367     if((start & 0xff) < (lowStart & 0xff)) {
368         lowStart += 0x100;
369     }
370     table[index] = (uint8_t)(lowStart >> 8);
371     int32_t limit = scriptStarts[index + 1];
372     lowStart = ((lowStart & 0xff00) + ((limit & 0xff00) - (start & 0xff00))) | (limit & 0xff);
373     return lowStart;
374 }
375 
376 int32_t
addHighScriptRange(uint8_t table[],int32_t index,int32_t highLimit) const377 CollationData::addHighScriptRange(uint8_t table[], int32_t index, int32_t highLimit) const {
378     int32_t limit = scriptStarts[index + 1];
379     if((limit & 0xff) > (highLimit & 0xff)) {
380         highLimit -= 0x100;
381     }
382     int32_t start = scriptStarts[index];
383     highLimit = ((highLimit & 0xff00) - ((limit & 0xff00) - (start & 0xff00))) | (start & 0xff);
384     table[index] = (uint8_t)(highLimit >> 8);
385     return highLimit;
386 }
387 
388 U_NAMESPACE_END
389 
390 #endif  // !UCONFIG_NO_COLLATION
391