1 // © 2017 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 
4 // umutablecptrie.cpp (inspired by utrie2_builder.cpp)
5 // created: 2017dec29 Markus W. Scherer
6 
7 // #define UCPTRIE_DEBUG
8 #ifdef UCPTRIE_DEBUG
9 #   include <stdio.h>
10 #endif
11 
12 #include "unicode/utypes.h"
13 #include "unicode/ucptrie.h"
14 #include "unicode/umutablecptrie.h"
15 #include "unicode/uobject.h"
16 #include "unicode/utf16.h"
17 #include "cmemory.h"
18 #include "uassert.h"
19 #include "ucptrie_impl.h"
20 
21 // ICU-20235 In case Microsoft math.h has defined this, undefine it.
22 #ifdef OVERFLOW
23 #undef OVERFLOW
24 #endif
25 
26 U_NAMESPACE_BEGIN
27 
28 namespace {
29 
30 constexpr int32_t MAX_UNICODE = 0x10ffff;
31 
32 constexpr int32_t UNICODE_LIMIT = 0x110000;
33 constexpr int32_t BMP_LIMIT = 0x10000;
34 constexpr int32_t ASCII_LIMIT = 0x80;
35 
36 constexpr int32_t I_LIMIT = UNICODE_LIMIT >> UCPTRIE_SHIFT_3;
37 constexpr int32_t BMP_I_LIMIT = BMP_LIMIT >> UCPTRIE_SHIFT_3;
38 constexpr int32_t ASCII_I_LIMIT = ASCII_LIMIT >> UCPTRIE_SHIFT_3;
39 
40 constexpr int32_t SMALL_DATA_BLOCKS_PER_BMP_BLOCK = (1 << (UCPTRIE_FAST_SHIFT - UCPTRIE_SHIFT_3));
41 
42 // Flag values for data blocks.
43 constexpr uint8_t ALL_SAME = 0;
44 constexpr uint8_t MIXED = 1;
45 constexpr uint8_t SAME_AS = 2;
46 
47 /** Start with allocation of 16k data entries. */
48 constexpr int32_t INITIAL_DATA_LENGTH = ((int32_t)1 << 14);
49 
50 /** Grow about 8x each time. */
51 constexpr int32_t MEDIUM_DATA_LENGTH = ((int32_t)1 << 17);
52 
53 /**
54  * Maximum length of the build-time data array.
55  * One entry per 0x110000 code points.
56  */
57 constexpr int32_t MAX_DATA_LENGTH = UNICODE_LIMIT;
58 
59 // Flag values for index-3 blocks while compacting/building.
60 constexpr uint8_t I3_NULL = 0;
61 constexpr uint8_t I3_BMP = 1;
62 constexpr uint8_t I3_16 = 2;
63 constexpr uint8_t I3_18 = 3;
64 
65 constexpr int32_t INDEX_3_18BIT_BLOCK_LENGTH = UCPTRIE_INDEX_3_BLOCK_LENGTH + UCPTRIE_INDEX_3_BLOCK_LENGTH / 8;
66 
67 class AllSameBlocks;
68 class MixedBlocks;
69 
70 class MutableCodePointTrie : public UMemory {
71 public:
72     MutableCodePointTrie(uint32_t initialValue, uint32_t errorValue, UErrorCode &errorCode);
73     MutableCodePointTrie(const MutableCodePointTrie &other, UErrorCode &errorCode);
74     MutableCodePointTrie(const MutableCodePointTrie &other) = delete;
75     ~MutableCodePointTrie();
76 
77     MutableCodePointTrie &operator=(const MutableCodePointTrie &other) = delete;
78 
79     static MutableCodePointTrie *fromUCPMap(const UCPMap *map, UErrorCode &errorCode);
80     static MutableCodePointTrie *fromUCPTrie(const UCPTrie *trie, UErrorCode &errorCode);
81 
82     uint32_t get(UChar32 c) const;
83     int32_t getRange(UChar32 start, UCPMapValueFilter *filter, const void *context,
84                      uint32_t *pValue) const;
85 
86     void set(UChar32 c, uint32_t value, UErrorCode &errorCode);
87     void setRange(UChar32 start, UChar32 end, uint32_t value, UErrorCode &errorCode);
88 
89     UCPTrie *build(UCPTrieType type, UCPTrieValueWidth valueWidth, UErrorCode &errorCode);
90 
91 private:
92     void clear();
93 
94     bool ensureHighStart(UChar32 c);
95     int32_t allocDataBlock(int32_t blockLength);
96     int32_t getDataBlock(int32_t i);
97 
98     void maskValues(uint32_t mask);
99     UChar32 findHighStart() const;
100     int32_t compactWholeDataBlocks(int32_t fastILimit, AllSameBlocks &allSameBlocks);
101     int32_t compactData(
102             int32_t fastILimit, uint32_t *newData, int32_t newDataCapacity,
103             int32_t dataNullIndex, MixedBlocks &mixedBlocks, UErrorCode &errorCode);
104     int32_t compactIndex(int32_t fastILimit, MixedBlocks &mixedBlocks, UErrorCode &errorCode);
105     int32_t compactTrie(int32_t fastILimit, UErrorCode &errorCode);
106 
107     uint32_t *index = nullptr;
108     int32_t indexCapacity = 0;
109     int32_t index3NullOffset = -1;
110     uint32_t *data = nullptr;
111     int32_t dataCapacity = 0;
112     int32_t dataLength = 0;
113     int32_t dataNullOffset = -1;
114 
115     uint32_t origInitialValue;
116     uint32_t initialValue;
117     uint32_t errorValue;
118     UChar32 highStart;
119     uint32_t highValue;
120 #ifdef UCPTRIE_DEBUG
121 public:
122     const char *name;
123 #endif
124 private:
125     /** Temporary array while building the final data. */
126     uint16_t *index16 = nullptr;
127     uint8_t flags[UNICODE_LIMIT >> UCPTRIE_SHIFT_3];
128 };
129 
MutableCodePointTrie(uint32_t iniValue,uint32_t errValue,UErrorCode & errorCode)130 MutableCodePointTrie::MutableCodePointTrie(uint32_t iniValue, uint32_t errValue, UErrorCode &errorCode) :
131         origInitialValue(iniValue), initialValue(iniValue), errorValue(errValue),
132         highStart(0), highValue(initialValue)
133 #ifdef UCPTRIE_DEBUG
134         , name("open")
135 #endif
136         {
137     if (U_FAILURE(errorCode)) { return; }
138     index = (uint32_t *)uprv_malloc(BMP_I_LIMIT * 4);
139     data = (uint32_t *)uprv_malloc(INITIAL_DATA_LENGTH * 4);
140     if (index == nullptr || data == nullptr) {
141         errorCode = U_MEMORY_ALLOCATION_ERROR;
142         return;
143     }
144     indexCapacity = BMP_I_LIMIT;
145     dataCapacity = INITIAL_DATA_LENGTH;
146 }
147 
MutableCodePointTrie(const MutableCodePointTrie & other,UErrorCode & errorCode)148 MutableCodePointTrie::MutableCodePointTrie(const MutableCodePointTrie &other, UErrorCode &errorCode) :
149         index3NullOffset(other.index3NullOffset),
150         dataNullOffset(other.dataNullOffset),
151         origInitialValue(other.origInitialValue), initialValue(other.initialValue),
152         errorValue(other.errorValue),
153         highStart(other.highStart), highValue(other.highValue)
154 #ifdef UCPTRIE_DEBUG
155         , name("mutable clone")
156 #endif
157         {
158     if (U_FAILURE(errorCode)) { return; }
159     int32_t iCapacity = highStart <= BMP_LIMIT ? BMP_I_LIMIT : I_LIMIT;
160     index = (uint32_t *)uprv_malloc(iCapacity * 4);
161     data = (uint32_t *)uprv_malloc(other.dataCapacity * 4);
162     if (index == nullptr || data == nullptr) {
163         errorCode = U_MEMORY_ALLOCATION_ERROR;
164         return;
165     }
166     indexCapacity = iCapacity;
167     dataCapacity = other.dataCapacity;
168 
169     int32_t iLimit = highStart >> UCPTRIE_SHIFT_3;
170     uprv_memcpy(flags, other.flags, iLimit);
171     uprv_memcpy(index, other.index, iLimit * 4);
172     uprv_memcpy(data, other.data, (size_t)other.dataLength * 4);
173     dataLength = other.dataLength;
174     U_ASSERT(other.index16 == nullptr);
175 }
176 
~MutableCodePointTrie()177 MutableCodePointTrie::~MutableCodePointTrie() {
178     uprv_free(index);
179     uprv_free(data);
180     uprv_free(index16);
181 }
182 
fromUCPMap(const UCPMap * map,UErrorCode & errorCode)183 MutableCodePointTrie *MutableCodePointTrie::fromUCPMap(const UCPMap *map, UErrorCode &errorCode) {
184     // Use the highValue as the initialValue to reduce the highStart.
185     uint32_t errorValue = ucpmap_get(map, -1);
186     uint32_t initialValue = ucpmap_get(map, 0x10ffff);
187     LocalPointer<MutableCodePointTrie> mutableTrie(
188         new MutableCodePointTrie(initialValue, errorValue, errorCode),
189         errorCode);
190     if (U_FAILURE(errorCode)) {
191         return nullptr;
192     }
193     UChar32 start = 0, end;
194     uint32_t value;
195     while ((end = ucpmap_getRange(map, start, UCPMAP_RANGE_NORMAL, 0,
196                                   nullptr, nullptr, &value)) >= 0) {
197         if (value != initialValue) {
198             if (start == end) {
199                 mutableTrie->set(start, value, errorCode);
200             } else {
201                 mutableTrie->setRange(start, end, value, errorCode);
202             }
203         }
204         start = end + 1;
205     }
206     if (U_SUCCESS(errorCode)) {
207         return mutableTrie.orphan();
208     } else {
209         return nullptr;
210     }
211 }
212 
fromUCPTrie(const UCPTrie * trie,UErrorCode & errorCode)213 MutableCodePointTrie *MutableCodePointTrie::fromUCPTrie(const UCPTrie *trie, UErrorCode &errorCode) {
214     // Use the highValue as the initialValue to reduce the highStart.
215     uint32_t errorValue;
216     uint32_t initialValue;
217     switch (trie->valueWidth) {
218     case UCPTRIE_VALUE_BITS_16:
219         errorValue = trie->data.ptr16[trie->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET];
220         initialValue = trie->data.ptr16[trie->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET];
221         break;
222     case UCPTRIE_VALUE_BITS_32:
223         errorValue = trie->data.ptr32[trie->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET];
224         initialValue = trie->data.ptr32[trie->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET];
225         break;
226     case UCPTRIE_VALUE_BITS_8:
227         errorValue = trie->data.ptr8[trie->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET];
228         initialValue = trie->data.ptr8[trie->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET];
229         break;
230     default:
231         // Unreachable if the trie is properly initialized.
232         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
233         return nullptr;
234     }
235     LocalPointer<MutableCodePointTrie> mutableTrie(
236         new MutableCodePointTrie(initialValue, errorValue, errorCode),
237         errorCode);
238     if (U_FAILURE(errorCode)) {
239         return nullptr;
240     }
241     UChar32 start = 0, end;
242     uint32_t value;
243     while ((end = ucptrie_getRange(trie, start, UCPMAP_RANGE_NORMAL, 0,
244                                    nullptr, nullptr, &value)) >= 0) {
245         if (value != initialValue) {
246             if (start == end) {
247                 mutableTrie->set(start, value, errorCode);
248             } else {
249                 mutableTrie->setRange(start, end, value, errorCode);
250             }
251         }
252         start = end + 1;
253     }
254     if (U_SUCCESS(errorCode)) {
255         return mutableTrie.orphan();
256     } else {
257         return nullptr;
258     }
259 }
260 
clear()261 void MutableCodePointTrie::clear() {
262     index3NullOffset = dataNullOffset = -1;
263     dataLength = 0;
264     highValue = initialValue = origInitialValue;
265     highStart = 0;
266     uprv_free(index16);
267     index16 = nullptr;
268 }
269 
get(UChar32 c) const270 uint32_t MutableCodePointTrie::get(UChar32 c) const {
271     if ((uint32_t)c > MAX_UNICODE) {
272         return errorValue;
273     }
274     if (c >= highStart) {
275         return highValue;
276     }
277     int32_t i = c >> UCPTRIE_SHIFT_3;
278     if (flags[i] == ALL_SAME) {
279         return index[i];
280     } else {
281         return data[index[i] + (c & UCPTRIE_SMALL_DATA_MASK)];
282     }
283 }
284 
maybeFilterValue(uint32_t value,uint32_t initialValue,uint32_t nullValue,UCPMapValueFilter * filter,const void * context)285 inline uint32_t maybeFilterValue(uint32_t value, uint32_t initialValue, uint32_t nullValue,
286                                  UCPMapValueFilter *filter, const void *context) {
287     if (value == initialValue) {
288         value = nullValue;
289     } else if (filter != nullptr) {
290         value = filter(context, value);
291     }
292     return value;
293 }
294 
getRange(UChar32 start,UCPMapValueFilter * filter,const void * context,uint32_t * pValue) const295 UChar32 MutableCodePointTrie::getRange(
296         UChar32 start, UCPMapValueFilter *filter, const void *context,
297         uint32_t *pValue) const {
298     if ((uint32_t)start > MAX_UNICODE) {
299         return U_SENTINEL;
300     }
301     if (start >= highStart) {
302         if (pValue != nullptr) {
303             uint32_t value = highValue;
304             if (filter != nullptr) { value = filter(context, value); }
305             *pValue = value;
306         }
307         return MAX_UNICODE;
308     }
309     uint32_t nullValue = initialValue;
310     if (filter != nullptr) { nullValue = filter(context, nullValue); }
311     UChar32 c = start;
312     uint32_t trieValue, value;
313     bool haveValue = false;
314     int32_t i = c >> UCPTRIE_SHIFT_3;
315     do {
316         if (flags[i] == ALL_SAME) {
317             uint32_t trieValue2 = index[i];
318             if (haveValue) {
319                 if (trieValue2 != trieValue) {
320                     if (filter == nullptr ||
321                             maybeFilterValue(trieValue2, initialValue, nullValue,
322                                              filter, context) != value) {
323                         return c - 1;
324                     }
325                     trieValue = trieValue2;  // may or may not help
326                 }
327             } else {
328                 trieValue = trieValue2;
329                 value = maybeFilterValue(trieValue2, initialValue, nullValue, filter, context);
330                 if (pValue != nullptr) { *pValue = value; }
331                 haveValue = true;
332             }
333             c = (c + UCPTRIE_SMALL_DATA_BLOCK_LENGTH) & ~UCPTRIE_SMALL_DATA_MASK;
334         } else /* MIXED */ {
335             int32_t di = index[i] + (c & UCPTRIE_SMALL_DATA_MASK);
336             uint32_t trieValue2 = data[di];
337             if (haveValue) {
338                 if (trieValue2 != trieValue) {
339                     if (filter == nullptr ||
340                             maybeFilterValue(trieValue2, initialValue, nullValue,
341                                              filter, context) != value) {
342                         return c - 1;
343                     }
344                     trieValue = trieValue2;  // may or may not help
345                 }
346             } else {
347                 trieValue = trieValue2;
348                 value = maybeFilterValue(trieValue2, initialValue, nullValue, filter, context);
349                 if (pValue != nullptr) { *pValue = value; }
350                 haveValue = true;
351             }
352             while ((++c & UCPTRIE_SMALL_DATA_MASK) != 0) {
353                 trieValue2 = data[++di];
354                 if (trieValue2 != trieValue) {
355                     if (filter == nullptr ||
356                             maybeFilterValue(trieValue2, initialValue, nullValue,
357                                              filter, context) != value) {
358                         return c - 1;
359                     }
360                 }
361                 trieValue = trieValue2;  // may or may not help
362             }
363         }
364         ++i;
365     } while (c < highStart);
366     U_ASSERT(haveValue);
367     if (maybeFilterValue(highValue, initialValue, nullValue,
368                          filter, context) != value) {
369         return c - 1;
370     } else {
371         return MAX_UNICODE;
372     }
373 }
374 
375 void
writeBlock(uint32_t * block,uint32_t value)376 writeBlock(uint32_t *block, uint32_t value) {
377     uint32_t *limit = block + UCPTRIE_SMALL_DATA_BLOCK_LENGTH;
378     while (block < limit) {
379         *block++ = value;
380     }
381 }
382 
ensureHighStart(UChar32 c)383 bool MutableCodePointTrie::ensureHighStart(UChar32 c) {
384     if (c >= highStart) {
385         // Round up to a UCPTRIE_CP_PER_INDEX_2_ENTRY boundary to simplify compaction.
386         c = (c + UCPTRIE_CP_PER_INDEX_2_ENTRY) & ~(UCPTRIE_CP_PER_INDEX_2_ENTRY - 1);
387         int32_t i = highStart >> UCPTRIE_SHIFT_3;
388         int32_t iLimit = c >> UCPTRIE_SHIFT_3;
389         if (iLimit > indexCapacity) {
390             uint32_t *newIndex = (uint32_t *)uprv_malloc(I_LIMIT * 4);
391             if (newIndex == nullptr) { return false; }
392             uprv_memcpy(newIndex, index, i * 4);
393             uprv_free(index);
394             index = newIndex;
395             indexCapacity = I_LIMIT;
396         }
397         do {
398             flags[i] = ALL_SAME;
399             index[i] = initialValue;
400         } while(++i < iLimit);
401         highStart = c;
402     }
403     return true;
404 }
405 
allocDataBlock(int32_t blockLength)406 int32_t MutableCodePointTrie::allocDataBlock(int32_t blockLength) {
407     int32_t newBlock = dataLength;
408     int32_t newTop = newBlock + blockLength;
409     if (newTop > dataCapacity) {
410         int32_t capacity;
411         if (dataCapacity < MEDIUM_DATA_LENGTH) {
412             capacity = MEDIUM_DATA_LENGTH;
413         } else if (dataCapacity < MAX_DATA_LENGTH) {
414             capacity = MAX_DATA_LENGTH;
415         } else {
416             // Should never occur.
417             // Either MAX_DATA_LENGTH is incorrect,
418             // or the code writes more values than should be possible.
419             return -1;
420         }
421         uint32_t *newData = (uint32_t *)uprv_malloc(capacity * 4);
422         if (newData == nullptr) {
423             return -1;
424         }
425         uprv_memcpy(newData, data, (size_t)dataLength * 4);
426         uprv_free(data);
427         data = newData;
428         dataCapacity = capacity;
429     }
430     dataLength = newTop;
431     return newBlock;
432 }
433 
434 /**
435  * No error checking for illegal arguments.
436  *
437  * @return -1 if no new data block available (out of memory in data array)
438  * @internal
439  */
getDataBlock(int32_t i)440 int32_t MutableCodePointTrie::getDataBlock(int32_t i) {
441     if (flags[i] == MIXED) {
442         return index[i];
443     }
444     if (i < BMP_I_LIMIT) {
445         int32_t newBlock = allocDataBlock(UCPTRIE_FAST_DATA_BLOCK_LENGTH);
446         if (newBlock < 0) { return newBlock; }
447         int32_t iStart = i & ~(SMALL_DATA_BLOCKS_PER_BMP_BLOCK -1);
448         int32_t iLimit = iStart + SMALL_DATA_BLOCKS_PER_BMP_BLOCK;
449         do {
450             U_ASSERT(flags[iStart] == ALL_SAME);
451             writeBlock(data + newBlock, index[iStart]);
452             flags[iStart] = MIXED;
453             index[iStart++] = newBlock;
454             newBlock += UCPTRIE_SMALL_DATA_BLOCK_LENGTH;
455         } while (iStart < iLimit);
456         return index[i];
457     } else {
458         int32_t newBlock = allocDataBlock(UCPTRIE_SMALL_DATA_BLOCK_LENGTH);
459         if (newBlock < 0) { return newBlock; }
460         writeBlock(data + newBlock, index[i]);
461         flags[i] = MIXED;
462         index[i] = newBlock;
463         return newBlock;
464     }
465 }
466 
set(UChar32 c,uint32_t value,UErrorCode & errorCode)467 void MutableCodePointTrie::set(UChar32 c, uint32_t value, UErrorCode &errorCode) {
468     if (U_FAILURE(errorCode)) {
469         return;
470     }
471     if ((uint32_t)c > MAX_UNICODE) {
472         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
473         return;
474     }
475 
476     int32_t block;
477     if (!ensureHighStart(c) || (block = getDataBlock(c >> UCPTRIE_SHIFT_3)) < 0) {
478         errorCode = U_MEMORY_ALLOCATION_ERROR;
479         return;
480     }
481 
482     data[block + (c & UCPTRIE_SMALL_DATA_MASK)] = value;
483 }
484 
485 void
fillBlock(uint32_t * block,UChar32 start,UChar32 limit,uint32_t value)486 fillBlock(uint32_t *block, UChar32 start, UChar32 limit, uint32_t value) {
487     uint32_t *pLimit = block + limit;
488     block += start;
489     while (block < pLimit) {
490         *block++ = value;
491     }
492 }
493 
setRange(UChar32 start,UChar32 end,uint32_t value,UErrorCode & errorCode)494 void MutableCodePointTrie::setRange(UChar32 start, UChar32 end, uint32_t value, UErrorCode &errorCode) {
495     if (U_FAILURE(errorCode)) {
496         return;
497     }
498     if ((uint32_t)start > MAX_UNICODE || (uint32_t)end > MAX_UNICODE || start > end) {
499         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
500         return;
501     }
502     if (!ensureHighStart(end)) {
503         errorCode = U_MEMORY_ALLOCATION_ERROR;
504         return;
505     }
506 
507     UChar32 limit = end + 1;
508     if (start & UCPTRIE_SMALL_DATA_MASK) {
509         // Set partial block at [start..following block boundary[.
510         int32_t block = getDataBlock(start >> UCPTRIE_SHIFT_3);
511         if (block < 0) {
512             errorCode = U_MEMORY_ALLOCATION_ERROR;
513             return;
514         }
515 
516         UChar32 nextStart = (start + UCPTRIE_SMALL_DATA_MASK) & ~UCPTRIE_SMALL_DATA_MASK;
517         if (nextStart <= limit) {
518             fillBlock(data + block, start & UCPTRIE_SMALL_DATA_MASK, UCPTRIE_SMALL_DATA_BLOCK_LENGTH,
519                       value);
520             start = nextStart;
521         } else {
522             fillBlock(data + block, start & UCPTRIE_SMALL_DATA_MASK, limit & UCPTRIE_SMALL_DATA_MASK,
523                       value);
524             return;
525         }
526     }
527 
528     // Number of positions in the last, partial block.
529     int32_t rest = limit & UCPTRIE_SMALL_DATA_MASK;
530 
531     // Round down limit to a block boundary.
532     limit &= ~UCPTRIE_SMALL_DATA_MASK;
533 
534     // Iterate over all-value blocks.
535     while (start < limit) {
536         int32_t i = start >> UCPTRIE_SHIFT_3;
537         if (flags[i] == ALL_SAME) {
538             index[i] = value;
539         } else /* MIXED */ {
540             fillBlock(data + index[i], 0, UCPTRIE_SMALL_DATA_BLOCK_LENGTH, value);
541         }
542         start += UCPTRIE_SMALL_DATA_BLOCK_LENGTH;
543     }
544 
545     if (rest > 0) {
546         // Set partial block at [last block boundary..limit[.
547         int32_t block = getDataBlock(start >> UCPTRIE_SHIFT_3);
548         if (block < 0) {
549             errorCode = U_MEMORY_ALLOCATION_ERROR;
550             return;
551         }
552 
553         fillBlock(data + block, 0, rest, value);
554     }
555 }
556 
557 /* compaction --------------------------------------------------------------- */
558 
maskValues(uint32_t mask)559 void MutableCodePointTrie::maskValues(uint32_t mask) {
560     initialValue &= mask;
561     errorValue &= mask;
562     highValue &= mask;
563     int32_t iLimit = highStart >> UCPTRIE_SHIFT_3;
564     for (int32_t i = 0; i < iLimit; ++i) {
565         if (flags[i] == ALL_SAME) {
566             index[i] &= mask;
567         }
568     }
569     for (int32_t i = 0; i < dataLength; ++i) {
570         data[i] &= mask;
571     }
572 }
573 
574 template<typename UIntA, typename UIntB>
equalBlocks(const UIntA * s,const UIntB * t,int32_t length)575 bool equalBlocks(const UIntA *s, const UIntB *t, int32_t length) {
576     while (length > 0 && *s == *t) {
577         ++s;
578         ++t;
579         --length;
580     }
581     return length == 0;
582 }
583 
allValuesSameAs(const uint32_t * p,int32_t length,uint32_t value)584 bool allValuesSameAs(const uint32_t *p, int32_t length, uint32_t value) {
585     const uint32_t *pLimit = p + length;
586     while (p < pLimit && *p == value) { ++p; }
587     return p == pLimit;
588 }
589 
590 /** Search for an identical block. */
findSameBlock(const uint16_t * p,int32_t pStart,int32_t length,const uint16_t * q,int32_t qStart,int32_t blockLength)591 int32_t findSameBlock(const uint16_t *p, int32_t pStart, int32_t length,
592                       const uint16_t *q, int32_t qStart, int32_t blockLength) {
593     // Ensure that we do not even partially get past length.
594     length -= blockLength;
595 
596     q += qStart;
597     while (pStart <= length) {
598         if (equalBlocks(p + pStart, q, blockLength)) {
599             return pStart;
600         }
601         ++pStart;
602     }
603     return -1;
604 }
605 
findAllSameBlock(const uint32_t * p,int32_t start,int32_t limit,uint32_t value,int32_t blockLength)606 int32_t findAllSameBlock(const uint32_t *p, int32_t start, int32_t limit,
607                          uint32_t value, int32_t blockLength) {
608     // Ensure that we do not even partially get past limit.
609     limit -= blockLength;
610 
611     for (int32_t block = start; block <= limit; ++block) {
612         if (p[block] == value) {
613             for (int32_t i = 1;; ++i) {
614                 if (i == blockLength) {
615                     return block;
616                 }
617                 if (p[block + i] != value) {
618                     block += i;
619                     break;
620                 }
621             }
622         }
623     }
624     return -1;
625 }
626 
627 /**
628  * Look for maximum overlap of the beginning of the other block
629  * with the previous, adjacent block.
630  */
631 template<typename UIntA, typename UIntB>
getOverlap(const UIntA * p,int32_t length,const UIntB * q,int32_t qStart,int32_t blockLength)632 int32_t getOverlap(const UIntA *p, int32_t length,
633                    const UIntB *q, int32_t qStart, int32_t blockLength) {
634     int32_t overlap = blockLength - 1;
635     U_ASSERT(overlap <= length);
636     q += qStart;
637     while (overlap > 0 && !equalBlocks(p + (length - overlap), q, overlap)) {
638         --overlap;
639     }
640     return overlap;
641 }
642 
getAllSameOverlap(const uint32_t * p,int32_t length,uint32_t value,int32_t blockLength)643 int32_t getAllSameOverlap(const uint32_t *p, int32_t length, uint32_t value,
644                           int32_t blockLength) {
645     int32_t min = length - (blockLength - 1);
646     int32_t i = length;
647     while (min < i && p[i - 1] == value) { --i; }
648     return length - i;
649 }
650 
isStartOfSomeFastBlock(uint32_t dataOffset,const uint32_t index[],int32_t fastILimit)651 bool isStartOfSomeFastBlock(uint32_t dataOffset, const uint32_t index[], int32_t fastILimit) {
652     for (int32_t i = 0; i < fastILimit; i += SMALL_DATA_BLOCKS_PER_BMP_BLOCK) {
653         if (index[i] == dataOffset) {
654             return true;
655         }
656     }
657     return false;
658 }
659 
660 /**
661  * Finds the start of the last range in the trie by enumerating backward.
662  * Indexes for code points higher than this will be omitted.
663  */
findHighStart() const664 UChar32 MutableCodePointTrie::findHighStart() const {
665     int32_t i = highStart >> UCPTRIE_SHIFT_3;
666     while (i > 0) {
667         bool match;
668         if (flags[--i] == ALL_SAME) {
669             match = index[i] == highValue;
670         } else /* MIXED */ {
671             const uint32_t *p = data + index[i];
672             for (int32_t j = 0;; ++j) {
673                 if (j == UCPTRIE_SMALL_DATA_BLOCK_LENGTH) {
674                     match = true;
675                     break;
676                 }
677                 if (p[j] != highValue) {
678                     match = false;
679                     break;
680                 }
681             }
682         }
683         if (!match) {
684             return (i + 1) << UCPTRIE_SHIFT_3;
685         }
686     }
687     return 0;
688 }
689 
690 class AllSameBlocks {
691 public:
692     static constexpr int32_t NEW_UNIQUE = -1;
693     static constexpr int32_t OVERFLOW = -2;
694 
AllSameBlocks()695     AllSameBlocks() : length(0), mostRecent(-1) {}
696 
findOrAdd(int32_t index,int32_t count,uint32_t value)697     int32_t findOrAdd(int32_t index, int32_t count, uint32_t value) {
698         if (mostRecent >= 0 && values[mostRecent] == value) {
699             refCounts[mostRecent] += count;
700             return indexes[mostRecent];
701         }
702         for (int32_t i = 0; i < length; ++i) {
703             if (values[i] == value) {
704                 mostRecent = i;
705                 refCounts[i] += count;
706                 return indexes[i];
707             }
708         }
709         if (length == CAPACITY) {
710             return OVERFLOW;
711         }
712         mostRecent = length;
713         indexes[length] = index;
714         values[length] = value;
715         refCounts[length++] = count;
716         return NEW_UNIQUE;
717     }
718 
719     /** Replaces the block which has the lowest reference count. */
add(int32_t index,int32_t count,uint32_t value)720     void add(int32_t index, int32_t count, uint32_t value) {
721         U_ASSERT(length == CAPACITY);
722         int32_t least = -1;
723         int32_t leastCount = I_LIMIT;
724         for (int32_t i = 0; i < length; ++i) {
725             U_ASSERT(values[i] != value);
726             if (refCounts[i] < leastCount) {
727                 least = i;
728                 leastCount = refCounts[i];
729             }
730         }
731         U_ASSERT(least >= 0);
732         mostRecent = least;
733         indexes[least] = index;
734         values[least] = value;
735         refCounts[least] = count;
736     }
737 
findMostUsed() const738     int32_t findMostUsed() const {
739         if (length == 0) { return -1; }
740         int32_t max = -1;
741         int32_t maxCount = 0;
742         for (int32_t i = 0; i < length; ++i) {
743             if (refCounts[i] > maxCount) {
744                 max = i;
745                 maxCount = refCounts[i];
746             }
747         }
748         return indexes[max];
749     }
750 
751 private:
752     static constexpr int32_t CAPACITY = 32;
753 
754     int32_t length;
755     int32_t mostRecent;
756 
757     int32_t indexes[CAPACITY];
758     uint32_t values[CAPACITY];
759     int32_t refCounts[CAPACITY];
760 };
761 
762 // Custom hash table for mixed-value blocks to be found anywhere in the
763 // compacted data or index so far.
764 class MixedBlocks {
765 public:
MixedBlocks()766     MixedBlocks() {}
~MixedBlocks()767     ~MixedBlocks() {
768         uprv_free(table);
769     }
770 
init(int32_t maxLength,int32_t newBlockLength)771     bool init(int32_t maxLength, int32_t newBlockLength) {
772         // We store actual data indexes + 1 to reserve 0 for empty entries.
773         int32_t maxDataIndex = maxLength - newBlockLength + 1;
774         int32_t newLength;
775         if (maxDataIndex <= 0xfff) {  // 4k
776             newLength = 6007;
777             shift = 12;
778             mask = 0xfff;
779         } else if (maxDataIndex <= 0x7fff) {  // 32k
780             newLength = 50021;
781             shift = 15;
782             mask = 0x7fff;
783         } else if (maxDataIndex <= 0x1ffff) {  // 128k
784             newLength = 200003;
785             shift = 17;
786             mask = 0x1ffff;
787         } else {
788             // maxDataIndex up to around MAX_DATA_LENGTH, ca. 1.1M
789             newLength = 1500007;
790             shift = 21;
791             mask = 0x1fffff;
792         }
793         if (newLength > capacity) {
794             uprv_free(table);
795             table = (uint32_t *)uprv_malloc(newLength * 4);
796             if (table == nullptr) {
797                 return false;
798             }
799             capacity = newLength;
800         }
801         length = newLength;
802         uprv_memset(table, 0, length * 4);
803 
804         blockLength = newBlockLength;
805         return true;
806     }
807 
808     template<typename UInt>
extend(const UInt * data,int32_t minStart,int32_t prevDataLength,int32_t newDataLength)809     void extend(const UInt *data, int32_t minStart, int32_t prevDataLength, int32_t newDataLength) {
810         int32_t start = prevDataLength - blockLength;
811         if (start >= minStart) {
812             ++start;  // Skip the last block that we added last time.
813         } else {
814             start = minStart;  // Begin with the first full block.
815         }
816         for (int32_t end = newDataLength - blockLength; start <= end; ++start) {
817             uint32_t hashCode = makeHashCode(data, start);
818             addEntry(data, start, hashCode, start);
819         }
820     }
821 
822     template<typename UIntA, typename UIntB>
findBlock(const UIntA * data,const UIntB * blockData,int32_t blockStart) const823     int32_t findBlock(const UIntA *data, const UIntB *blockData, int32_t blockStart) const {
824         uint32_t hashCode = makeHashCode(blockData, blockStart);
825         int32_t entryIndex = findEntry(data, blockData, blockStart, hashCode);
826         if (entryIndex >= 0) {
827             return (table[entryIndex] & mask) - 1;
828         } else {
829             return -1;
830         }
831     }
832 
findAllSameBlock(const uint32_t * data,uint32_t blockValue) const833     int32_t findAllSameBlock(const uint32_t *data, uint32_t blockValue) const {
834         uint32_t hashCode = makeHashCode(blockValue);
835         int32_t entryIndex = findEntry(data, blockValue, hashCode);
836         if (entryIndex >= 0) {
837             return (table[entryIndex] & mask) - 1;
838         } else {
839             return -1;
840         }
841     }
842 
843 private:
844     template<typename UInt>
makeHashCode(const UInt * blockData,int32_t blockStart) const845     uint32_t makeHashCode(const UInt *blockData, int32_t blockStart) const {
846         int32_t blockLimit = blockStart + blockLength;
847         uint32_t hashCode = blockData[blockStart++];
848         do {
849             hashCode = 37 * hashCode + blockData[blockStart++];
850         } while (blockStart < blockLimit);
851         return hashCode;
852     }
853 
makeHashCode(uint32_t blockValue) const854     uint32_t makeHashCode(uint32_t blockValue) const {
855         uint32_t hashCode = blockValue;
856         for (int32_t i = 1; i < blockLength; ++i) {
857             hashCode = 37 * hashCode + blockValue;
858         }
859         return hashCode;
860     }
861 
862     template<typename UInt>
addEntry(const UInt * data,int32_t blockStart,uint32_t hashCode,int32_t dataIndex)863     void addEntry(const UInt *data, int32_t blockStart, uint32_t hashCode, int32_t dataIndex) {
864         U_ASSERT(0 <= dataIndex && dataIndex < (int32_t)mask);
865         int32_t entryIndex = findEntry(data, data, blockStart, hashCode);
866         if (entryIndex < 0) {
867             table[~entryIndex] = (hashCode << shift) | (dataIndex + 1);
868         }
869     }
870 
871     template<typename UIntA, typename UIntB>
findEntry(const UIntA * data,const UIntB * blockData,int32_t blockStart,uint32_t hashCode) const872     int32_t findEntry(const UIntA *data, const UIntB *blockData, int32_t blockStart,
873                       uint32_t hashCode) const {
874         uint32_t shiftedHashCode = hashCode << shift;
875         int32_t initialEntryIndex = (hashCode % (length - 1)) + 1;  // 1..length-1
876         for (int32_t entryIndex = initialEntryIndex;;) {
877             uint32_t entry = table[entryIndex];
878             if (entry == 0) {
879                 return ~entryIndex;
880             }
881             if ((entry & ~mask) == shiftedHashCode) {
882                 int32_t dataIndex = (entry & mask) - 1;
883                 if (equalBlocks(data + dataIndex, blockData + blockStart, blockLength)) {
884                     return entryIndex;
885                 }
886             }
887             entryIndex = nextIndex(initialEntryIndex, entryIndex);
888         }
889     }
890 
findEntry(const uint32_t * data,uint32_t blockValue,uint32_t hashCode) const891     int32_t findEntry(const uint32_t *data, uint32_t blockValue, uint32_t hashCode) const {
892         uint32_t shiftedHashCode = hashCode << shift;
893         int32_t initialEntryIndex = (hashCode % (length - 1)) + 1;  // 1..length-1
894         for (int32_t entryIndex = initialEntryIndex;;) {
895             uint32_t entry = table[entryIndex];
896             if (entry == 0) {
897                 return ~entryIndex;
898             }
899             if ((entry & ~mask) == shiftedHashCode) {
900                 int32_t dataIndex = (entry & mask) - 1;
901                 if (allValuesSameAs(data + dataIndex, blockLength, blockValue)) {
902                     return entryIndex;
903                 }
904             }
905             entryIndex = nextIndex(initialEntryIndex, entryIndex);
906         }
907     }
908 
nextIndex(int32_t initialEntryIndex,int32_t entryIndex) const909     inline int32_t nextIndex(int32_t initialEntryIndex, int32_t entryIndex) const {
910         // U_ASSERT(0 < initialEntryIndex && initialEntryIndex < length);
911         return (entryIndex + initialEntryIndex) % length;
912     }
913 
914     // Hash table.
915     // The length is a prime number, larger than the maximum data length.
916     // The "shift" lower bits store a data index + 1.
917     // The remaining upper bits store a partial hashCode of the block data values.
918     uint32_t *table = nullptr;
919     int32_t capacity = 0;
920     int32_t length = 0;
921     int32_t shift = 0;
922     uint32_t mask = 0;
923 
924     int32_t blockLength = 0;
925 };
926 
compactWholeDataBlocks(int32_t fastILimit,AllSameBlocks & allSameBlocks)927 int32_t MutableCodePointTrie::compactWholeDataBlocks(int32_t fastILimit, AllSameBlocks &allSameBlocks) {
928 #ifdef UCPTRIE_DEBUG
929     bool overflow = false;
930 #endif
931 
932     // ASCII data will be stored as a linear table, even if the following code
933     // does not yet count it that way.
934     int32_t newDataCapacity = ASCII_LIMIT;
935     // Add room for a small data null block in case it would match the start of
936     // a fast data block where dataNullOffset must not be set in that case.
937     newDataCapacity += UCPTRIE_SMALL_DATA_BLOCK_LENGTH;
938     // Add room for special values (errorValue, highValue) and padding.
939     newDataCapacity += 4;
940     int32_t iLimit = highStart >> UCPTRIE_SHIFT_3;
941     int32_t blockLength = UCPTRIE_FAST_DATA_BLOCK_LENGTH;
942     int32_t inc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK;
943     for (int32_t i = 0; i < iLimit; i += inc) {
944         if (i == fastILimit) {
945             blockLength = UCPTRIE_SMALL_DATA_BLOCK_LENGTH;
946             inc = 1;
947         }
948         uint32_t value = index[i];
949         if (flags[i] == MIXED) {
950             // Really mixed?
951             const uint32_t *p = data + value;
952             value = *p;
953             if (allValuesSameAs(p + 1, blockLength - 1, value)) {
954                 flags[i] = ALL_SAME;
955                 index[i] = value;
956                 // Fall through to ALL_SAME handling.
957             } else {
958                 newDataCapacity += blockLength;
959                 continue;
960             }
961         } else {
962             U_ASSERT(flags[i] == ALL_SAME);
963             if (inc > 1) {
964                 // Do all of the fast-range data block's ALL_SAME parts have the same value?
965                 bool allSame = true;
966                 int32_t next_i = i + inc;
967                 for (int32_t j = i + 1; j < next_i; ++j) {
968                     U_ASSERT(flags[j] == ALL_SAME);
969                     if (index[j] != value) {
970                         allSame = false;
971                         break;
972                     }
973                 }
974                 if (!allSame) {
975                     // Turn it into a MIXED block.
976                     if (getDataBlock(i) < 0) {
977                         return -1;
978                     }
979                     newDataCapacity += blockLength;
980                     continue;
981                 }
982             }
983         }
984         // Is there another ALL_SAME block with the same value?
985         int32_t other = allSameBlocks.findOrAdd(i, inc, value);
986         if (other == AllSameBlocks::OVERFLOW) {
987             // The fixed-size array overflowed. Slow check for a duplicate block.
988 #ifdef UCPTRIE_DEBUG
989             if (!overflow) {
990                 puts("UCPTrie AllSameBlocks overflow");
991                 overflow = true;
992             }
993 #endif
994             int32_t jInc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK;
995             for (int32_t j = 0;; j += jInc) {
996                 if (j == i) {
997                     allSameBlocks.add(i, inc, value);
998                     break;
999                 }
1000                 if (j == fastILimit) {
1001                     jInc = 1;
1002                 }
1003                 if (flags[j] == ALL_SAME && index[j] == value) {
1004                     allSameBlocks.add(j, jInc + inc, value);
1005                     other = j;
1006                     break;
1007                     // We could keep counting blocks with the same value
1008                     // before we add the first one, which may improve compaction in rare cases,
1009                     // but it would make it slower.
1010                 }
1011             }
1012         }
1013         if (other >= 0) {
1014             flags[i] = SAME_AS;
1015             index[i] = other;
1016         } else {
1017             // New unique same-value block.
1018             newDataCapacity += blockLength;
1019         }
1020     }
1021     return newDataCapacity;
1022 }
1023 
1024 #ifdef UCPTRIE_DEBUG
1025 #   define DEBUG_DO(expr) expr
1026 #else
1027 #   define DEBUG_DO(expr)
1028 #endif
1029 
1030 #ifdef UCPTRIE_DEBUG
1031 // Braille symbols: U+28xx = UTF-8 E2 A0 80..E2 A3 BF
appendValue(char s[],int32_t length,uint32_t value)1032 int32_t appendValue(char s[], int32_t length, uint32_t value) {
1033     value ^= value >> 16;
1034     value ^= value >> 8;
1035     s[length] = 0xE2;
1036     s[length + 1] = (char)(0xA0 + ((value >> 6) & 3));
1037     s[length + 2] = (char)(0x80 + (value & 0x3F));
1038     return length + 3;
1039 }
1040 
printBlock(const uint32_t * block,int32_t blockLength,uint32_t value,UChar32 start,int32_t overlap,uint32_t initialValue)1041 void printBlock(const uint32_t *block, int32_t blockLength, uint32_t value,
1042                 UChar32 start, int32_t overlap, uint32_t initialValue) {
1043     char s[UCPTRIE_FAST_DATA_BLOCK_LENGTH * 3 + 3];
1044     int32_t length = 0;
1045     int32_t i;
1046     for (i = 0; i < overlap; ++i) {
1047         length = appendValue(s, length, 0);  // Braille blank
1048     }
1049     s[length++] = '|';
1050     for (; i < blockLength; ++i) {
1051         if (block != nullptr) {
1052             value = block[i];
1053         }
1054         if (value == initialValue) {
1055             value = 0x40;  // Braille lower left dot
1056         }
1057         length = appendValue(s, length, value);
1058     }
1059     s[length] = 0;
1060     start += overlap;
1061     if (start <= 0xffff) {
1062         printf("    %04lX  %s|\n", (long)start, s);
1063     } else if (start <= 0xfffff) {
1064         printf("   %5lX  %s|\n", (long)start, s);
1065     } else {
1066         printf("  %6lX  %s|\n", (long)start, s);
1067     }
1068 }
1069 #endif
1070 
1071 /**
1072  * Compacts a build-time trie.
1073  *
1074  * The compaction
1075  * - removes blocks that are identical with earlier ones
1076  * - overlaps each new non-duplicate block as much as possible with the previously-written one
1077  * - works with fast-range data blocks whose length is a multiple of that of
1078  *   higher-code-point data blocks
1079  *
1080  * It does not try to find an optimal order of writing, deduplicating, and overlapping blocks.
1081  */
compactData(int32_t fastILimit,uint32_t * newData,int32_t newDataCapacity,int32_t dataNullIndex,MixedBlocks & mixedBlocks,UErrorCode & errorCode)1082 int32_t MutableCodePointTrie::compactData(
1083         int32_t fastILimit, uint32_t *newData, int32_t newDataCapacity,
1084         int32_t dataNullIndex, MixedBlocks &mixedBlocks, UErrorCode &errorCode) {
1085 #ifdef UCPTRIE_DEBUG
1086     int32_t countSame=0, sumOverlaps=0;
1087     bool printData = dataLength == 29088 /* line.brk */ ||
1088         // dataLength == 30048 /* CanonIterData */ ||
1089         dataLength == 50400 /* zh.txt~stroke */;
1090 #endif
1091 
1092     // The linear ASCII data has been copied into newData already.
1093     int32_t newDataLength = 0;
1094     for (int32_t i = 0; newDataLength < ASCII_LIMIT;
1095             newDataLength += UCPTRIE_FAST_DATA_BLOCK_LENGTH, i += SMALL_DATA_BLOCKS_PER_BMP_BLOCK) {
1096         index[i] = newDataLength;
1097 #ifdef UCPTRIE_DEBUG
1098         if (printData) {
1099             printBlock(newData + newDataLength, UCPTRIE_FAST_DATA_BLOCK_LENGTH, 0, newDataLength, 0, initialValue);
1100         }
1101 #endif
1102     }
1103 
1104     int32_t blockLength = UCPTRIE_FAST_DATA_BLOCK_LENGTH;
1105     if (!mixedBlocks.init(newDataCapacity, blockLength)) {
1106         errorCode = U_MEMORY_ALLOCATION_ERROR;
1107         return 0;
1108     }
1109     mixedBlocks.extend(newData, 0, 0, newDataLength);
1110 
1111     int32_t iLimit = highStart >> UCPTRIE_SHIFT_3;
1112     int32_t inc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK;
1113     int32_t fastLength = 0;
1114     for (int32_t i = ASCII_I_LIMIT; i < iLimit; i += inc) {
1115         if (i == fastILimit) {
1116             blockLength = UCPTRIE_SMALL_DATA_BLOCK_LENGTH;
1117             inc = 1;
1118             fastLength = newDataLength;
1119             if (!mixedBlocks.init(newDataCapacity, blockLength)) {
1120                 errorCode = U_MEMORY_ALLOCATION_ERROR;
1121                 return 0;
1122             }
1123             mixedBlocks.extend(newData, 0, 0, newDataLength);
1124         }
1125         if (flags[i] == ALL_SAME) {
1126             uint32_t value = index[i];
1127             // Find an earlier part of the data array of length blockLength
1128             // that is filled with this value.
1129             int32_t n = mixedBlocks.findAllSameBlock(newData, value);
1130             // If we find a match, and the current block is the data null block,
1131             // and it is not a fast block but matches the start of a fast block,
1132             // then we need to continue looking.
1133             // This is because this small block is shorter than the fast block,
1134             // and not all of the rest of the fast block is filled with this value.
1135             // Otherwise trie.getRange() would detect that the fast block starts at
1136             // dataNullOffset and assume incorrectly that it is filled with the null value.
1137             while (n >= 0 && i == dataNullIndex && i >= fastILimit && n < fastLength &&
1138                     isStartOfSomeFastBlock(n, index, fastILimit)) {
1139                 n = findAllSameBlock(newData, n + 1, newDataLength, value, blockLength);
1140             }
1141             if (n >= 0) {
1142                 DEBUG_DO(++countSame);
1143                 index[i] = n;
1144             } else {
1145                 n = getAllSameOverlap(newData, newDataLength, value, blockLength);
1146                 DEBUG_DO(sumOverlaps += n);
1147 #ifdef UCPTRIE_DEBUG
1148                 if (printData) {
1149                     printBlock(nullptr, blockLength, value, i << UCPTRIE_SHIFT_3, n, initialValue);
1150                 }
1151 #endif
1152                 index[i] = newDataLength - n;
1153                 int32_t prevDataLength = newDataLength;
1154                 while (n < blockLength) {
1155                     newData[newDataLength++] = value;
1156                     ++n;
1157                 }
1158                 mixedBlocks.extend(newData, 0, prevDataLength, newDataLength);
1159             }
1160         } else if (flags[i] == MIXED) {
1161             const uint32_t *block = data + index[i];
1162             int32_t n = mixedBlocks.findBlock(newData, block, 0);
1163             if (n >= 0) {
1164                 DEBUG_DO(++countSame);
1165                 index[i] = n;
1166             } else {
1167                 n = getOverlap(newData, newDataLength, block, 0, blockLength);
1168                 DEBUG_DO(sumOverlaps += n);
1169 #ifdef UCPTRIE_DEBUG
1170                 if (printData) {
1171                     printBlock(block, blockLength, 0, i << UCPTRIE_SHIFT_3, n, initialValue);
1172                 }
1173 #endif
1174                 index[i] = newDataLength - n;
1175                 int32_t prevDataLength = newDataLength;
1176                 while (n < blockLength) {
1177                     newData[newDataLength++] = block[n++];
1178                 }
1179                 mixedBlocks.extend(newData, 0, prevDataLength, newDataLength);
1180             }
1181         } else /* SAME_AS */ {
1182             uint32_t j = index[i];
1183             index[i] = index[j];
1184         }
1185     }
1186 
1187 #ifdef UCPTRIE_DEBUG
1188     /* we saved some space */
1189     printf("compacting UCPTrie: count of 32-bit data words %lu->%lu  countSame=%ld  sumOverlaps=%ld\n",
1190             (long)dataLength, (long)newDataLength, (long)countSame, (long)sumOverlaps);
1191 #endif
1192     return newDataLength;
1193 }
1194 
compactIndex(int32_t fastILimit,MixedBlocks & mixedBlocks,UErrorCode & errorCode)1195 int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, MixedBlocks &mixedBlocks,
1196                                            UErrorCode &errorCode) {
1197     int32_t fastIndexLength = fastILimit >> (UCPTRIE_FAST_SHIFT - UCPTRIE_SHIFT_3);
1198     if ((highStart >> UCPTRIE_FAST_SHIFT) <= fastIndexLength) {
1199         // Only the linear fast index, no multi-stage index tables.
1200         index3NullOffset = UCPTRIE_NO_INDEX3_NULL_OFFSET;
1201         return fastIndexLength;
1202     }
1203 
1204     // Condense the fast index table.
1205     // Also, does it contain an index-3 block with all dataNullOffset?
1206     uint16_t fastIndex[UCPTRIE_BMP_INDEX_LENGTH];  // fastIndexLength
1207     int32_t i3FirstNull = -1;
1208     for (int32_t i = 0, j = 0; i < fastILimit; ++j) {
1209         uint32_t i3 = index[i];
1210         fastIndex[j] = (uint16_t)i3;
1211         if (i3 == (uint32_t)dataNullOffset) {
1212             if (i3FirstNull < 0) {
1213                 i3FirstNull = j;
1214             } else if (index3NullOffset < 0 &&
1215                     (j - i3FirstNull + 1) == UCPTRIE_INDEX_3_BLOCK_LENGTH) {
1216                 index3NullOffset = i3FirstNull;
1217             }
1218         } else {
1219             i3FirstNull = -1;
1220         }
1221         // Set the index entries that compactData() skipped.
1222         // Needed when the multi-stage index covers the fast index range as well.
1223         int32_t iNext = i + SMALL_DATA_BLOCKS_PER_BMP_BLOCK;
1224         while (++i < iNext) {
1225             i3 += UCPTRIE_SMALL_DATA_BLOCK_LENGTH;
1226             index[i] = i3;
1227         }
1228     }
1229 
1230     if (!mixedBlocks.init(fastIndexLength, UCPTRIE_INDEX_3_BLOCK_LENGTH)) {
1231         errorCode = U_MEMORY_ALLOCATION_ERROR;
1232         return 0;
1233     }
1234     mixedBlocks.extend(fastIndex, 0, 0, fastIndexLength);
1235 
1236     // Examine index-3 blocks. For each determine one of:
1237     // - same as the index-3 null block
1238     // - same as a fast-index block
1239     // - 16-bit indexes
1240     // - 18-bit indexes
1241     // We store this in the first flags entry for the index-3 block.
1242     //
1243     // Also determine an upper limit for the index-3 table length.
1244     int32_t index3Capacity = 0;
1245     i3FirstNull = index3NullOffset;
1246     bool hasLongI3Blocks = false;
1247     // If the fast index covers the whole BMP, then
1248     // the multi-stage index is only for supplementary code points.
1249     // Otherwise, the multi-stage index covers all of Unicode.
1250     int32_t iStart = fastILimit < BMP_I_LIMIT ? 0 : BMP_I_LIMIT;
1251     int32_t iLimit = highStart >> UCPTRIE_SHIFT_3;
1252     for (int32_t i = iStart; i < iLimit;) {
1253         int32_t j = i;
1254         int32_t jLimit = i + UCPTRIE_INDEX_3_BLOCK_LENGTH;
1255         uint32_t oredI3 = 0;
1256         bool isNull = true;
1257         do {
1258             uint32_t i3 = index[j];
1259             oredI3 |= i3;
1260             if (i3 != (uint32_t)dataNullOffset) {
1261                 isNull = false;
1262             }
1263         } while (++j < jLimit);
1264         if (isNull) {
1265             flags[i] = I3_NULL;
1266             if (i3FirstNull < 0) {
1267                 if (oredI3 <= 0xffff) {
1268                     index3Capacity += UCPTRIE_INDEX_3_BLOCK_LENGTH;
1269                 } else {
1270                     index3Capacity += INDEX_3_18BIT_BLOCK_LENGTH;
1271                     hasLongI3Blocks = true;
1272                 }
1273                 i3FirstNull = 0;
1274             }
1275         } else {
1276             if (oredI3 <= 0xffff) {
1277                 int32_t n = mixedBlocks.findBlock(fastIndex, index, i);
1278                 if (n >= 0) {
1279                     flags[i] = I3_BMP;
1280                     index[i] = n;
1281                 } else {
1282                     flags[i] = I3_16;
1283                     index3Capacity += UCPTRIE_INDEX_3_BLOCK_LENGTH;
1284                 }
1285             } else {
1286                 flags[i] = I3_18;
1287                 index3Capacity += INDEX_3_18BIT_BLOCK_LENGTH;
1288                 hasLongI3Blocks = true;
1289             }
1290         }
1291         i = j;
1292     }
1293 
1294     int32_t index2Capacity = (iLimit - iStart) >> UCPTRIE_SHIFT_2_3;
1295 
1296     // Length of the index-1 table, rounded up.
1297     int32_t index1Length = (index2Capacity + UCPTRIE_INDEX_2_MASK) >> UCPTRIE_SHIFT_1_2;
1298 
1299     // Index table: Fast index, index-1, index-3, index-2.
1300     // +1 for possible index table padding.
1301     int32_t index16Capacity = fastIndexLength + index1Length + index3Capacity + index2Capacity + 1;
1302     index16 = (uint16_t *)uprv_malloc(index16Capacity * 2);
1303     if (index16 == nullptr) {
1304         errorCode = U_MEMORY_ALLOCATION_ERROR;
1305         return 0;
1306     }
1307     uprv_memcpy(index16, fastIndex, fastIndexLength * 2);
1308 
1309     if (!mixedBlocks.init(index16Capacity, UCPTRIE_INDEX_3_BLOCK_LENGTH)) {
1310         errorCode = U_MEMORY_ALLOCATION_ERROR;
1311         return 0;
1312     }
1313     MixedBlocks longI3Blocks;
1314     if (hasLongI3Blocks) {
1315         if (!longI3Blocks.init(index16Capacity, INDEX_3_18BIT_BLOCK_LENGTH)) {
1316             errorCode = U_MEMORY_ALLOCATION_ERROR;
1317             return 0;
1318         }
1319     }
1320 
1321     // Compact the index-3 table and write an uncompacted version of the index-2 table.
1322     uint16_t index2[UNICODE_LIMIT >> UCPTRIE_SHIFT_2];  // index2Capacity
1323     int32_t i2Length = 0;
1324     i3FirstNull = index3NullOffset;
1325     int32_t index3Start = fastIndexLength + index1Length;
1326     int32_t indexLength = index3Start;
1327     for (int32_t i = iStart; i < iLimit; i += UCPTRIE_INDEX_3_BLOCK_LENGTH) {
1328         int32_t i3;
1329         uint8_t f = flags[i];
1330         if (f == I3_NULL && i3FirstNull < 0) {
1331             // First index-3 null block. Write & overlap it like a normal block, then remember it.
1332             f = dataNullOffset <= 0xffff ? I3_16 : I3_18;
1333             i3FirstNull = 0;
1334         }
1335         if (f == I3_NULL) {
1336             i3 = index3NullOffset;
1337         } else if (f == I3_BMP) {
1338             i3 = index[i];
1339         } else if (f == I3_16) {
1340             int32_t n = mixedBlocks.findBlock(index16, index, i);
1341             if (n >= 0) {
1342                 i3 = n;
1343             } else {
1344                 if (indexLength == index3Start) {
1345                     // No overlap at the boundary between the index-1 and index-3 tables.
1346                     n = 0;
1347                 } else {
1348                     n = getOverlap(index16, indexLength,
1349                                    index, i, UCPTRIE_INDEX_3_BLOCK_LENGTH);
1350                 }
1351                 i3 = indexLength - n;
1352                 int32_t prevIndexLength = indexLength;
1353                 while (n < UCPTRIE_INDEX_3_BLOCK_LENGTH) {
1354                     index16[indexLength++] = index[i + n++];
1355                 }
1356                 mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength);
1357                 if (hasLongI3Blocks) {
1358                     longI3Blocks.extend(index16, index3Start, prevIndexLength, indexLength);
1359                 }
1360             }
1361         } else {
1362             U_ASSERT(f == I3_18);
1363             U_ASSERT(hasLongI3Blocks);
1364             // Encode an index-3 block that contains one or more data indexes exceeding 16 bits.
1365             int32_t j = i;
1366             int32_t jLimit = i + UCPTRIE_INDEX_3_BLOCK_LENGTH;
1367             int32_t k = indexLength;
1368             do {
1369                 ++k;
1370                 uint32_t v = index[j++];
1371                 uint32_t upperBits = (v & 0x30000) >> 2;
1372                 index16[k++] = v;
1373                 v = index[j++];
1374                 upperBits |= (v & 0x30000) >> 4;
1375                 index16[k++] = v;
1376                 v = index[j++];
1377                 upperBits |= (v & 0x30000) >> 6;
1378                 index16[k++] = v;
1379                 v = index[j++];
1380                 upperBits |= (v & 0x30000) >> 8;
1381                 index16[k++] = v;
1382                 v = index[j++];
1383                 upperBits |= (v & 0x30000) >> 10;
1384                 index16[k++] = v;
1385                 v = index[j++];
1386                 upperBits |= (v & 0x30000) >> 12;
1387                 index16[k++] = v;
1388                 v = index[j++];
1389                 upperBits |= (v & 0x30000) >> 14;
1390                 index16[k++] = v;
1391                 v = index[j++];
1392                 upperBits |= (v & 0x30000) >> 16;
1393                 index16[k++] = v;
1394                 index16[k - 9] = upperBits;
1395             } while (j < jLimit);
1396             int32_t n = longI3Blocks.findBlock(index16, index16, indexLength);
1397             if (n >= 0) {
1398                 i3 = n | 0x8000;
1399             } else {
1400                 if (indexLength == index3Start) {
1401                     // No overlap at the boundary between the index-1 and index-3 tables.
1402                     n = 0;
1403                 } else {
1404                     n = getOverlap(index16, indexLength,
1405                                    index16, indexLength, INDEX_3_18BIT_BLOCK_LENGTH);
1406                 }
1407                 i3 = (indexLength - n) | 0x8000;
1408                 int32_t prevIndexLength = indexLength;
1409                 if (n > 0) {
1410                     int32_t start = indexLength;
1411                     while (n < INDEX_3_18BIT_BLOCK_LENGTH) {
1412                         index16[indexLength++] = index16[start + n++];
1413                     }
1414                 } else {
1415                     indexLength += INDEX_3_18BIT_BLOCK_LENGTH;
1416                 }
1417                 mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength);
1418                 if (hasLongI3Blocks) {
1419                     longI3Blocks.extend(index16, index3Start, prevIndexLength, indexLength);
1420                 }
1421             }
1422         }
1423         if (index3NullOffset < 0 && i3FirstNull >= 0) {
1424             index3NullOffset = i3;
1425         }
1426         // Set the index-2 table entry.
1427         index2[i2Length++] = i3;
1428     }
1429     U_ASSERT(i2Length == index2Capacity);
1430     U_ASSERT(indexLength <= index3Start + index3Capacity);
1431 
1432     if (index3NullOffset < 0) {
1433         index3NullOffset = UCPTRIE_NO_INDEX3_NULL_OFFSET;
1434     }
1435     if (indexLength >= (UCPTRIE_NO_INDEX3_NULL_OFFSET + UCPTRIE_INDEX_3_BLOCK_LENGTH)) {
1436         // The index-3 offsets exceed 15 bits, or
1437         // the last one cannot be distinguished from the no-null-block value.
1438         errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
1439         return 0;
1440     }
1441 
1442     // Compact the index-2 table and write the index-1 table.
1443     static_assert(UCPTRIE_INDEX_2_BLOCK_LENGTH == UCPTRIE_INDEX_3_BLOCK_LENGTH,
1444                   "must re-init mixedBlocks");
1445     int32_t blockLength = UCPTRIE_INDEX_2_BLOCK_LENGTH;
1446     int32_t i1 = fastIndexLength;
1447     for (int32_t i = 0; i < i2Length; i += blockLength) {
1448         int32_t n;
1449         if ((i2Length - i) >= blockLength) {
1450             // normal block
1451             U_ASSERT(blockLength == UCPTRIE_INDEX_2_BLOCK_LENGTH);
1452             n = mixedBlocks.findBlock(index16, index2, i);
1453         } else {
1454             // highStart is inside the last index-2 block. Shorten it.
1455             blockLength = i2Length - i;
1456             n = findSameBlock(index16, index3Start, indexLength,
1457                               index2, i, blockLength);
1458         }
1459         int32_t i2;
1460         if (n >= 0) {
1461             i2 = n;
1462         } else {
1463             if (indexLength == index3Start) {
1464                 // No overlap at the boundary between the index-1 and index-3/2 tables.
1465                 n = 0;
1466             } else {
1467                 n = getOverlap(index16, indexLength, index2, i, blockLength);
1468             }
1469             i2 = indexLength - n;
1470             int32_t prevIndexLength = indexLength;
1471             while (n < blockLength) {
1472                 index16[indexLength++] = index2[i + n++];
1473             }
1474             mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength);
1475         }
1476         // Set the index-1 table entry.
1477         index16[i1++] = i2;
1478     }
1479     U_ASSERT(i1 == index3Start);
1480     U_ASSERT(indexLength <= index16Capacity);
1481 
1482 #ifdef UCPTRIE_DEBUG
1483     /* we saved some space */
1484     printf("compacting UCPTrie: count of 16-bit index words %lu->%lu\n",
1485             (long)iLimit, (long)indexLength);
1486 #endif
1487 
1488     return indexLength;
1489 }
1490 
compactTrie(int32_t fastILimit,UErrorCode & errorCode)1491 int32_t MutableCodePointTrie::compactTrie(int32_t fastILimit, UErrorCode &errorCode) {
1492     // Find the real highStart and round it up.
1493     U_ASSERT((highStart & (UCPTRIE_CP_PER_INDEX_2_ENTRY - 1)) == 0);
1494     highValue = get(MAX_UNICODE);
1495     int32_t realHighStart = findHighStart();
1496     realHighStart = (realHighStart + (UCPTRIE_CP_PER_INDEX_2_ENTRY - 1)) &
1497         ~(UCPTRIE_CP_PER_INDEX_2_ENTRY - 1);
1498     if (realHighStart == UNICODE_LIMIT) {
1499         highValue = initialValue;
1500     }
1501 
1502 #ifdef UCPTRIE_DEBUG
1503     printf("UCPTrie: highStart U+%06lx  highValue 0x%lx  initialValue 0x%lx\n",
1504             (long)realHighStart, (long)highValue, (long)initialValue);
1505 #endif
1506 
1507     // We always store indexes and data values for the fast range.
1508     // Pin highStart to the top of that range while building.
1509     UChar32 fastLimit = fastILimit << UCPTRIE_SHIFT_3;
1510     if (realHighStart < fastLimit) {
1511         for (int32_t i = (realHighStart >> UCPTRIE_SHIFT_3); i < fastILimit; ++i) {
1512             flags[i] = ALL_SAME;
1513             index[i] = highValue;
1514         }
1515         highStart = fastLimit;
1516     } else {
1517         highStart = realHighStart;
1518     }
1519 
1520     uint32_t asciiData[ASCII_LIMIT];
1521     for (int32_t i = 0; i < ASCII_LIMIT; ++i) {
1522         asciiData[i] = get(i);
1523     }
1524 
1525     // First we look for which data blocks have the same value repeated over the whole block,
1526     // deduplicate such blocks, find a good null data block (for faster enumeration),
1527     // and get an upper bound for the necessary data array length.
1528     AllSameBlocks allSameBlocks;
1529     int32_t newDataCapacity = compactWholeDataBlocks(fastILimit, allSameBlocks);
1530     if (newDataCapacity < 0) {
1531         errorCode = U_MEMORY_ALLOCATION_ERROR;
1532         return 0;
1533     }
1534     uint32_t *newData = (uint32_t *)uprv_malloc(newDataCapacity * 4);
1535     if (newData == nullptr) {
1536         errorCode = U_MEMORY_ALLOCATION_ERROR;
1537         return 0;
1538     }
1539     uprv_memcpy(newData, asciiData, sizeof(asciiData));
1540 
1541     int32_t dataNullIndex = allSameBlocks.findMostUsed();
1542 
1543     MixedBlocks mixedBlocks;
1544     int32_t newDataLength = compactData(fastILimit, newData, newDataCapacity,
1545                                         dataNullIndex, mixedBlocks, errorCode);
1546     if (U_FAILURE(errorCode)) { return 0; }
1547     U_ASSERT(newDataLength <= newDataCapacity);
1548     uprv_free(data);
1549     data = newData;
1550     dataCapacity = newDataCapacity;
1551     dataLength = newDataLength;
1552     if (dataLength > (0x3ffff + UCPTRIE_SMALL_DATA_BLOCK_LENGTH)) {
1553         // The offset of the last data block is too high to be stored in the index table.
1554         errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
1555         return 0;
1556     }
1557 
1558     if (dataNullIndex >= 0) {
1559         dataNullOffset = index[dataNullIndex];
1560 #ifdef UCPTRIE_DEBUG
1561         if (data[dataNullOffset] != initialValue) {
1562             printf("UCPTrie initialValue %lx -> more common nullValue %lx\n",
1563                    (long)initialValue, (long)data[dataNullOffset]);
1564         }
1565 #endif
1566         initialValue = data[dataNullOffset];
1567     } else {
1568         dataNullOffset = UCPTRIE_NO_DATA_NULL_OFFSET;
1569     }
1570 
1571     int32_t indexLength = compactIndex(fastILimit, mixedBlocks, errorCode);
1572     highStart = realHighStart;
1573     return indexLength;
1574 }
1575 
build(UCPTrieType type,UCPTrieValueWidth valueWidth,UErrorCode & errorCode)1576 UCPTrie *MutableCodePointTrie::build(UCPTrieType type, UCPTrieValueWidth valueWidth, UErrorCode &errorCode) {
1577     if (U_FAILURE(errorCode)) {
1578         return nullptr;
1579     }
1580     if (type < UCPTRIE_TYPE_FAST || UCPTRIE_TYPE_SMALL < type ||
1581             valueWidth < UCPTRIE_VALUE_BITS_16 || UCPTRIE_VALUE_BITS_8 < valueWidth) {
1582         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
1583         return nullptr;
1584     }
1585 
1586     // The mutable trie always stores 32-bit values.
1587     // When we build a UCPTrie for a smaller value width, we first mask off unused bits
1588     // before compacting the data.
1589     switch (valueWidth) {
1590     case UCPTRIE_VALUE_BITS_32:
1591         break;
1592     case UCPTRIE_VALUE_BITS_16:
1593         maskValues(0xffff);
1594         break;
1595     case UCPTRIE_VALUE_BITS_8:
1596         maskValues(0xff);
1597         break;
1598     default:
1599         break;
1600     }
1601 
1602     UChar32 fastLimit = type == UCPTRIE_TYPE_FAST ? BMP_LIMIT : UCPTRIE_SMALL_LIMIT;
1603     int32_t indexLength = compactTrie(fastLimit >> UCPTRIE_SHIFT_3, errorCode);
1604     if (U_FAILURE(errorCode)) {
1605         clear();
1606         return nullptr;
1607     }
1608 
1609     // Ensure data table alignment: The index length must be even for uint32_t data.
1610     if (valueWidth == UCPTRIE_VALUE_BITS_32 && (indexLength & 1) != 0) {
1611         index16[indexLength++] = 0xffee;  // arbitrary value
1612     }
1613 
1614     // Make the total trie structure length a multiple of 4 bytes by padding the data table,
1615     // and store special values as the last two data values.
1616     int32_t length = indexLength * 2;
1617     if (valueWidth == UCPTRIE_VALUE_BITS_16) {
1618         if (((indexLength ^ dataLength) & 1) != 0) {
1619             // padding
1620             data[dataLength++] = errorValue;
1621         }
1622         if (data[dataLength - 1] != errorValue || data[dataLength - 2] != highValue) {
1623             data[dataLength++] = highValue;
1624             data[dataLength++] = errorValue;
1625         }
1626         length += dataLength * 2;
1627     } else if (valueWidth == UCPTRIE_VALUE_BITS_32) {
1628         // 32-bit data words never need padding to a multiple of 4 bytes.
1629         if (data[dataLength - 1] != errorValue || data[dataLength - 2] != highValue) {
1630             if (data[dataLength - 1] != highValue) {
1631                 data[dataLength++] = highValue;
1632             }
1633             data[dataLength++] = errorValue;
1634         }
1635         length += dataLength * 4;
1636     } else {
1637         int32_t and3 = (length + dataLength) & 3;
1638         if (and3 == 0 && data[dataLength - 1] == errorValue && data[dataLength - 2] == highValue) {
1639             // all set
1640         } else if(and3 == 3 && data[dataLength - 1] == highValue) {
1641             data[dataLength++] = errorValue;
1642         } else {
1643             while (and3 != 2) {
1644                 data[dataLength++] = highValue;
1645                 and3 = (and3 + 1) & 3;
1646             }
1647             data[dataLength++] = highValue;
1648             data[dataLength++] = errorValue;
1649         }
1650         length += dataLength;
1651     }
1652 
1653     // Calculate the total length of the UCPTrie as a single memory block.
1654     length += sizeof(UCPTrie);
1655     U_ASSERT((length & 3) == 0);
1656 
1657     uint8_t *bytes = (uint8_t *)uprv_malloc(length);
1658     if (bytes == nullptr) {
1659         errorCode = U_MEMORY_ALLOCATION_ERROR;
1660         clear();
1661         return nullptr;
1662     }
1663     UCPTrie *trie = reinterpret_cast<UCPTrie *>(bytes);
1664     uprv_memset(trie, 0, sizeof(UCPTrie));
1665     trie->indexLength = indexLength;
1666     trie->dataLength = dataLength;
1667 
1668     trie->highStart = highStart;
1669     // Round up shifted12HighStart to a multiple of 0x1000 for easy testing from UTF-8 lead bytes.
1670     // Runtime code needs to then test for the real highStart as well.
1671     trie->shifted12HighStart = (highStart + 0xfff) >> 12;
1672     trie->type = type;
1673     trie->valueWidth = valueWidth;
1674 
1675     trie->index3NullOffset = index3NullOffset;
1676     trie->dataNullOffset = dataNullOffset;
1677     trie->nullValue = initialValue;
1678 
1679     bytes += sizeof(UCPTrie);
1680 
1681     // Fill the index and data arrays.
1682     uint16_t *dest16 = (uint16_t *)bytes;
1683     trie->index = dest16;
1684 
1685     if (highStart <= fastLimit) {
1686         // Condense only the fast index from the mutable-trie index.
1687         for (int32_t i = 0, j = 0; j < indexLength; i += SMALL_DATA_BLOCKS_PER_BMP_BLOCK, ++j) {
1688             *dest16++ = (uint16_t)index[i];  // dest16[j]
1689         }
1690     } else {
1691         uprv_memcpy(dest16, index16, indexLength * 2);
1692         dest16 += indexLength;
1693     }
1694     bytes += indexLength * 2;
1695 
1696     // Write the data array.
1697     const uint32_t *p = data;
1698     switch (valueWidth) {
1699     case UCPTRIE_VALUE_BITS_16:
1700         // Write 16-bit data values.
1701         trie->data.ptr16 = dest16;
1702         for (int32_t i = dataLength; i > 0; --i) {
1703             *dest16++ = (uint16_t)*p++;
1704         }
1705         break;
1706     case UCPTRIE_VALUE_BITS_32:
1707         // Write 32-bit data values.
1708         trie->data.ptr32 = (uint32_t *)bytes;
1709         uprv_memcpy(bytes, p, (size_t)dataLength * 4);
1710         break;
1711     case UCPTRIE_VALUE_BITS_8:
1712         // Write 8-bit data values.
1713         trie->data.ptr8 = bytes;
1714         for (int32_t i = dataLength; i > 0; --i) {
1715             *bytes++ = (uint8_t)*p++;
1716         }
1717         break;
1718     default:
1719         // Will not occur, valueWidth checked at the beginning.
1720         break;
1721     }
1722 
1723 #ifdef UCPTRIE_DEBUG
1724     trie->name = name;
1725 
1726     ucptrie_printLengths(trie, "");
1727 #endif
1728 
1729     clear();
1730     return trie;
1731 }
1732 
1733 }  // namespace
1734 
1735 U_NAMESPACE_END
1736 
1737 U_NAMESPACE_USE
1738 
1739 U_CAPI UMutableCPTrie * U_EXPORT2
umutablecptrie_open(uint32_t initialValue,uint32_t errorValue,UErrorCode * pErrorCode)1740 umutablecptrie_open(uint32_t initialValue, uint32_t errorValue, UErrorCode *pErrorCode) {
1741     if (U_FAILURE(*pErrorCode)) {
1742         return nullptr;
1743     }
1744     LocalPointer<MutableCodePointTrie> trie(
1745         new MutableCodePointTrie(initialValue, errorValue, *pErrorCode), *pErrorCode);
1746     if (U_FAILURE(*pErrorCode)) {
1747         return nullptr;
1748     }
1749     return reinterpret_cast<UMutableCPTrie *>(trie.orphan());
1750 }
1751 
1752 U_CAPI UMutableCPTrie * U_EXPORT2
umutablecptrie_clone(const UMutableCPTrie * other,UErrorCode * pErrorCode)1753 umutablecptrie_clone(const UMutableCPTrie *other, UErrorCode *pErrorCode) {
1754     if (U_FAILURE(*pErrorCode)) {
1755         return nullptr;
1756     }
1757     if (other == nullptr) {
1758         return nullptr;
1759     }
1760     LocalPointer<MutableCodePointTrie> clone(
1761         new MutableCodePointTrie(*reinterpret_cast<const MutableCodePointTrie *>(other), *pErrorCode), *pErrorCode);
1762     if (U_FAILURE(*pErrorCode)) {
1763         return nullptr;
1764     }
1765     return reinterpret_cast<UMutableCPTrie *>(clone.orphan());
1766 }
1767 
1768 U_CAPI void U_EXPORT2
umutablecptrie_close(UMutableCPTrie * trie)1769 umutablecptrie_close(UMutableCPTrie *trie) {
1770     delete reinterpret_cast<MutableCodePointTrie *>(trie);
1771 }
1772 
1773 U_CAPI UMutableCPTrie * U_EXPORT2
umutablecptrie_fromUCPMap(const UCPMap * map,UErrorCode * pErrorCode)1774 umutablecptrie_fromUCPMap(const UCPMap *map, UErrorCode *pErrorCode) {
1775     if (U_FAILURE(*pErrorCode)) {
1776         return nullptr;
1777     }
1778     if (map == nullptr) {
1779         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1780         return nullptr;
1781     }
1782     return reinterpret_cast<UMutableCPTrie *>(MutableCodePointTrie::fromUCPMap(map, *pErrorCode));
1783 }
1784 
1785 U_CAPI UMutableCPTrie * U_EXPORT2
umutablecptrie_fromUCPTrie(const UCPTrie * trie,UErrorCode * pErrorCode)1786 umutablecptrie_fromUCPTrie(const UCPTrie *trie, UErrorCode *pErrorCode) {
1787     if (U_FAILURE(*pErrorCode)) {
1788         return nullptr;
1789     }
1790     if (trie == nullptr) {
1791         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1792         return nullptr;
1793     }
1794     return reinterpret_cast<UMutableCPTrie *>(MutableCodePointTrie::fromUCPTrie(trie, *pErrorCode));
1795 }
1796 
1797 U_CAPI uint32_t U_EXPORT2
umutablecptrie_get(const UMutableCPTrie * trie,UChar32 c)1798 umutablecptrie_get(const UMutableCPTrie *trie, UChar32 c) {
1799     return reinterpret_cast<const MutableCodePointTrie *>(trie)->get(c);
1800 }
1801 
1802 namespace {
1803 
getRange(const void * trie,UChar32 start,UCPMapValueFilter * filter,const void * context,uint32_t * pValue)1804 UChar32 getRange(const void *trie, UChar32 start,
1805                  UCPMapValueFilter *filter, const void *context, uint32_t *pValue) {
1806     return reinterpret_cast<const MutableCodePointTrie *>(trie)->
1807         getRange(start, filter, context, pValue);
1808 }
1809 
1810 }  // namespace
1811 
1812 U_CAPI UChar32 U_EXPORT2
umutablecptrie_getRange(const UMutableCPTrie * trie,UChar32 start,UCPMapRangeOption option,uint32_t surrogateValue,UCPMapValueFilter * filter,const void * context,uint32_t * pValue)1813 umutablecptrie_getRange(const UMutableCPTrie *trie, UChar32 start,
1814                         UCPMapRangeOption option, uint32_t surrogateValue,
1815                         UCPMapValueFilter *filter, const void *context, uint32_t *pValue) {
1816     return ucptrie_internalGetRange(getRange, trie, start,
1817                                     option, surrogateValue,
1818                                     filter, context, pValue);
1819 }
1820 
1821 U_CAPI void U_EXPORT2
umutablecptrie_set(UMutableCPTrie * trie,UChar32 c,uint32_t value,UErrorCode * pErrorCode)1822 umutablecptrie_set(UMutableCPTrie *trie, UChar32 c, uint32_t value, UErrorCode *pErrorCode) {
1823     if (U_FAILURE(*pErrorCode)) {
1824         return;
1825     }
1826     reinterpret_cast<MutableCodePointTrie *>(trie)->set(c, value, *pErrorCode);
1827 }
1828 
1829 U_CAPI void U_EXPORT2
umutablecptrie_setRange(UMutableCPTrie * trie,UChar32 start,UChar32 end,uint32_t value,UErrorCode * pErrorCode)1830 umutablecptrie_setRange(UMutableCPTrie *trie, UChar32 start, UChar32 end,
1831                    uint32_t value, UErrorCode *pErrorCode) {
1832     if (U_FAILURE(*pErrorCode)) {
1833         return;
1834     }
1835     reinterpret_cast<MutableCodePointTrie *>(trie)->setRange(start, end, value, *pErrorCode);
1836 }
1837 
1838 /* Compact and internally serialize the trie. */
1839 U_CAPI UCPTrie * U_EXPORT2
umutablecptrie_buildImmutable(UMutableCPTrie * trie,UCPTrieType type,UCPTrieValueWidth valueWidth,UErrorCode * pErrorCode)1840 umutablecptrie_buildImmutable(UMutableCPTrie *trie, UCPTrieType type, UCPTrieValueWidth valueWidth,
1841                               UErrorCode *pErrorCode) {
1842     if (U_FAILURE(*pErrorCode)) {
1843         return nullptr;
1844     }
1845     return reinterpret_cast<MutableCodePointTrie *>(trie)->build(type, valueWidth, *pErrorCode);
1846 }
1847 
1848 #ifdef UCPTRIE_DEBUG
umutablecptrie_setName(UMutableCPTrie * trie,const char * name)1849 U_CFUNC void umutablecptrie_setName(UMutableCPTrie *trie, const char *name) {
1850     reinterpret_cast<MutableCodePointTrie *>(trie)->name = name;
1851 }
1852 #endif
1853