1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 *
6 *   Copyright (C) 2001-2014, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 ******************************************************************************
10 *   file name:  utrie2_builder.cpp
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2008sep26 (split off from utrie2.c)
16 *   created by: Markus W. Scherer
17 *
18 *   This is a common implementation of a Unicode trie.
19 *   It is a kind of compressed, serializable table of 16- or 32-bit values associated with
20 *   Unicode code points (0..0x10ffff).
21 *   This is the second common version of a Unicode trie (hence the name UTrie2).
22 *   See utrie2.h for a comparison.
23 *
24 *   This file contains only the builder code.
25 *   See utrie2.c for the runtime and enumeration code.
26 */
27 // #define UTRIE2_DEBUG
28 #ifdef UTRIE2_DEBUG
29 #   include <stdio.h>
30 #endif
31 // #define UCPTRIE_DEBUG
32 
33 #include "unicode/utypes.h"
34 #ifdef UCPTRIE_DEBUG
35 #include "unicode/ucptrie.h"
36 #include "unicode/umutablecptrie.h"
37 #include "ucptrie_impl.h"
38 #endif
39 #include "cmemory.h"
40 #include "utrie2.h"
41 #include "utrie2_impl.h"
42 
43 #include "utrie.h"  // for utrie2_fromUTrie()
44 
45 /* Implementation notes ----------------------------------------------------- */
46 
47 /*
48  * The UTRIE2_SHIFT_1, UTRIE2_SHIFT_2, UTRIE2_INDEX_SHIFT and other values
49  * have been chosen to minimize trie sizes overall.
50  * Most of the code is flexible enough to work with a range of values,
51  * within certain limits.
52  *
53  * Exception: Support for separate values for lead surrogate code _units_
54  * vs. code _points_ was added after the constants were fixed,
55  * and has not been tested nor particularly designed for different constant values.
56  * (Especially the utrie2_enum() code that jumps to the special LSCP index-2
57  * part and back.)
58  *
59  * Requires UTRIE2_SHIFT_2<=6. Otherwise 0xc0 which is the top of the ASCII-linear data
60  * including the bad-UTF-8-data block is not a multiple of UTRIE2_DATA_BLOCK_LENGTH
61  * and map[block>>UTRIE2_SHIFT_2] (used in reference counting and compaction
62  * remapping) stops working.
63  *
64  * Requires UTRIE2_SHIFT_1>=10 because utrie2_enumForLeadSurrogate()
65  * assumes that a single index-2 block is used for 0x400 code points
66  * corresponding to one lead surrogate.
67  *
68  * Requires UTRIE2_SHIFT_1<=16. Otherwise one single index-2 block contains
69  * more than one Unicode plane, and the split of the index-2 table into a BMP
70  * part and a supplementary part, with a gap in between, would not work.
71  *
72  * Requires UTRIE2_INDEX_SHIFT>=1 not because of the code but because
73  * there is data with more than 64k distinct values,
74  * for example for Unihan collation with a separate collation weight per
75  * Han character.
76  */
77 
78 /* Building a trie ----------------------------------------------------------*/
79 
80 enum {
81     /** The null index-2 block, following the gap in the index-2 table. */
82     UNEWTRIE2_INDEX_2_NULL_OFFSET=UNEWTRIE2_INDEX_GAP_OFFSET+UNEWTRIE2_INDEX_GAP_LENGTH,
83 
84     /** The start of allocated index-2 blocks. */
85     UNEWTRIE2_INDEX_2_START_OFFSET=UNEWTRIE2_INDEX_2_NULL_OFFSET+UTRIE2_INDEX_2_BLOCK_LENGTH,
86 
87     /**
88      * The null data block.
89      * Length 64=0x40 even if UTRIE2_DATA_BLOCK_LENGTH is smaller,
90      * to work with 6-bit trail bytes from 2-byte UTF-8.
91      */
92     UNEWTRIE2_DATA_NULL_OFFSET=UTRIE2_DATA_START_OFFSET,
93 
94     /** The start of allocated data blocks. */
95     UNEWTRIE2_DATA_START_OFFSET=UNEWTRIE2_DATA_NULL_OFFSET+0x40,
96 
97     /**
98      * The start of data blocks for U+0800 and above.
99      * Below, compaction uses a block length of 64 for 2-byte UTF-8.
100      * From here on, compaction uses UTRIE2_DATA_BLOCK_LENGTH.
101      * Data values for 0x780 code points beyond ASCII.
102      */
103     UNEWTRIE2_DATA_0800_OFFSET=UNEWTRIE2_DATA_START_OFFSET+0x780
104 };
105 
106 /* Start with allocation of 16k data entries. */
107 #define UNEWTRIE2_INITIAL_DATA_LENGTH ((int32_t)1<<14)
108 
109 /* Grow about 8x each time. */
110 #define UNEWTRIE2_MEDIUM_DATA_LENGTH ((int32_t)1<<17)
111 
112 static int32_t
113 allocIndex2Block(UNewTrie2 *trie);
114 
115 U_CAPI UTrie2 * U_EXPORT2
utrie2_open(uint32_t initialValue,uint32_t errorValue,UErrorCode * pErrorCode)116 utrie2_open(uint32_t initialValue, uint32_t errorValue, UErrorCode *pErrorCode) {
117     UTrie2 *trie;
118     UNewTrie2 *newTrie;
119     uint32_t *data;
120     int32_t i, j;
121 
122     if(U_FAILURE(*pErrorCode)) {
123         return NULL;
124     }
125 
126     trie=(UTrie2 *)uprv_malloc(sizeof(UTrie2));
127     newTrie=(UNewTrie2 *)uprv_malloc(sizeof(UNewTrie2));
128     data=(uint32_t *)uprv_malloc(UNEWTRIE2_INITIAL_DATA_LENGTH*4);
129     if(trie==NULL || newTrie==NULL || data==NULL) {
130         uprv_free(trie);
131         uprv_free(newTrie);
132         uprv_free(data);
133         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
134         return 0;
135     }
136 
137     uprv_memset(trie, 0, sizeof(UTrie2));
138     trie->initialValue=initialValue;
139     trie->errorValue=errorValue;
140     trie->highStart=0x110000;
141     trie->newTrie=newTrie;
142 #ifdef UTRIE2_DEBUG
143     trie->name="open";
144 #endif
145 
146     newTrie->data=data;
147 #ifdef UCPTRIE_DEBUG
148     newTrie->t3=umutablecptrie_open(initialValue, errorValue, pErrorCode);
149 #endif
150     newTrie->dataCapacity=UNEWTRIE2_INITIAL_DATA_LENGTH;
151     newTrie->initialValue=initialValue;
152     newTrie->errorValue=errorValue;
153     newTrie->highStart=0x110000;
154     newTrie->firstFreeBlock=0;  /* no free block in the list */
155     newTrie->isCompacted=FALSE;
156 
157     /*
158      * preallocate and reset
159      * - ASCII
160      * - the bad-UTF-8-data block
161      * - the null data block
162      */
163     for(i=0; i<0x80; ++i) {
164         newTrie->data[i]=initialValue;
165     }
166     for(; i<0xc0; ++i) {
167         newTrie->data[i]=errorValue;
168     }
169     for(i=UNEWTRIE2_DATA_NULL_OFFSET; i<UNEWTRIE2_DATA_START_OFFSET; ++i) {
170         newTrie->data[i]=initialValue;
171     }
172     newTrie->dataNullOffset=UNEWTRIE2_DATA_NULL_OFFSET;
173     newTrie->dataLength=UNEWTRIE2_DATA_START_OFFSET;
174 
175     /* set the index-2 indexes for the 2=0x80>>UTRIE2_SHIFT_2 ASCII data blocks */
176     for(i=0, j=0; j<0x80; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) {
177         newTrie->index2[i]=j;
178         newTrie->map[i]=1;
179     }
180     /* reference counts for the bad-UTF-8-data block */
181     for(; j<0xc0; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) {
182         newTrie->map[i]=0;
183     }
184     /*
185      * Reference counts for the null data block: all blocks except for the ASCII blocks.
186      * Plus 1 so that we don't drop this block during compaction.
187      * Plus as many as needed for lead surrogate code points.
188      */
189     /* i==newTrie->dataNullOffset */
190     newTrie->map[i++]=
191         (0x110000>>UTRIE2_SHIFT_2)-
192         (0x80>>UTRIE2_SHIFT_2)+
193         1+
194         UTRIE2_LSCP_INDEX_2_LENGTH;
195     j+=UTRIE2_DATA_BLOCK_LENGTH;
196     for(; j<UNEWTRIE2_DATA_START_OFFSET; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) {
197         newTrie->map[i]=0;
198     }
199 
200     /*
201      * set the remaining indexes in the BMP index-2 block
202      * to the null data block
203      */
204     for(i=0x80>>UTRIE2_SHIFT_2; i<UTRIE2_INDEX_2_BMP_LENGTH; ++i) {
205         newTrie->index2[i]=UNEWTRIE2_DATA_NULL_OFFSET;
206     }
207 
208     /*
209      * Fill the index gap with impossible values so that compaction
210      * does not overlap other index-2 blocks with the gap.
211      */
212     for(i=0; i<UNEWTRIE2_INDEX_GAP_LENGTH; ++i) {
213         newTrie->index2[UNEWTRIE2_INDEX_GAP_OFFSET+i]=-1;
214     }
215 
216     /* set the indexes in the null index-2 block */
217     for(i=0; i<UTRIE2_INDEX_2_BLOCK_LENGTH; ++i) {
218         newTrie->index2[UNEWTRIE2_INDEX_2_NULL_OFFSET+i]=UNEWTRIE2_DATA_NULL_OFFSET;
219     }
220     newTrie->index2NullOffset=UNEWTRIE2_INDEX_2_NULL_OFFSET;
221     newTrie->index2Length=UNEWTRIE2_INDEX_2_START_OFFSET;
222 
223     /* set the index-1 indexes for the linear index-2 block */
224     for(i=0, j=0;
225         i<UTRIE2_OMITTED_BMP_INDEX_1_LENGTH;
226         ++i, j+=UTRIE2_INDEX_2_BLOCK_LENGTH
227     ) {
228         newTrie->index1[i]=j;
229     }
230 
231     /* set the remaining index-1 indexes to the null index-2 block */
232     for(; i<UNEWTRIE2_INDEX_1_LENGTH; ++i) {
233         newTrie->index1[i]=UNEWTRIE2_INDEX_2_NULL_OFFSET;
234     }
235 
236     /*
237      * Preallocate and reset data for U+0080..U+07ff,
238      * for 2-byte UTF-8 which will be compacted in 64-blocks
239      * even if UTRIE2_DATA_BLOCK_LENGTH is smaller.
240      */
241     for(i=0x80; i<0x800; i+=UTRIE2_DATA_BLOCK_LENGTH) {
242         utrie2_set32(trie, i, initialValue, pErrorCode);
243     }
244 
245     return trie;
246 }
247 
248 static UNewTrie2 *
cloneBuilder(const UNewTrie2 * other)249 cloneBuilder(const UNewTrie2 *other) {
250     UNewTrie2 *trie;
251 
252     trie=(UNewTrie2 *)uprv_malloc(sizeof(UNewTrie2));
253     if(trie==NULL) {
254         return NULL;
255     }
256 
257     trie->data=(uint32_t *)uprv_malloc(other->dataCapacity*4);
258     if(trie->data==NULL) {
259         uprv_free(trie);
260         return NULL;
261     }
262 #ifdef UCPTRIE_DEBUG
263     if(other->t3==nullptr) {
264         trie->t3=nullptr;
265     } else {
266         UErrorCode errorCode=U_ZERO_ERROR;
267         trie->t3=umutablecptrie_clone(other->t3, &errorCode);
268     }
269 #endif
270     trie->dataCapacity=other->dataCapacity;
271 
272     /* clone data */
273     uprv_memcpy(trie->index1, other->index1, sizeof(trie->index1));
274     uprv_memcpy(trie->index2, other->index2, (size_t)other->index2Length*4);
275     trie->index2NullOffset=other->index2NullOffset;
276     trie->index2Length=other->index2Length;
277 
278     uprv_memcpy(trie->data, other->data, (size_t)other->dataLength*4);
279     trie->dataNullOffset=other->dataNullOffset;
280     trie->dataLength=other->dataLength;
281 
282     /* reference counters */
283     if(other->isCompacted) {
284         trie->firstFreeBlock=0;
285     } else {
286         uprv_memcpy(trie->map, other->map, ((size_t)other->dataLength>>UTRIE2_SHIFT_2)*4);
287         trie->firstFreeBlock=other->firstFreeBlock;
288     }
289 
290     trie->initialValue=other->initialValue;
291     trie->errorValue=other->errorValue;
292     trie->highStart=other->highStart;
293     trie->isCompacted=other->isCompacted;
294 
295     return trie;
296 }
297 
298 U_CAPI UTrie2 * U_EXPORT2
utrie2_clone(const UTrie2 * other,UErrorCode * pErrorCode)299 utrie2_clone(const UTrie2 *other, UErrorCode *pErrorCode) {
300     UTrie2 *trie;
301 
302     if(U_FAILURE(*pErrorCode)) {
303         return NULL;
304     }
305     if(other==NULL || (other->memory==NULL && other->newTrie==NULL)) {
306         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
307         return NULL;
308     }
309 
310     trie=(UTrie2 *)uprv_malloc(sizeof(UTrie2));
311     if(trie==NULL) {
312         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
313         return NULL;
314     }
315     uprv_memcpy(trie, other, sizeof(UTrie2));
316 
317     if(other->memory!=NULL) {
318         trie->memory=uprv_malloc(other->length);
319         if(trie->memory!=NULL) {
320             trie->isMemoryOwned=TRUE;
321             uprv_memcpy(trie->memory, other->memory, other->length);
322 
323             /* make the clone's pointers point to its own memory */
324             trie->index=(uint16_t *)trie->memory+(other->index-(uint16_t *)other->memory);
325             if(other->data16!=NULL) {
326                 trie->data16=(uint16_t *)trie->memory+(other->data16-(uint16_t *)other->memory);
327             }
328             if(other->data32!=NULL) {
329                 trie->data32=(uint32_t *)trie->memory+(other->data32-(uint32_t *)other->memory);
330             }
331         }
332     } else /* other->newTrie!=NULL */ {
333         trie->newTrie=cloneBuilder(other->newTrie);
334     }
335 
336     if(trie->memory==NULL && trie->newTrie==NULL) {
337         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
338         uprv_free(trie);
339         trie=NULL;
340     }
341     return trie;
342 }
343 
344 typedef struct NewTrieAndStatus {
345     UTrie2 *trie;
346     UErrorCode errorCode;
347     UBool exclusiveLimit;  /* rather than inclusive range end */
348 } NewTrieAndStatus;
349 
350 static UBool U_CALLCONV
copyEnumRange(const void * context,UChar32 start,UChar32 end,uint32_t value)351 copyEnumRange(const void *context, UChar32 start, UChar32 end, uint32_t value) {
352     NewTrieAndStatus *nt=(NewTrieAndStatus *)context;
353     if(value!=nt->trie->initialValue) {
354         if(nt->exclusiveLimit) {
355             --end;
356         }
357         if(start==end) {
358             utrie2_set32(nt->trie, start, value, &nt->errorCode);
359         } else {
360             utrie2_setRange32(nt->trie, start, end, value, TRUE, &nt->errorCode);
361         }
362         return U_SUCCESS(nt->errorCode);
363     } else {
364         return TRUE;
365     }
366 }
367 
368 #ifdef UTRIE2_DEBUG
countInitial(const UTrie2 * trie)369 static long countInitial(const UTrie2 *trie) {
370     uint32_t initialValue=trie->initialValue;
371     int32_t length=trie->dataLength;
372     long count=0;
373     if(trie->data16!=nullptr) {
374         for(int32_t i=0; i<length; ++i) {
375             if(trie->data16[i]==initialValue) { ++count; }
376         }
377     } else {
378         for(int32_t i=0; i<length; ++i) {
379             if(trie->data32[i]==initialValue) { ++count; }
380         }
381     }
382     return count;
383 }
384 
385 static void
utrie_printLengths(const UTrie * trie)386 utrie_printLengths(const UTrie *trie) {
387     long indexLength=trie->indexLength;
388     long dataLength=(long)trie->dataLength;
389     long totalLength=(long)sizeof(UTrieHeader)+indexLength*2+dataLength*(trie->data32!=NULL ? 4 : 2);
390     printf("**UTrieLengths** index:%6ld  data:%6ld  serialized:%6ld\n",
391            indexLength, dataLength, totalLength);
392 }
393 
394 static void
utrie2_printLengths(const UTrie2 * trie,const char * which)395 utrie2_printLengths(const UTrie2 *trie, const char *which) {
396     long indexLength=trie->indexLength;
397     long dataLength=(long)trie->dataLength;
398     long totalLength=(long)sizeof(UTrie2Header)+indexLength*2+dataLength*(trie->data32!=NULL ? 4 : 2);
399     printf("**UTrie2Lengths(%s %s)** index:%6ld  data:%6ld  countInitial:%6ld  serialized:%6ld\n",
400            which, trie->name, indexLength, dataLength, countInitial(trie), totalLength);
401 }
402 #endif
403 
404 U_CAPI UTrie2 * U_EXPORT2
utrie2_cloneAsThawed(const UTrie2 * other,UErrorCode * pErrorCode)405 utrie2_cloneAsThawed(const UTrie2 *other, UErrorCode *pErrorCode) {
406     NewTrieAndStatus context;
407     UChar lead;
408 
409     if(U_FAILURE(*pErrorCode)) {
410         return NULL;
411     }
412     if(other==NULL || (other->memory==NULL && other->newTrie==NULL)) {
413         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
414         return NULL;
415     }
416     if(other->newTrie!=NULL && !other->newTrie->isCompacted) {
417         return utrie2_clone(other, pErrorCode);  /* clone an unfrozen trie */
418     }
419 
420     /* Clone the frozen trie by enumerating it and building a new one. */
421     context.trie=utrie2_open(other->initialValue, other->errorValue, pErrorCode);
422     if(U_FAILURE(*pErrorCode)) {
423         return NULL;
424     }
425     context.exclusiveLimit=FALSE;
426     context.errorCode=*pErrorCode;
427     utrie2_enum(other, NULL, copyEnumRange, &context);
428     *pErrorCode=context.errorCode;
429     for(lead=0xd800; lead<0xdc00; ++lead) {
430         uint32_t value;
431         if(other->data32==NULL) {
432             value=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(other, lead);
433         } else {
434             value=UTRIE2_GET32_FROM_U16_SINGLE_LEAD(other, lead);
435         }
436         if(value!=other->initialValue) {
437             utrie2_set32ForLeadSurrogateCodeUnit(context.trie, lead, value, pErrorCode);
438         }
439     }
440     if(U_FAILURE(*pErrorCode)) {
441         utrie2_close(context.trie);
442         context.trie=NULL;
443     }
444     return context.trie;
445 }
446 
447 /* Almost the same as utrie2_cloneAsThawed() but copies a UTrie and freezes the clone. */
448 U_CAPI UTrie2 * U_EXPORT2
utrie2_fromUTrie(const UTrie * trie1,uint32_t errorValue,UErrorCode * pErrorCode)449 utrie2_fromUTrie(const UTrie *trie1, uint32_t errorValue, UErrorCode *pErrorCode) {
450     NewTrieAndStatus context;
451     UChar lead;
452 
453     if(U_FAILURE(*pErrorCode)) {
454         return NULL;
455     }
456     if(trie1==NULL) {
457         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
458         return NULL;
459     }
460     context.trie=utrie2_open(trie1->initialValue, errorValue, pErrorCode);
461     if(U_FAILURE(*pErrorCode)) {
462         return NULL;
463     }
464     context.exclusiveLimit=TRUE;
465     context.errorCode=*pErrorCode;
466     utrie_enum(trie1, NULL, copyEnumRange, &context);
467     *pErrorCode=context.errorCode;
468     for(lead=0xd800; lead<0xdc00; ++lead) {
469         uint32_t value;
470         if(trie1->data32==NULL) {
471             value=UTRIE_GET16_FROM_LEAD(trie1, lead);
472         } else {
473             value=UTRIE_GET32_FROM_LEAD(trie1, lead);
474         }
475         if(value!=trie1->initialValue) {
476             utrie2_set32ForLeadSurrogateCodeUnit(context.trie, lead, value, pErrorCode);
477         }
478     }
479     if(U_SUCCESS(*pErrorCode)) {
480         utrie2_freeze(context.trie,
481                       trie1->data32!=NULL ? UTRIE2_32_VALUE_BITS : UTRIE2_16_VALUE_BITS,
482                       pErrorCode);
483     }
484 #ifdef UTRIE2_DEBUG
485     if(U_SUCCESS(*pErrorCode)) {
486         utrie_printLengths(trie1);
487         utrie2_printLengths(context.trie, "fromUTrie");
488     }
489 #endif
490     if(U_FAILURE(*pErrorCode)) {
491         utrie2_close(context.trie);
492         context.trie=NULL;
493     }
494     return context.trie;
495 }
496 
497 static inline UBool
isInNullBlock(UNewTrie2 * trie,UChar32 c,UBool forLSCP)498 isInNullBlock(UNewTrie2 *trie, UChar32 c, UBool forLSCP) {
499     int32_t i2, block;
500 
501     if(U_IS_LEAD(c) && forLSCP) {
502         i2=(UTRIE2_LSCP_INDEX_2_OFFSET-(0xd800>>UTRIE2_SHIFT_2))+
503             (c>>UTRIE2_SHIFT_2);
504     } else {
505         i2=trie->index1[c>>UTRIE2_SHIFT_1]+
506             ((c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK);
507     }
508     block=trie->index2[i2];
509     return (UBool)(block==trie->dataNullOffset);
510 }
511 
512 static int32_t
allocIndex2Block(UNewTrie2 * trie)513 allocIndex2Block(UNewTrie2 *trie) {
514     int32_t newBlock, newTop;
515 
516     newBlock=trie->index2Length;
517     newTop=newBlock+UTRIE2_INDEX_2_BLOCK_LENGTH;
518     if(newTop>UPRV_LENGTHOF(trie->index2)) {
519         /*
520          * Should never occur.
521          * Either UTRIE2_MAX_BUILD_TIME_INDEX_LENGTH is incorrect,
522          * or the code writes more values than should be possible.
523          */
524         return -1;
525     }
526     trie->index2Length=newTop;
527     uprv_memcpy(trie->index2+newBlock, trie->index2+trie->index2NullOffset, UTRIE2_INDEX_2_BLOCK_LENGTH*4);
528     return newBlock;
529 }
530 
531 static int32_t
getIndex2Block(UNewTrie2 * trie,UChar32 c,UBool forLSCP)532 getIndex2Block(UNewTrie2 *trie, UChar32 c, UBool forLSCP) {
533     int32_t i1, i2;
534 
535     if(U_IS_LEAD(c) && forLSCP) {
536         return UTRIE2_LSCP_INDEX_2_OFFSET;
537     }
538 
539     i1=c>>UTRIE2_SHIFT_1;
540     i2=trie->index1[i1];
541     if(i2==trie->index2NullOffset) {
542         i2=allocIndex2Block(trie);
543         if(i2<0) {
544             return -1;  /* program error */
545         }
546         trie->index1[i1]=i2;
547     }
548     return i2;
549 }
550 
551 static int32_t
allocDataBlock(UNewTrie2 * trie,int32_t copyBlock)552 allocDataBlock(UNewTrie2 *trie, int32_t copyBlock) {
553     int32_t newBlock, newTop;
554 
555     if(trie->firstFreeBlock!=0) {
556         /* get the first free block */
557         newBlock=trie->firstFreeBlock;
558         trie->firstFreeBlock=-trie->map[newBlock>>UTRIE2_SHIFT_2];
559     } else {
560         /* get a new block from the high end */
561         newBlock=trie->dataLength;
562         newTop=newBlock+UTRIE2_DATA_BLOCK_LENGTH;
563         if(newTop>trie->dataCapacity) {
564             /* out of memory in the data array */
565             int32_t capacity;
566             uint32_t *data;
567 
568             if(trie->dataCapacity<UNEWTRIE2_MEDIUM_DATA_LENGTH) {
569                 capacity=UNEWTRIE2_MEDIUM_DATA_LENGTH;
570             } else if(trie->dataCapacity<UNEWTRIE2_MAX_DATA_LENGTH) {
571                 capacity=UNEWTRIE2_MAX_DATA_LENGTH;
572             } else {
573                 /*
574                  * Should never occur.
575                  * Either UNEWTRIE2_MAX_DATA_LENGTH is incorrect,
576                  * or the code writes more values than should be possible.
577                  */
578                 return -1;
579             }
580             data=(uint32_t *)uprv_malloc(capacity*4);
581             if(data==NULL) {
582                 return -1;
583             }
584             uprv_memcpy(data, trie->data, (size_t)trie->dataLength*4);
585             uprv_free(trie->data);
586             trie->data=data;
587             trie->dataCapacity=capacity;
588         }
589         trie->dataLength=newTop;
590     }
591     uprv_memcpy(trie->data+newBlock, trie->data+copyBlock, UTRIE2_DATA_BLOCK_LENGTH*4);
592     trie->map[newBlock>>UTRIE2_SHIFT_2]=0;
593     return newBlock;
594 }
595 
596 /* call when the block's reference counter reaches 0 */
597 static void
releaseDataBlock(UNewTrie2 * trie,int32_t block)598 releaseDataBlock(UNewTrie2 *trie, int32_t block) {
599     /* put this block at the front of the free-block chain */
600     trie->map[block>>UTRIE2_SHIFT_2]=-trie->firstFreeBlock;
601     trie->firstFreeBlock=block;
602 }
603 
604 static inline UBool
isWritableBlock(UNewTrie2 * trie,int32_t block)605 isWritableBlock(UNewTrie2 *trie, int32_t block) {
606     return (UBool)(block!=trie->dataNullOffset && 1==trie->map[block>>UTRIE2_SHIFT_2]);
607 }
608 
609 static inline void
setIndex2Entry(UNewTrie2 * trie,int32_t i2,int32_t block)610 setIndex2Entry(UNewTrie2 *trie, int32_t i2, int32_t block) {
611     int32_t oldBlock;
612     ++trie->map[block>>UTRIE2_SHIFT_2];  /* increment first, in case block==oldBlock! */
613     oldBlock=trie->index2[i2];
614     if(0 == --trie->map[oldBlock>>UTRIE2_SHIFT_2]) {
615         releaseDataBlock(trie, oldBlock);
616     }
617     trie->index2[i2]=block;
618 }
619 
620 /**
621  * No error checking for illegal arguments.
622  *
623  * @return -1 if no new data block available (out of memory in data array)
624  * @internal
625  */
626 static int32_t
getDataBlock(UNewTrie2 * trie,UChar32 c,UBool forLSCP)627 getDataBlock(UNewTrie2 *trie, UChar32 c, UBool forLSCP) {
628     int32_t i2, oldBlock, newBlock;
629 
630     i2=getIndex2Block(trie, c, forLSCP);
631     if(i2<0) {
632         return -1;  /* program error */
633     }
634 
635     i2+=(c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK;
636     oldBlock=trie->index2[i2];
637     if(isWritableBlock(trie, oldBlock)) {
638         return oldBlock;
639     }
640 
641     /* allocate a new data block */
642     newBlock=allocDataBlock(trie, oldBlock);
643     if(newBlock<0) {
644         /* out of memory in the data array */
645         return -1;
646     }
647     setIndex2Entry(trie, i2, newBlock);
648     return newBlock;
649 }
650 
651 /**
652  * @return TRUE if the value was successfully set
653  */
654 static void
set32(UNewTrie2 * trie,UChar32 c,UBool forLSCP,uint32_t value,UErrorCode * pErrorCode)655 set32(UNewTrie2 *trie,
656       UChar32 c, UBool forLSCP, uint32_t value,
657       UErrorCode *pErrorCode) {
658     int32_t block;
659 
660     if(trie==NULL || trie->isCompacted) {
661         *pErrorCode=U_NO_WRITE_PERMISSION;
662         return;
663     }
664 #ifdef UCPTRIE_DEBUG
665     umutablecptrie_set(trie->t3, c, value, pErrorCode);
666 #endif
667 
668     block=getDataBlock(trie, c, forLSCP);
669     if(block<0) {
670         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
671         return;
672     }
673 
674     trie->data[block+(c&UTRIE2_DATA_MASK)]=value;
675 }
676 
677 U_CAPI void U_EXPORT2
utrie2_set32(UTrie2 * trie,UChar32 c,uint32_t value,UErrorCode * pErrorCode)678 utrie2_set32(UTrie2 *trie, UChar32 c, uint32_t value, UErrorCode *pErrorCode) {
679     if(U_FAILURE(*pErrorCode)) {
680         return;
681     }
682     if((uint32_t)c>0x10ffff) {
683         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
684         return;
685     }
686     set32(trie->newTrie, c, TRUE, value, pErrorCode);
687 }
688 
689 U_CAPI void U_EXPORT2
utrie2_set32ForLeadSurrogateCodeUnit(UTrie2 * trie,UChar32 c,uint32_t value,UErrorCode * pErrorCode)690 utrie2_set32ForLeadSurrogateCodeUnit(UTrie2 *trie,
691                                      UChar32 c, uint32_t value,
692                                      UErrorCode *pErrorCode) {
693     if(U_FAILURE(*pErrorCode)) {
694         return;
695     }
696     if(!U_IS_LEAD(c)) {
697         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
698         return;
699     }
700     set32(trie->newTrie, c, FALSE, value, pErrorCode);
701 }
702 
703 static void
writeBlock(uint32_t * block,uint32_t value)704 writeBlock(uint32_t *block, uint32_t value) {
705     uint32_t *limit=block+UTRIE2_DATA_BLOCK_LENGTH;
706     while(block<limit) {
707         *block++=value;
708     }
709 }
710 
711 /**
712  * initialValue is ignored if overwrite=TRUE
713  * @internal
714  */
715 static void
fillBlock(uint32_t * block,UChar32 start,UChar32 limit,uint32_t value,uint32_t initialValue,UBool overwrite)716 fillBlock(uint32_t *block, UChar32 start, UChar32 limit,
717           uint32_t value, uint32_t initialValue, UBool overwrite) {
718     uint32_t *pLimit;
719 
720     pLimit=block+limit;
721     block+=start;
722     if(overwrite) {
723         while(block<pLimit) {
724             *block++=value;
725         }
726     } else {
727         while(block<pLimit) {
728             if(*block==initialValue) {
729                 *block=value;
730             }
731             ++block;
732         }
733     }
734 }
735 
736 U_CAPI void U_EXPORT2
utrie2_setRange32(UTrie2 * trie,UChar32 start,UChar32 end,uint32_t value,UBool overwrite,UErrorCode * pErrorCode)737 utrie2_setRange32(UTrie2 *trie,
738                   UChar32 start, UChar32 end,
739                   uint32_t value, UBool overwrite,
740                   UErrorCode *pErrorCode) {
741     /*
742      * repeat value in [start..end]
743      * mark index values for repeat-data blocks by setting bit 31 of the index values
744      * fill around existing values if any, if(overwrite)
745      */
746     UNewTrie2 *newTrie;
747     int32_t block, rest, repeatBlock;
748     UChar32 limit;
749 
750     if(U_FAILURE(*pErrorCode)) {
751         return;
752     }
753     if((uint32_t)start>0x10ffff || (uint32_t)end>0x10ffff || start>end) {
754         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
755         return;
756     }
757     newTrie=trie->newTrie;
758     if(newTrie==NULL || newTrie->isCompacted) {
759         *pErrorCode=U_NO_WRITE_PERMISSION;
760         return;
761     }
762 #ifdef UCPTRIE_DEBUG
763     umutablecptrie_setRange(newTrie->t3, start, end, value, pErrorCode);
764 #endif
765     if(!overwrite && value==newTrie->initialValue) {
766         return; /* nothing to do */
767     }
768 
769     limit=end+1;
770     if(start&UTRIE2_DATA_MASK) {
771         UChar32 nextStart;
772 
773         /* set partial block at [start..following block boundary[ */
774         block=getDataBlock(newTrie, start, TRUE);
775         if(block<0) {
776             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
777             return;
778         }
779 
780         nextStart=(start+UTRIE2_DATA_MASK)&~UTRIE2_DATA_MASK;
781         if(nextStart<=limit) {
782             fillBlock(newTrie->data+block, start&UTRIE2_DATA_MASK, UTRIE2_DATA_BLOCK_LENGTH,
783                       value, newTrie->initialValue, overwrite);
784             start=nextStart;
785         } else {
786             fillBlock(newTrie->data+block, start&UTRIE2_DATA_MASK, limit&UTRIE2_DATA_MASK,
787                       value, newTrie->initialValue, overwrite);
788             return;
789         }
790     }
791 
792     /* number of positions in the last, partial block */
793     rest=limit&UTRIE2_DATA_MASK;
794 
795     /* round down limit to a block boundary */
796     limit&=~UTRIE2_DATA_MASK;
797 
798     /* iterate over all-value blocks */
799     if(value==newTrie->initialValue) {
800         repeatBlock=newTrie->dataNullOffset;
801     } else {
802         repeatBlock=-1;
803     }
804 
805     while(start<limit) {
806         int32_t i2;
807         UBool setRepeatBlock=FALSE;
808 
809         if(value==newTrie->initialValue && isInNullBlock(newTrie, start, TRUE)) {
810             start+=UTRIE2_DATA_BLOCK_LENGTH; /* nothing to do */
811             continue;
812         }
813 
814         /* get index value */
815         i2=getIndex2Block(newTrie, start, TRUE);
816         if(i2<0) {
817             *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
818             return;
819         }
820         i2+=(start>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK;
821         block=newTrie->index2[i2];
822         if(isWritableBlock(newTrie, block)) {
823             /* already allocated */
824             if(overwrite && block>=UNEWTRIE2_DATA_0800_OFFSET) {
825                 /*
826                  * We overwrite all values, and it's not a
827                  * protected (ASCII-linear or 2-byte UTF-8) block:
828                  * replace with the repeatBlock.
829                  */
830                 setRepeatBlock=TRUE;
831             } else {
832                 /* !overwrite, or protected block: just write the values into this block */
833                 fillBlock(newTrie->data+block,
834                           0, UTRIE2_DATA_BLOCK_LENGTH,
835                           value, newTrie->initialValue, overwrite);
836             }
837         } else if(newTrie->data[block]!=value && (overwrite || block==newTrie->dataNullOffset)) {
838             /*
839              * Set the repeatBlock instead of the null block or previous repeat block:
840              *
841              * If !isWritableBlock() then all entries in the block have the same value
842              * because it's the null block or a range block (the repeatBlock from a previous
843              * call to utrie2_setRange32()).
844              * No other blocks are used multiple times before compacting.
845              *
846              * The null block is the only non-writable block with the initialValue because
847              * of the repeatBlock initialization above. (If value==initialValue, then
848              * the repeatBlock will be the null data block.)
849              *
850              * We set our repeatBlock if the desired value differs from the block's value,
851              * and if we overwrite any data or if the data is all initial values
852              * (which is the same as the block being the null block, see above).
853              */
854             setRepeatBlock=TRUE;
855         }
856         if(setRepeatBlock) {
857             if(repeatBlock>=0) {
858                 setIndex2Entry(newTrie, i2, repeatBlock);
859             } else {
860                 /* create and set and fill the repeatBlock */
861                 repeatBlock=getDataBlock(newTrie, start, TRUE);
862                 if(repeatBlock<0) {
863                     *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
864                     return;
865                 }
866                 writeBlock(newTrie->data+repeatBlock, value);
867             }
868         }
869 
870         start+=UTRIE2_DATA_BLOCK_LENGTH;
871     }
872 
873     if(rest>0) {
874         /* set partial block at [last block boundary..limit[ */
875         block=getDataBlock(newTrie, start, TRUE);
876         if(block<0) {
877             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
878             return;
879         }
880 
881         fillBlock(newTrie->data+block, 0, rest, value, newTrie->initialValue, overwrite);
882     }
883 
884     return;
885 }
886 
887 /* compaction --------------------------------------------------------------- */
888 
889 static inline UBool
equal_int32(const int32_t * s,const int32_t * t,int32_t length)890 equal_int32(const int32_t *s, const int32_t *t, int32_t length) {
891     while(length>0 && *s==*t) {
892         ++s;
893         ++t;
894         --length;
895     }
896     return (UBool)(length==0);
897 }
898 
899 static inline UBool
equal_uint32(const uint32_t * s,const uint32_t * t,int32_t length)900 equal_uint32(const uint32_t *s, const uint32_t *t, int32_t length) {
901     while(length>0 && *s==*t) {
902         ++s;
903         ++t;
904         --length;
905     }
906     return (UBool)(length==0);
907 }
908 
909 static int32_t
findSameIndex2Block(const int32_t * idx,int32_t index2Length,int32_t otherBlock)910 findSameIndex2Block(const int32_t *idx, int32_t index2Length, int32_t otherBlock) {
911     int32_t block;
912 
913     /* ensure that we do not even partially get past index2Length */
914     index2Length-=UTRIE2_INDEX_2_BLOCK_LENGTH;
915 
916     for(block=0; block<=index2Length; ++block) {
917         if(equal_int32(idx+block, idx+otherBlock, UTRIE2_INDEX_2_BLOCK_LENGTH)) {
918             return block;
919         }
920     }
921     return -1;
922 }
923 
924 static int32_t
findSameDataBlock(const uint32_t * data,int32_t dataLength,int32_t otherBlock,int32_t blockLength)925 findSameDataBlock(const uint32_t *data, int32_t dataLength, int32_t otherBlock, int32_t blockLength) {
926     int32_t block;
927 
928     /* ensure that we do not even partially get past dataLength */
929     dataLength-=blockLength;
930 
931     for(block=0; block<=dataLength; block+=UTRIE2_DATA_GRANULARITY) {
932         if(equal_uint32(data+block, data+otherBlock, blockLength)) {
933             return block;
934         }
935     }
936     return -1;
937 }
938 
939 /*
940  * Find the start of the last range in the trie by enumerating backward.
941  * Indexes for supplementary code points higher than this will be omitted.
942  */
943 static UChar32
findHighStart(UNewTrie2 * trie,uint32_t highValue)944 findHighStart(UNewTrie2 *trie, uint32_t highValue) {
945     const uint32_t *data32;
946 
947     uint32_t value, initialValue;
948     UChar32 c, prev;
949     int32_t i1, i2, j, i2Block, prevI2Block, index2NullOffset, block, prevBlock, nullBlock;
950 
951     data32=trie->data;
952     initialValue=trie->initialValue;
953 
954     index2NullOffset=trie->index2NullOffset;
955     nullBlock=trie->dataNullOffset;
956 
957     /* set variables for previous range */
958     if(highValue==initialValue) {
959         prevI2Block=index2NullOffset;
960         prevBlock=nullBlock;
961     } else {
962         prevI2Block=-1;
963         prevBlock=-1;
964     }
965     prev=0x110000;
966 
967     /* enumerate index-2 blocks */
968     i1=UNEWTRIE2_INDEX_1_LENGTH;
969     c=prev;
970     while(c>0) {
971         i2Block=trie->index1[--i1];
972         if(i2Block==prevI2Block) {
973             /* the index-2 block is the same as the previous one, and filled with highValue */
974             c-=UTRIE2_CP_PER_INDEX_1_ENTRY;
975             continue;
976         }
977         prevI2Block=i2Block;
978         if(i2Block==index2NullOffset) {
979             /* this is the null index-2 block */
980             if(highValue!=initialValue) {
981                 return c;
982             }
983             c-=UTRIE2_CP_PER_INDEX_1_ENTRY;
984         } else {
985             /* enumerate data blocks for one index-2 block */
986             for(i2=UTRIE2_INDEX_2_BLOCK_LENGTH; i2>0;) {
987                 block=trie->index2[i2Block+ --i2];
988                 if(block==prevBlock) {
989                     /* the block is the same as the previous one, and filled with highValue */
990                     c-=UTRIE2_DATA_BLOCK_LENGTH;
991                     continue;
992                 }
993                 prevBlock=block;
994                 if(block==nullBlock) {
995                     /* this is the null data block */
996                     if(highValue!=initialValue) {
997                         return c;
998                     }
999                     c-=UTRIE2_DATA_BLOCK_LENGTH;
1000                 } else {
1001                     for(j=UTRIE2_DATA_BLOCK_LENGTH; j>0;) {
1002                         value=data32[block+ --j];
1003                         if(value!=highValue) {
1004                             return c;
1005                         }
1006                         --c;
1007                     }
1008                 }
1009             }
1010         }
1011     }
1012 
1013     /* deliver last range */
1014     return 0;
1015 }
1016 
1017 /*
1018  * Compact a build-time trie.
1019  *
1020  * The compaction
1021  * - removes blocks that are identical with earlier ones
1022  * - overlaps adjacent blocks as much as possible (if overlap==TRUE)
1023  * - moves blocks in steps of the data granularity
1024  * - moves and overlaps blocks that overlap with multiple values in the overlap region
1025  *
1026  * It does not
1027  * - try to move and overlap blocks that are not already adjacent
1028  */
1029 static void
compactData(UNewTrie2 * trie)1030 compactData(UNewTrie2 *trie) {
1031 #ifdef UTRIE2_DEBUG
1032     int32_t countSame=0, sumOverlaps=0;
1033 #endif
1034 
1035     int32_t start, newStart, movedStart;
1036     int32_t blockLength, overlap;
1037     int32_t i, mapIndex, blockCount;
1038 
1039     /* do not compact linear-ASCII data */
1040     newStart=UTRIE2_DATA_START_OFFSET;
1041     for(start=0, i=0; start<newStart; start+=UTRIE2_DATA_BLOCK_LENGTH, ++i) {
1042         trie->map[i]=start;
1043     }
1044 
1045     /*
1046      * Start with a block length of 64 for 2-byte UTF-8,
1047      * then switch to UTRIE2_DATA_BLOCK_LENGTH.
1048      */
1049     blockLength=64;
1050     blockCount=blockLength>>UTRIE2_SHIFT_2;
1051     for(start=newStart; start<trie->dataLength;) {
1052         /*
1053          * start: index of first entry of current block
1054          * newStart: index where the current block is to be moved
1055          *           (right after current end of already-compacted data)
1056          */
1057         if(start==UNEWTRIE2_DATA_0800_OFFSET) {
1058             blockLength=UTRIE2_DATA_BLOCK_LENGTH;
1059             blockCount=1;
1060         }
1061 
1062         /* skip blocks that are not used */
1063         if(trie->map[start>>UTRIE2_SHIFT_2]<=0) {
1064             /* advance start to the next block */
1065             start+=blockLength;
1066 
1067             /* leave newStart with the previous block! */
1068             continue;
1069         }
1070 
1071         /* search for an identical block */
1072         if( (movedStart=findSameDataBlock(trie->data, newStart, start, blockLength))
1073              >=0
1074         ) {
1075 #ifdef UTRIE2_DEBUG
1076             ++countSame;
1077 #endif
1078             /* found an identical block, set the other block's index value for the current block */
1079             for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) {
1080                 trie->map[mapIndex++]=movedStart;
1081                 movedStart+=UTRIE2_DATA_BLOCK_LENGTH;
1082             }
1083 
1084             /* advance start to the next block */
1085             start+=blockLength;
1086 
1087             /* leave newStart with the previous block! */
1088             continue;
1089         }
1090 
1091         /* see if the beginning of this block can be overlapped with the end of the previous block */
1092         /* look for maximum overlap (modulo granularity) with the previous, adjacent block */
1093         for(overlap=blockLength-UTRIE2_DATA_GRANULARITY;
1094             overlap>0 && !equal_uint32(trie->data+(newStart-overlap), trie->data+start, overlap);
1095             overlap-=UTRIE2_DATA_GRANULARITY) {}
1096 
1097 #ifdef UTRIE2_DEBUG
1098             sumOverlaps+=overlap;
1099 #endif
1100         if(overlap>0 || newStart<start) {
1101             /* some overlap, or just move the whole block */
1102             movedStart=newStart-overlap;
1103             for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) {
1104                 trie->map[mapIndex++]=movedStart;
1105                 movedStart+=UTRIE2_DATA_BLOCK_LENGTH;
1106             }
1107 
1108             /* move the non-overlapping indexes to their new positions */
1109             start+=overlap;
1110             for(i=blockLength-overlap; i>0; --i) {
1111                 trie->data[newStart++]=trie->data[start++];
1112             }
1113         } else /* no overlap && newStart==start */ {
1114             for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) {
1115                 trie->map[mapIndex++]=start;
1116                 start+=UTRIE2_DATA_BLOCK_LENGTH;
1117             }
1118             newStart=start;
1119         }
1120     }
1121 
1122     /* now adjust the index-2 table */
1123     for(i=0; i<trie->index2Length; ++i) {
1124         if(i==UNEWTRIE2_INDEX_GAP_OFFSET) {
1125             /* Gap indexes are invalid (-1). Skip over the gap. */
1126             i+=UNEWTRIE2_INDEX_GAP_LENGTH;
1127         }
1128         trie->index2[i]=trie->map[trie->index2[i]>>UTRIE2_SHIFT_2];
1129     }
1130     trie->dataNullOffset=trie->map[trie->dataNullOffset>>UTRIE2_SHIFT_2];
1131 
1132     /* ensure dataLength alignment */
1133     while((newStart&(UTRIE2_DATA_GRANULARITY-1))!=0) {
1134         trie->data[newStart++]=trie->initialValue;
1135     }
1136 
1137 #ifdef UTRIE2_DEBUG
1138     /* we saved some space */
1139     printf("compacting UTrie2: count of 32-bit data words %lu->%lu  countSame=%ld  sumOverlaps=%ld\n",
1140             (long)trie->dataLength, (long)newStart, (long)countSame, (long)sumOverlaps);
1141 #endif
1142 
1143     trie->dataLength=newStart;
1144 }
1145 
1146 static void
compactIndex2(UNewTrie2 * trie)1147 compactIndex2(UNewTrie2 *trie) {
1148     int32_t i, start, newStart, movedStart, overlap;
1149 
1150     /* do not compact linear-BMP index-2 blocks */
1151     newStart=UTRIE2_INDEX_2_BMP_LENGTH;
1152     for(start=0, i=0; start<newStart; start+=UTRIE2_INDEX_2_BLOCK_LENGTH, ++i) {
1153         trie->map[i]=start;
1154     }
1155 
1156     /* Reduce the index table gap to what will be needed at runtime. */
1157     newStart+=UTRIE2_UTF8_2B_INDEX_2_LENGTH+((trie->highStart-0x10000)>>UTRIE2_SHIFT_1);
1158 
1159     for(start=UNEWTRIE2_INDEX_2_NULL_OFFSET; start<trie->index2Length;) {
1160         /*
1161          * start: index of first entry of current block
1162          * newStart: index where the current block is to be moved
1163          *           (right after current end of already-compacted data)
1164          */
1165 
1166         /* search for an identical block */
1167         if( (movedStart=findSameIndex2Block(trie->index2, newStart, start))
1168              >=0
1169         ) {
1170             /* found an identical block, set the other block's index value for the current block */
1171             trie->map[start>>UTRIE2_SHIFT_1_2]=movedStart;
1172 
1173             /* advance start to the next block */
1174             start+=UTRIE2_INDEX_2_BLOCK_LENGTH;
1175 
1176             /* leave newStart with the previous block! */
1177             continue;
1178         }
1179 
1180         /* see if the beginning of this block can be overlapped with the end of the previous block */
1181         /* look for maximum overlap with the previous, adjacent block */
1182         for(overlap=UTRIE2_INDEX_2_BLOCK_LENGTH-1;
1183             overlap>0 && !equal_int32(trie->index2+(newStart-overlap), trie->index2+start, overlap);
1184             --overlap) {}
1185 
1186         if(overlap>0 || newStart<start) {
1187             /* some overlap, or just move the whole block */
1188             trie->map[start>>UTRIE2_SHIFT_1_2]=newStart-overlap;
1189 
1190             /* move the non-overlapping indexes to their new positions */
1191             start+=overlap;
1192             for(i=UTRIE2_INDEX_2_BLOCK_LENGTH-overlap; i>0; --i) {
1193                 trie->index2[newStart++]=trie->index2[start++];
1194             }
1195         } else /* no overlap && newStart==start */ {
1196             trie->map[start>>UTRIE2_SHIFT_1_2]=start;
1197             start+=UTRIE2_INDEX_2_BLOCK_LENGTH;
1198             newStart=start;
1199         }
1200     }
1201 
1202     /* now adjust the index-1 table */
1203     for(i=0; i<UNEWTRIE2_INDEX_1_LENGTH; ++i) {
1204         trie->index1[i]=trie->map[trie->index1[i]>>UTRIE2_SHIFT_1_2];
1205     }
1206     trie->index2NullOffset=trie->map[trie->index2NullOffset>>UTRIE2_SHIFT_1_2];
1207 
1208     /*
1209      * Ensure data table alignment:
1210      * Needs to be granularity-aligned for 16-bit trie
1211      * (so that dataMove will be down-shiftable),
1212      * and 2-aligned for uint32_t data.
1213      */
1214     while((newStart&((UTRIE2_DATA_GRANULARITY-1)|1))!=0) {
1215         /* Arbitrary value: 0x3fffc not possible for real data. */
1216         trie->index2[newStart++]=(int32_t)0xffff<<UTRIE2_INDEX_SHIFT;
1217     }
1218 
1219 #ifdef UTRIE2_DEBUG
1220     /* we saved some space */
1221     printf("compacting UTrie2: count of 16-bit index words %lu->%lu\n",
1222             (long)trie->index2Length, (long)newStart);
1223 #endif
1224 
1225     trie->index2Length=newStart;
1226 }
1227 
1228 static void
compactTrie(UTrie2 * trie,UErrorCode * pErrorCode)1229 compactTrie(UTrie2 *trie, UErrorCode *pErrorCode) {
1230     UNewTrie2 *newTrie;
1231     UChar32 highStart, suppHighStart;
1232     uint32_t highValue;
1233 
1234     newTrie=trie->newTrie;
1235 
1236     /* find highStart and round it up */
1237     highValue=utrie2_get32(trie, 0x10ffff);
1238     highStart=findHighStart(newTrie, highValue);
1239     highStart=(highStart+(UTRIE2_CP_PER_INDEX_1_ENTRY-1))&~(UTRIE2_CP_PER_INDEX_1_ENTRY-1);
1240     if(highStart==0x110000) {
1241         highValue=trie->errorValue;
1242     }
1243 
1244     /*
1245      * Set trie->highStart only after utrie2_get32(trie, highStart).
1246      * Otherwise utrie2_get32(trie, highStart) would try to read the highValue.
1247      */
1248     trie->highStart=newTrie->highStart=highStart;
1249 
1250 #ifdef UTRIE2_DEBUG
1251     printf("UTrie2: highStart U+%06lx  highValue 0x%lx  initialValue 0x%lx\n",
1252             (long)highStart, (long)highValue, (long)trie->initialValue);
1253 #endif
1254 
1255     if(highStart<0x110000) {
1256         /* Blank out [highStart..10ffff] to release associated data blocks. */
1257         suppHighStart= highStart<=0x10000 ? 0x10000 : highStart;
1258         utrie2_setRange32(trie, suppHighStart, 0x10ffff, trie->initialValue, TRUE, pErrorCode);
1259         if(U_FAILURE(*pErrorCode)) {
1260             return;
1261         }
1262     }
1263 
1264     compactData(newTrie);
1265     if(highStart>0x10000) {
1266         compactIndex2(newTrie);
1267 #ifdef UTRIE2_DEBUG
1268     } else {
1269         printf("UTrie2: highStart U+%04lx  count of 16-bit index words %lu->%lu\n",
1270                 (long)highStart, (long)trie->newTrie->index2Length, (long)UTRIE2_INDEX_1_OFFSET);
1271 #endif
1272     }
1273 
1274     /*
1275      * Store the highValue in the data array and round up the dataLength.
1276      * Must be done after compactData() because that assumes that dataLength
1277      * is a multiple of UTRIE2_DATA_BLOCK_LENGTH.
1278      */
1279     newTrie->data[newTrie->dataLength++]=highValue;
1280     while((newTrie->dataLength&(UTRIE2_DATA_GRANULARITY-1))!=0) {
1281         newTrie->data[newTrie->dataLength++]=trie->initialValue;
1282     }
1283 
1284     newTrie->isCompacted=TRUE;
1285 }
1286 
1287 /* serialization ------------------------------------------------------------ */
1288 
1289 /**
1290  * Maximum length of the runtime index array.
1291  * Limited by its own 16-bit index values, and by uint16_t UTrie2Header.indexLength.
1292  * (The actual maximum length is lower,
1293  * (0x110000>>UTRIE2_SHIFT_2)+UTRIE2_UTF8_2B_INDEX_2_LENGTH+UTRIE2_MAX_INDEX_1_LENGTH.)
1294  */
1295 #define UTRIE2_MAX_INDEX_LENGTH 0xffff
1296 
1297 /**
1298  * Maximum length of the runtime data array.
1299  * Limited by 16-bit index values that are left-shifted by UTRIE2_INDEX_SHIFT,
1300  * and by uint16_t UTrie2Header.shiftedDataLength.
1301  */
1302 #define UTRIE2_MAX_DATA_LENGTH (0xffff<<UTRIE2_INDEX_SHIFT)
1303 
1304 /* Compact and internally serialize the trie. */
1305 U_CAPI void U_EXPORT2
utrie2_freeze(UTrie2 * trie,UTrie2ValueBits valueBits,UErrorCode * pErrorCode)1306 utrie2_freeze(UTrie2 *trie, UTrie2ValueBits valueBits, UErrorCode *pErrorCode) {
1307     UNewTrie2 *newTrie;
1308     UTrie2Header *header;
1309     uint32_t *p;
1310     uint16_t *dest16;
1311     int32_t i, length;
1312     int32_t allIndexesLength;
1313     int32_t dataMove;  /* >0 if the data is moved to the end of the index array */
1314     UChar32 highStart;
1315 
1316     /* argument check */
1317     if(U_FAILURE(*pErrorCode)) {
1318         return;
1319     }
1320     if( trie==NULL ||
1321         valueBits<0 || UTRIE2_COUNT_VALUE_BITS<=valueBits
1322     ) {
1323         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1324         return;
1325     }
1326     newTrie=trie->newTrie;
1327     if(newTrie==NULL) {
1328         /* already frozen */
1329         UTrie2ValueBits frozenValueBits=
1330             trie->data16!=NULL ? UTRIE2_16_VALUE_BITS : UTRIE2_32_VALUE_BITS;
1331         if(valueBits!=frozenValueBits) {
1332             *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1333         }
1334         return;
1335     }
1336 
1337     /* compact if necessary */
1338     if(!newTrie->isCompacted) {
1339         compactTrie(trie, pErrorCode);
1340         if(U_FAILURE(*pErrorCode)) {
1341             return;
1342         }
1343     }
1344     highStart=trie->highStart;
1345 
1346     if(highStart<=0x10000) {
1347         allIndexesLength=UTRIE2_INDEX_1_OFFSET;
1348     } else {
1349         allIndexesLength=newTrie->index2Length;
1350     }
1351     if(valueBits==UTRIE2_16_VALUE_BITS) {
1352         dataMove=allIndexesLength;
1353     } else {
1354         dataMove=0;
1355     }
1356 
1357     /* are indexLength and dataLength within limits? */
1358     if( /* for unshifted indexLength */
1359         allIndexesLength>UTRIE2_MAX_INDEX_LENGTH ||
1360         /* for unshifted dataNullOffset */
1361         (dataMove+newTrie->dataNullOffset)>0xffff ||
1362         /* for unshifted 2-byte UTF-8 index-2 values */
1363         (dataMove+UNEWTRIE2_DATA_0800_OFFSET)>0xffff ||
1364         /* for shiftedDataLength */
1365         (dataMove+newTrie->dataLength)>UTRIE2_MAX_DATA_LENGTH
1366     ) {
1367         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1368         return;
1369     }
1370 
1371     /* calculate the total serialized length */
1372     length=sizeof(UTrie2Header)+allIndexesLength*2;
1373     if(valueBits==UTRIE2_16_VALUE_BITS) {
1374         length+=newTrie->dataLength*2;
1375     } else {
1376         length+=newTrie->dataLength*4;
1377     }
1378 
1379     trie->memory=uprv_malloc(length);
1380     if(trie->memory==NULL) {
1381         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1382         return;
1383     }
1384     trie->length=length;
1385     trie->isMemoryOwned=TRUE;
1386 
1387     trie->indexLength=allIndexesLength;
1388     trie->dataLength=newTrie->dataLength;
1389     if(highStart<=0x10000) {
1390         trie->index2NullOffset=0xffff;
1391     } else {
1392         trie->index2NullOffset=static_cast<uint16_t>(UTRIE2_INDEX_2_OFFSET+newTrie->index2NullOffset);
1393     }
1394     trie->dataNullOffset=(uint16_t)(dataMove+newTrie->dataNullOffset);
1395     trie->highValueIndex=dataMove+trie->dataLength-UTRIE2_DATA_GRANULARITY;
1396 
1397     /* set the header fields */
1398     header=(UTrie2Header *)trie->memory;
1399 
1400     header->signature=UTRIE2_SIG; /* "Tri2" */
1401     header->options=(uint16_t)valueBits;
1402 
1403     header->indexLength=(uint16_t)trie->indexLength;
1404     header->shiftedDataLength=(uint16_t)(trie->dataLength>>UTRIE2_INDEX_SHIFT);
1405     header->index2NullOffset=trie->index2NullOffset;
1406     header->dataNullOffset=trie->dataNullOffset;
1407     header->shiftedHighStart=(uint16_t)(highStart>>UTRIE2_SHIFT_1);
1408 
1409     /* fill the index and data arrays */
1410     dest16=(uint16_t *)(header+1);
1411     trie->index=dest16;
1412 
1413     /* write the index-2 array values shifted right by UTRIE2_INDEX_SHIFT, after adding dataMove */
1414     p=(uint32_t *)newTrie->index2;
1415     for(i=UTRIE2_INDEX_2_BMP_LENGTH; i>0; --i) {
1416         *dest16++=(uint16_t)((dataMove + *p++)>>UTRIE2_INDEX_SHIFT);
1417     }
1418 
1419     /* write UTF-8 2-byte index-2 values, not right-shifted */
1420     for(i=0; i<(0xc2-0xc0); ++i) {                                  /* C0..C1 */
1421         *dest16++=(uint16_t)(dataMove+UTRIE2_BAD_UTF8_DATA_OFFSET);
1422     }
1423     for(; i<(0xe0-0xc0); ++i) {                                     /* C2..DF */
1424         *dest16++=(uint16_t)(dataMove+newTrie->index2[i<<(6-UTRIE2_SHIFT_2)]);
1425     }
1426 
1427     if(highStart>0x10000) {
1428         int32_t index1Length=(highStart-0x10000)>>UTRIE2_SHIFT_1;
1429         int32_t index2Offset=UTRIE2_INDEX_2_BMP_LENGTH+UTRIE2_UTF8_2B_INDEX_2_LENGTH+index1Length;
1430 
1431         /* write 16-bit index-1 values for supplementary code points */
1432         p=(uint32_t *)newTrie->index1+UTRIE2_OMITTED_BMP_INDEX_1_LENGTH;
1433         for(i=index1Length; i>0; --i) {
1434             *dest16++=(uint16_t)(UTRIE2_INDEX_2_OFFSET + *p++);
1435         }
1436 
1437         /*
1438          * write the index-2 array values for supplementary code points,
1439          * shifted right by UTRIE2_INDEX_SHIFT, after adding dataMove
1440          */
1441         p=(uint32_t *)newTrie->index2+index2Offset;
1442         for(i=newTrie->index2Length-index2Offset; i>0; --i) {
1443             *dest16++=(uint16_t)((dataMove + *p++)>>UTRIE2_INDEX_SHIFT);
1444         }
1445     }
1446 
1447     /* write the 16/32-bit data array */
1448     switch(valueBits) {
1449     case UTRIE2_16_VALUE_BITS:
1450         /* write 16-bit data values */
1451         trie->data16=dest16;
1452         trie->data32=NULL;
1453         p=newTrie->data;
1454         for(i=newTrie->dataLength; i>0; --i) {
1455             *dest16++=(uint16_t)*p++;
1456         }
1457         break;
1458     case UTRIE2_32_VALUE_BITS:
1459         /* write 32-bit data values */
1460         trie->data16=NULL;
1461         trie->data32=(uint32_t *)dest16;
1462         uprv_memcpy(dest16, newTrie->data, (size_t)newTrie->dataLength*4);
1463         break;
1464     default:
1465         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1466         return;
1467     }
1468 
1469 #ifdef UTRIE2_DEBUG
1470     utrie2_printLengths(trie, "");
1471 #endif
1472 
1473 #ifdef UCPTRIE_DEBUG
1474     umutablecptrie_setName(newTrie->t3, trie->name);
1475     ucptrie_close(
1476         umutablecptrie_buildImmutable(
1477             newTrie->t3, UCPTRIE_TYPE_FAST, (UCPTrieValueWidth)valueBits, pErrorCode));
1478 #endif
1479     /* Delete the UNewTrie2. */
1480     uprv_free(newTrie->data);
1481     uprv_free(newTrie);
1482     trie->newTrie=NULL;
1483 }
1484