1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *   Copyright (C) 2010-2012, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 *******************************************************************************
8 *   file name:  ucharstriebuilder.h
9 *   encoding:   UTF-8
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2010nov14
14 *   created by: Markus W. Scherer
15 */
16 
17 #include "unicode/utypes.h"
18 #include "unicode/ucharstrie.h"
19 #include "unicode/ucharstriebuilder.h"
20 #include "unicode/unistr.h"
21 #include "unicode/ustring.h"
22 #include "cmemory.h"
23 #include "uarrsort.h"
24 #include "uassert.h"
25 #include "uhash.h"
26 #include "ustr_imp.h"
27 
28 U_NAMESPACE_BEGIN
29 
30 /*
31  * Note: This builder implementation stores (string, value) pairs with full copies
32  * of the 16-bit-unit sequences, until the UCharsTrie is built.
33  * It might(!) take less memory if we collected the data in a temporary, dynamic trie.
34  */
35 
36 class UCharsTrieElement : public UMemory {
37 public:
38     // Use compiler's default constructor, initializes nothing.
39 
40     void setTo(const UnicodeString &s, int32_t val, UnicodeString &strings, UErrorCode &errorCode);
41 
getString(const UnicodeString & strings) const42     UnicodeString getString(const UnicodeString &strings) const {
43         int32_t length=strings[stringOffset];
44         return strings.tempSubString(stringOffset+1, length);
45     }
getStringLength(const UnicodeString & strings) const46     int32_t getStringLength(const UnicodeString &strings) const {
47         return strings[stringOffset];
48     }
49 
charAt(int32_t index,const UnicodeString & strings) const50     UChar charAt(int32_t index, const UnicodeString &strings) const {
51         return strings[stringOffset+1+index];
52     }
53 
getValue() const54     int32_t getValue() const { return value; }
55 
56     int32_t compareStringTo(const UCharsTrieElement &o, const UnicodeString &strings) const;
57 
58 private:
59     // The first strings unit contains the string length.
60     // (Compared with a stringLength field here, this saves 2 bytes per string.)
61     int32_t stringOffset;
62     int32_t value;
63 };
64 
65 void
setTo(const UnicodeString & s,int32_t val,UnicodeString & strings,UErrorCode & errorCode)66 UCharsTrieElement::setTo(const UnicodeString &s, int32_t val,
67                          UnicodeString &strings, UErrorCode &errorCode) {
68     if(U_FAILURE(errorCode)) {
69         return;
70     }
71     int32_t length=s.length();
72     if(length>0xffff) {
73         // Too long: We store the length in 1 unit.
74         errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
75         return;
76     }
77     stringOffset=strings.length();
78     strings.append((UChar)length);
79     value=val;
80     strings.append(s);
81 }
82 
83 int32_t
compareStringTo(const UCharsTrieElement & other,const UnicodeString & strings) const84 UCharsTrieElement::compareStringTo(const UCharsTrieElement &other, const UnicodeString &strings) const {
85     return getString(strings).compare(other.getString(strings));
86 }
87 
UCharsTrieBuilder(UErrorCode &)88 UCharsTrieBuilder::UCharsTrieBuilder(UErrorCode & /*errorCode*/)
89         : elements(NULL), elementsCapacity(0), elementsLength(0),
90           uchars(NULL), ucharsCapacity(0), ucharsLength(0) {}
91 
~UCharsTrieBuilder()92 UCharsTrieBuilder::~UCharsTrieBuilder() {
93     delete[] elements;
94     uprv_free(uchars);
95 }
96 
97 UCharsTrieBuilder &
add(const UnicodeString & s,int32_t value,UErrorCode & errorCode)98 UCharsTrieBuilder::add(const UnicodeString &s, int32_t value, UErrorCode &errorCode) {
99     if(U_FAILURE(errorCode)) {
100         return *this;
101     }
102     if(ucharsLength>0) {
103         // Cannot add elements after building.
104         errorCode=U_NO_WRITE_PERMISSION;
105         return *this;
106     }
107     if(elementsLength==elementsCapacity) {
108         int32_t newCapacity;
109         if(elementsCapacity==0) {
110             newCapacity=1024;
111         } else {
112             newCapacity=4*elementsCapacity;
113         }
114         UCharsTrieElement *newElements=new UCharsTrieElement[newCapacity];
115         if(newElements==NULL) {
116             errorCode=U_MEMORY_ALLOCATION_ERROR;
117             return *this;
118         }
119         if(elementsLength>0) {
120             uprv_memcpy(newElements, elements, (size_t)elementsLength*sizeof(UCharsTrieElement));
121         }
122         delete[] elements;
123         elements=newElements;
124         elementsCapacity=newCapacity;
125     }
126     elements[elementsLength++].setTo(s, value, strings, errorCode);
127     if(U_SUCCESS(errorCode) && strings.isBogus()) {
128         errorCode=U_MEMORY_ALLOCATION_ERROR;
129     }
130     return *this;
131 }
132 
133 U_CDECL_BEGIN
134 
135 static int32_t U_CALLCONV
compareElementStrings(const void * context,const void * left,const void * right)136 compareElementStrings(const void *context, const void *left, const void *right) {
137     const UnicodeString *strings=static_cast<const UnicodeString *>(context);
138     const UCharsTrieElement *leftElement=static_cast<const UCharsTrieElement *>(left);
139     const UCharsTrieElement *rightElement=static_cast<const UCharsTrieElement *>(right);
140     return leftElement->compareStringTo(*rightElement, *strings);
141 }
142 
143 U_CDECL_END
144 
145 UCharsTrie *
build(UStringTrieBuildOption buildOption,UErrorCode & errorCode)146 UCharsTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
147     buildUChars(buildOption, errorCode);
148     UCharsTrie *newTrie=NULL;
149     if(U_SUCCESS(errorCode)) {
150         newTrie=new UCharsTrie(uchars, uchars+(ucharsCapacity-ucharsLength));
151         if(newTrie==NULL) {
152             errorCode=U_MEMORY_ALLOCATION_ERROR;
153         } else {
154             uchars=NULL;  // The new trie now owns the array.
155             ucharsCapacity=0;
156         }
157     }
158     return newTrie;
159 }
160 
161 UnicodeString &
buildUnicodeString(UStringTrieBuildOption buildOption,UnicodeString & result,UErrorCode & errorCode)162 UCharsTrieBuilder::buildUnicodeString(UStringTrieBuildOption buildOption, UnicodeString &result,
163                                       UErrorCode &errorCode) {
164     buildUChars(buildOption, errorCode);
165     if(U_SUCCESS(errorCode)) {
166         result.setTo(FALSE, uchars+(ucharsCapacity-ucharsLength), ucharsLength);
167     }
168     return result;
169 }
170 
171 void
buildUChars(UStringTrieBuildOption buildOption,UErrorCode & errorCode)172 UCharsTrieBuilder::buildUChars(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
173     if(U_FAILURE(errorCode)) {
174         return;
175     }
176     if(uchars!=NULL && ucharsLength>0) {
177         // Already built.
178         return;
179     }
180     if(ucharsLength==0) {
181         if(elementsLength==0) {
182             errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
183             return;
184         }
185         if(strings.isBogus()) {
186             errorCode=U_MEMORY_ALLOCATION_ERROR;
187             return;
188         }
189         uprv_sortArray(elements, elementsLength, (int32_t)sizeof(UCharsTrieElement),
190                       compareElementStrings, &strings,
191                       FALSE,  // need not be a stable sort
192                       &errorCode);
193         if(U_FAILURE(errorCode)) {
194             return;
195         }
196         // Duplicate strings are not allowed.
197         UnicodeString prev=elements[0].getString(strings);
198         for(int32_t i=1; i<elementsLength; ++i) {
199             UnicodeString current=elements[i].getString(strings);
200             if(prev==current) {
201                 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
202                 return;
203             }
204             prev.fastCopyFrom(current);
205         }
206     }
207     // Create and UChar-serialize the trie for the elements.
208     ucharsLength=0;
209     int32_t capacity=strings.length();
210     if(capacity<1024) {
211         capacity=1024;
212     }
213     if(ucharsCapacity<capacity) {
214         uprv_free(uchars);
215         uchars=static_cast<UChar *>(uprv_malloc(capacity*2));
216         if(uchars==NULL) {
217             errorCode=U_MEMORY_ALLOCATION_ERROR;
218             ucharsCapacity=0;
219             return;
220         }
221         ucharsCapacity=capacity;
222     }
223     StringTrieBuilder::build(buildOption, elementsLength, errorCode);
224     if(uchars==NULL) {
225         errorCode=U_MEMORY_ALLOCATION_ERROR;
226     }
227 }
228 
229 int32_t
getElementStringLength(int32_t i) const230 UCharsTrieBuilder::getElementStringLength(int32_t i) const {
231     return elements[i].getStringLength(strings);
232 }
233 
234 UChar
getElementUnit(int32_t i,int32_t unitIndex) const235 UCharsTrieBuilder::getElementUnit(int32_t i, int32_t unitIndex) const {
236     return elements[i].charAt(unitIndex, strings);
237 }
238 
239 int32_t
getElementValue(int32_t i) const240 UCharsTrieBuilder::getElementValue(int32_t i) const {
241     return elements[i].getValue();
242 }
243 
244 int32_t
getLimitOfLinearMatch(int32_t first,int32_t last,int32_t unitIndex) const245 UCharsTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const {
246     const UCharsTrieElement &firstElement=elements[first];
247     const UCharsTrieElement &lastElement=elements[last];
248     int32_t minStringLength=firstElement.getStringLength(strings);
249     while(++unitIndex<minStringLength &&
250             firstElement.charAt(unitIndex, strings)==
251             lastElement.charAt(unitIndex, strings)) {}
252     return unitIndex;
253 }
254 
255 int32_t
countElementUnits(int32_t start,int32_t limit,int32_t unitIndex) const256 UCharsTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const {
257     int32_t length=0;  // Number of different units at unitIndex.
258     int32_t i=start;
259     do {
260         UChar unit=elements[i++].charAt(unitIndex, strings);
261         while(i<limit && unit==elements[i].charAt(unitIndex, strings)) {
262             ++i;
263         }
264         ++length;
265     } while(i<limit);
266     return length;
267 }
268 
269 int32_t
skipElementsBySomeUnits(int32_t i,int32_t unitIndex,int32_t count) const270 UCharsTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const {
271     do {
272         UChar unit=elements[i++].charAt(unitIndex, strings);
273         while(unit==elements[i].charAt(unitIndex, strings)) {
274             ++i;
275         }
276     } while(--count>0);
277     return i;
278 }
279 
280 int32_t
indexOfElementWithNextUnit(int32_t i,int32_t unitIndex,UChar unit) const281 UCharsTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const {
282     while(unit==elements[i].charAt(unitIndex, strings)) {
283         ++i;
284     }
285     return i;
286 }
287 
UCTLinearMatchNode(const UChar * units,int32_t len,Node * nextNode)288 UCharsTrieBuilder::UCTLinearMatchNode::UCTLinearMatchNode(const UChar *units, int32_t len, Node *nextNode)
289         : LinearMatchNode(len, nextNode), s(units) {
290     hash=hash*37u+ustr_hashUCharsN(units, len);
291 }
292 
293 UBool
operator ==(const Node & other) const294 UCharsTrieBuilder::UCTLinearMatchNode::operator==(const Node &other) const {
295     if(this==&other) {
296         return TRUE;
297     }
298     if(!LinearMatchNode::operator==(other)) {
299         return FALSE;
300     }
301     const UCTLinearMatchNode &o=(const UCTLinearMatchNode &)other;
302     return 0==u_memcmp(s, o.s, length);
303 }
304 
305 void
write(StringTrieBuilder & builder)306 UCharsTrieBuilder::UCTLinearMatchNode::write(StringTrieBuilder &builder) {
307     UCharsTrieBuilder &b=(UCharsTrieBuilder &)builder;
308     next->write(builder);
309     b.write(s, length);
310     offset=b.writeValueAndType(hasValue, value, b.getMinLinearMatch()+length-1);
311 }
312 
313 StringTrieBuilder::Node *
createLinearMatchNode(int32_t i,int32_t unitIndex,int32_t length,Node * nextNode) const314 UCharsTrieBuilder::createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
315                                          Node *nextNode) const {
316     return new UCTLinearMatchNode(
317             elements[i].getString(strings).getBuffer()+unitIndex,
318             length,
319             nextNode);
320 }
321 
322 UBool
ensureCapacity(int32_t length)323 UCharsTrieBuilder::ensureCapacity(int32_t length) {
324     if(uchars==NULL) {
325         return FALSE;  // previous memory allocation had failed
326     }
327     if(length>ucharsCapacity) {
328         int32_t newCapacity=ucharsCapacity;
329         do {
330             newCapacity*=2;
331         } while(newCapacity<=length);
332         UChar *newUChars=static_cast<UChar *>(uprv_malloc(newCapacity*2));
333         if(newUChars==NULL) {
334             // unable to allocate memory
335             uprv_free(uchars);
336             uchars=NULL;
337             ucharsCapacity=0;
338             return FALSE;
339         }
340         u_memcpy(newUChars+(newCapacity-ucharsLength),
341                  uchars+(ucharsCapacity-ucharsLength), ucharsLength);
342         uprv_free(uchars);
343         uchars=newUChars;
344         ucharsCapacity=newCapacity;
345     }
346     return TRUE;
347 }
348 
349 int32_t
write(int32_t unit)350 UCharsTrieBuilder::write(int32_t unit) {
351     int32_t newLength=ucharsLength+1;
352     if(ensureCapacity(newLength)) {
353         ucharsLength=newLength;
354         uchars[ucharsCapacity-ucharsLength]=(UChar)unit;
355     }
356     return ucharsLength;
357 }
358 
359 int32_t
write(const UChar * s,int32_t length)360 UCharsTrieBuilder::write(const UChar *s, int32_t length) {
361     int32_t newLength=ucharsLength+length;
362     if(ensureCapacity(newLength)) {
363         ucharsLength=newLength;
364         u_memcpy(uchars+(ucharsCapacity-ucharsLength), s, length);
365     }
366     return ucharsLength;
367 }
368 
369 int32_t
writeElementUnits(int32_t i,int32_t unitIndex,int32_t length)370 UCharsTrieBuilder::writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) {
371     return write(elements[i].getString(strings).getBuffer()+unitIndex, length);
372 }
373 
374 int32_t
writeValueAndFinal(int32_t i,UBool isFinal)375 UCharsTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) {
376     if(0<=i && i<=UCharsTrie::kMaxOneUnitValue) {
377         return write(i|(isFinal<<15));
378     }
379     UChar intUnits[3];
380     int32_t length;
381     if(i<0 || i>UCharsTrie::kMaxTwoUnitValue) {
382         intUnits[0]=(UChar)(UCharsTrie::kThreeUnitValueLead);
383         intUnits[1]=(UChar)((uint32_t)i>>16);
384         intUnits[2]=(UChar)i;
385         length=3;
386     // } else if(i<=UCharsTrie::kMaxOneUnitValue) {
387     //     intUnits[0]=(UChar)(i);
388     //     length=1;
389     } else {
390         intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitValueLead+(i>>16));
391         intUnits[1]=(UChar)i;
392         length=2;
393     }
394     intUnits[0]=(UChar)(intUnits[0]|(isFinal<<15));
395     return write(intUnits, length);
396 }
397 
398 int32_t
writeValueAndType(UBool hasValue,int32_t value,int32_t node)399 UCharsTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) {
400     if(!hasValue) {
401         return write(node);
402     }
403     UChar intUnits[3];
404     int32_t length;
405     if(value<0 || value>UCharsTrie::kMaxTwoUnitNodeValue) {
406         intUnits[0]=(UChar)(UCharsTrie::kThreeUnitNodeValueLead);
407         intUnits[1]=(UChar)((uint32_t)value>>16);
408         intUnits[2]=(UChar)value;
409         length=3;
410     } else if(value<=UCharsTrie::kMaxOneUnitNodeValue) {
411         intUnits[0]=(UChar)((value+1)<<6);
412         length=1;
413     } else {
414         intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitNodeValueLead+((value>>10)&0x7fc0));
415         intUnits[1]=(UChar)value;
416         length=2;
417     }
418     intUnits[0]|=(UChar)node;
419     return write(intUnits, length);
420 }
421 
422 int32_t
writeDeltaTo(int32_t jumpTarget)423 UCharsTrieBuilder::writeDeltaTo(int32_t jumpTarget) {
424     int32_t i=ucharsLength-jumpTarget;
425     U_ASSERT(i>=0);
426     if(i<=UCharsTrie::kMaxOneUnitDelta) {
427         return write(i);
428     }
429     UChar intUnits[3];
430     int32_t length;
431     if(i<=UCharsTrie::kMaxTwoUnitDelta) {
432         intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitDeltaLead+(i>>16));
433         length=1;
434     } else {
435         intUnits[0]=(UChar)(UCharsTrie::kThreeUnitDeltaLead);
436         intUnits[1]=(UChar)(i>>16);
437         length=2;
438     }
439     intUnits[length++]=(UChar)i;
440     return write(intUnits, length);
441 }
442 
443 U_NAMESPACE_END
444