1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *   Copyright (C) 2010-2012, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 *******************************************************************************
8 *   file name:  bytestriebuilder.cpp
9 *   encoding:   UTF-8
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2010sep25
14 *   created by: Markus W. Scherer
15 */
16 
17 #include "unicode/utypes.h"
18 #include "unicode/bytestrie.h"
19 #include "unicode/bytestriebuilder.h"
20 #include "unicode/stringpiece.h"
21 #include "charstr.h"
22 #include "cmemory.h"
23 #include "uhash.h"
24 #include "uarrsort.h"
25 #include "uassert.h"
26 #include "ustr_imp.h"
27 
28 U_NAMESPACE_BEGIN
29 
30 /*
31  * Note: This builder implementation stores (bytes, value) pairs with full copies
32  * of the byte sequences, until the BytesTrie is built.
33  * It might(!) take less memory if we collected the data in a temporary, dynamic trie.
34  */
35 
36 class BytesTrieElement : public UMemory {
37 public:
38     // Use compiler's default constructor, initializes nothing.
39 
40     void setTo(StringPiece s, int32_t val, CharString &strings, UErrorCode &errorCode);
41 
getString(const CharString & strings) const42     StringPiece getString(const CharString &strings) const {
43         int32_t offset=stringOffset;
44         int32_t length;
45         if(offset>=0) {
46             length=(uint8_t)strings[offset++];
47         } else {
48             offset=~offset;
49             length=((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1];
50             offset+=2;
51         }
52         return StringPiece(strings.data()+offset, length);
53     }
getStringLength(const CharString & strings) const54     int32_t getStringLength(const CharString &strings) const {
55         int32_t offset=stringOffset;
56         if(offset>=0) {
57             return (uint8_t)strings[offset];
58         } else {
59             offset=~offset;
60             return ((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1];
61         }
62     }
63 
charAt(int32_t index,const CharString & strings) const64     char charAt(int32_t index, const CharString &strings) const { return data(strings)[index]; }
65 
getValue() const66     int32_t getValue() const { return value; }
67 
68     int32_t compareStringTo(const BytesTrieElement &o, const CharString &strings) const;
69 
70 private:
data(const CharString & strings) const71     const char *data(const CharString &strings) const {
72         int32_t offset=stringOffset;
73         if(offset>=0) {
74             ++offset;
75         } else {
76             offset=~offset+2;
77         }
78         return strings.data()+offset;
79     }
80 
81     // If the stringOffset is non-negative, then the first strings byte contains
82     // the string length.
83     // If the stringOffset is negative, then the first two strings bytes contain
84     // the string length (big-endian), and the offset needs to be bit-inverted.
85     // (Compared with a stringLength field here, this saves 3 bytes per string for most strings.)
86     int32_t stringOffset;
87     int32_t value;
88 };
89 
90 void
setTo(StringPiece s,int32_t val,CharString & strings,UErrorCode & errorCode)91 BytesTrieElement::setTo(StringPiece s, int32_t val,
92                         CharString &strings, UErrorCode &errorCode) {
93     if(U_FAILURE(errorCode)) {
94         return;
95     }
96     int32_t length=s.length();
97     if(length>0xffff) {
98         // Too long: We store the length in 1 or 2 bytes.
99         errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
100         return;
101     }
102     int32_t offset=strings.length();
103     if(length>0xff) {
104         offset=~offset;
105         strings.append((char)(length>>8), errorCode);
106     }
107     strings.append((char)length, errorCode);
108     stringOffset=offset;
109     value=val;
110     strings.append(s, errorCode);
111 }
112 
113 int32_t
compareStringTo(const BytesTrieElement & other,const CharString & strings) const114 BytesTrieElement::compareStringTo(const BytesTrieElement &other, const CharString &strings) const {
115     // TODO: add StringPiece::compare(), see ticket #8187
116     StringPiece thisString=getString(strings);
117     StringPiece otherString=other.getString(strings);
118     int32_t lengthDiff=thisString.length()-otherString.length();
119     int32_t commonLength;
120     if(lengthDiff<=0) {
121         commonLength=thisString.length();
122     } else {
123         commonLength=otherString.length();
124     }
125     int32_t diff=uprv_memcmp(thisString.data(), otherString.data(), commonLength);
126     return diff!=0 ? diff : lengthDiff;
127 }
128 
BytesTrieBuilder(UErrorCode & errorCode)129 BytesTrieBuilder::BytesTrieBuilder(UErrorCode &errorCode)
130         : strings(NULL), elements(NULL), elementsCapacity(0), elementsLength(0),
131           bytes(NULL), bytesCapacity(0), bytesLength(0) {
132     if(U_FAILURE(errorCode)) {
133         return;
134     }
135     strings=new CharString();
136     if(strings==NULL) {
137         errorCode=U_MEMORY_ALLOCATION_ERROR;
138     }
139 }
140 
~BytesTrieBuilder()141 BytesTrieBuilder::~BytesTrieBuilder() {
142     delete strings;
143     delete[] elements;
144     uprv_free(bytes);
145 }
146 
147 BytesTrieBuilder &
add(StringPiece s,int32_t value,UErrorCode & errorCode)148 BytesTrieBuilder::add(StringPiece s, int32_t value, UErrorCode &errorCode) {
149     if(U_FAILURE(errorCode)) {
150         return *this;
151     }
152     if(bytesLength>0) {
153         // Cannot add elements after building.
154         errorCode=U_NO_WRITE_PERMISSION;
155         return *this;
156     }
157     if(elementsLength==elementsCapacity) {
158         int32_t newCapacity;
159         if(elementsCapacity==0) {
160             newCapacity=1024;
161         } else {
162             newCapacity=4*elementsCapacity;
163         }
164         BytesTrieElement *newElements=new BytesTrieElement[newCapacity];
165         if(newElements==NULL) {
166             errorCode=U_MEMORY_ALLOCATION_ERROR;
167             return *this; // error instead of dereferencing null
168         }
169         if(elementsLength>0) {
170             uprv_memcpy(newElements, elements, (size_t)elementsLength*sizeof(BytesTrieElement));
171         }
172         delete[] elements;
173         elements=newElements;
174         elementsCapacity=newCapacity;
175     }
176     elements[elementsLength++].setTo(s, value, *strings, errorCode);
177     return *this;
178 }
179 
180 U_CDECL_BEGIN
181 
182 static int32_t U_CALLCONV
compareElementStrings(const void * context,const void * left,const void * right)183 compareElementStrings(const void *context, const void *left, const void *right) {
184     const CharString *strings=static_cast<const CharString *>(context);
185     const BytesTrieElement *leftElement=static_cast<const BytesTrieElement *>(left);
186     const BytesTrieElement *rightElement=static_cast<const BytesTrieElement *>(right);
187     return leftElement->compareStringTo(*rightElement, *strings);
188 }
189 
190 U_CDECL_END
191 
192 BytesTrie *
build(UStringTrieBuildOption buildOption,UErrorCode & errorCode)193 BytesTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
194     buildBytes(buildOption, errorCode);
195     BytesTrie *newTrie=NULL;
196     if(U_SUCCESS(errorCode)) {
197         newTrie=new BytesTrie(bytes, bytes+(bytesCapacity-bytesLength));
198         if(newTrie==NULL) {
199             errorCode=U_MEMORY_ALLOCATION_ERROR;
200         } else {
201             bytes=NULL;  // The new trie now owns the array.
202             bytesCapacity=0;
203         }
204     }
205     return newTrie;
206 }
207 
208 StringPiece
buildStringPiece(UStringTrieBuildOption buildOption,UErrorCode & errorCode)209 BytesTrieBuilder::buildStringPiece(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
210     buildBytes(buildOption, errorCode);
211     StringPiece result;
212     if(U_SUCCESS(errorCode)) {
213         result.set(bytes+(bytesCapacity-bytesLength), bytesLength);
214     }
215     return result;
216 }
217 
218 void
buildBytes(UStringTrieBuildOption buildOption,UErrorCode & errorCode)219 BytesTrieBuilder::buildBytes(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
220     if(U_FAILURE(errorCode)) {
221         return;
222     }
223     if(bytes!=NULL && bytesLength>0) {
224         // Already built.
225         return;
226     }
227     if(bytesLength==0) {
228         if(elementsLength==0) {
229             errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
230             return;
231         }
232         uprv_sortArray(elements, elementsLength, (int32_t)sizeof(BytesTrieElement),
233                       compareElementStrings, strings,
234                       FALSE,  // need not be a stable sort
235                       &errorCode);
236         if(U_FAILURE(errorCode)) {
237             return;
238         }
239         // Duplicate strings are not allowed.
240         StringPiece prev=elements[0].getString(*strings);
241         for(int32_t i=1; i<elementsLength; ++i) {
242             StringPiece current=elements[i].getString(*strings);
243             if(prev==current) {
244                 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
245                 return;
246             }
247             prev=current;
248         }
249     }
250     // Create and byte-serialize the trie for the elements.
251     bytesLength=0;
252     int32_t capacity=strings->length();
253     if(capacity<1024) {
254         capacity=1024;
255     }
256     if(bytesCapacity<capacity) {
257         uprv_free(bytes);
258         bytes=static_cast<char *>(uprv_malloc(capacity));
259         if(bytes==NULL) {
260             errorCode=U_MEMORY_ALLOCATION_ERROR;
261             bytesCapacity=0;
262             return;
263         }
264         bytesCapacity=capacity;
265     }
266     StringTrieBuilder::build(buildOption, elementsLength, errorCode);
267     if(bytes==NULL) {
268         errorCode=U_MEMORY_ALLOCATION_ERROR;
269     }
270 }
271 
272 BytesTrieBuilder &
clear()273 BytesTrieBuilder::clear() {
274     strings->clear();
275     elementsLength=0;
276     bytesLength=0;
277     return *this;
278 }
279 
280 int32_t
getElementStringLength(int32_t i) const281 BytesTrieBuilder::getElementStringLength(int32_t i) const {
282     return elements[i].getStringLength(*strings);
283 }
284 
285 UChar
getElementUnit(int32_t i,int32_t byteIndex) const286 BytesTrieBuilder::getElementUnit(int32_t i, int32_t byteIndex) const {
287     return (uint8_t)elements[i].charAt(byteIndex, *strings);
288 }
289 
290 int32_t
getElementValue(int32_t i) const291 BytesTrieBuilder::getElementValue(int32_t i) const {
292     return elements[i].getValue();
293 }
294 
295 int32_t
getLimitOfLinearMatch(int32_t first,int32_t last,int32_t byteIndex) const296 BytesTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t byteIndex) const {
297     const BytesTrieElement &firstElement=elements[first];
298     const BytesTrieElement &lastElement=elements[last];
299     int32_t minStringLength=firstElement.getStringLength(*strings);
300     while(++byteIndex<minStringLength &&
301             firstElement.charAt(byteIndex, *strings)==
302             lastElement.charAt(byteIndex, *strings)) {}
303     return byteIndex;
304 }
305 
306 int32_t
countElementUnits(int32_t start,int32_t limit,int32_t byteIndex) const307 BytesTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t byteIndex) const {
308     int32_t length=0;  // Number of different bytes at byteIndex.
309     int32_t i=start;
310     do {
311         char byte=elements[i++].charAt(byteIndex, *strings);
312         while(i<limit && byte==elements[i].charAt(byteIndex, *strings)) {
313             ++i;
314         }
315         ++length;
316     } while(i<limit);
317     return length;
318 }
319 
320 int32_t
skipElementsBySomeUnits(int32_t i,int32_t byteIndex,int32_t count) const321 BytesTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t byteIndex, int32_t count) const {
322     do {
323         char byte=elements[i++].charAt(byteIndex, *strings);
324         while(byte==elements[i].charAt(byteIndex, *strings)) {
325             ++i;
326         }
327     } while(--count>0);
328     return i;
329 }
330 
331 int32_t
indexOfElementWithNextUnit(int32_t i,int32_t byteIndex,UChar byte) const332 BytesTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t byteIndex, UChar byte) const {
333     char b=(char)byte;
334     while(b==elements[i].charAt(byteIndex, *strings)) {
335         ++i;
336     }
337     return i;
338 }
339 
BTLinearMatchNode(const char * bytes,int32_t len,Node * nextNode)340 BytesTrieBuilder::BTLinearMatchNode::BTLinearMatchNode(const char *bytes, int32_t len, Node *nextNode)
341         : LinearMatchNode(len, nextNode), s(bytes) {
342     hash=static_cast<int32_t>(
343         static_cast<uint32_t>(hash)*37u + static_cast<uint32_t>(ustr_hashCharsN(bytes, len)));
344 }
345 
346 UBool
operator ==(const Node & other) const347 BytesTrieBuilder::BTLinearMatchNode::operator==(const Node &other) const {
348     if(this==&other) {
349         return TRUE;
350     }
351     if(!LinearMatchNode::operator==(other)) {
352         return FALSE;
353     }
354     const BTLinearMatchNode &o=(const BTLinearMatchNode &)other;
355     return 0==uprv_memcmp(s, o.s, length);
356 }
357 
358 void
write(StringTrieBuilder & builder)359 BytesTrieBuilder::BTLinearMatchNode::write(StringTrieBuilder &builder) {
360     BytesTrieBuilder &b=(BytesTrieBuilder &)builder;
361     next->write(builder);
362     b.write(s, length);
363     offset=b.write(b.getMinLinearMatch()+length-1);
364 }
365 
366 StringTrieBuilder::Node *
createLinearMatchNode(int32_t i,int32_t byteIndex,int32_t length,Node * nextNode) const367 BytesTrieBuilder::createLinearMatchNode(int32_t i, int32_t byteIndex, int32_t length,
368                                         Node *nextNode) const {
369     return new BTLinearMatchNode(
370             elements[i].getString(*strings).data()+byteIndex,
371             length,
372             nextNode);
373 }
374 
375 UBool
ensureCapacity(int32_t length)376 BytesTrieBuilder::ensureCapacity(int32_t length) {
377     if(bytes==NULL) {
378         return FALSE;  // previous memory allocation had failed
379     }
380     if(length>bytesCapacity) {
381         int32_t newCapacity=bytesCapacity;
382         do {
383             newCapacity*=2;
384         } while(newCapacity<=length);
385         char *newBytes=static_cast<char *>(uprv_malloc(newCapacity));
386         if(newBytes==NULL) {
387             // unable to allocate memory
388             uprv_free(bytes);
389             bytes=NULL;
390             bytesCapacity=0;
391             return FALSE;
392         }
393         uprv_memcpy(newBytes+(newCapacity-bytesLength),
394                     bytes+(bytesCapacity-bytesLength), bytesLength);
395         uprv_free(bytes);
396         bytes=newBytes;
397         bytesCapacity=newCapacity;
398     }
399     return TRUE;
400 }
401 
402 int32_t
write(int32_t byte)403 BytesTrieBuilder::write(int32_t byte) {
404     int32_t newLength=bytesLength+1;
405     if(ensureCapacity(newLength)) {
406         bytesLength=newLength;
407         bytes[bytesCapacity-bytesLength]=(char)byte;
408     }
409     return bytesLength;
410 }
411 
412 int32_t
write(const char * b,int32_t length)413 BytesTrieBuilder::write(const char *b, int32_t length) {
414     int32_t newLength=bytesLength+length;
415     if(ensureCapacity(newLength)) {
416         bytesLength=newLength;
417         uprv_memcpy(bytes+(bytesCapacity-bytesLength), b, length);
418     }
419     return bytesLength;
420 }
421 
422 int32_t
writeElementUnits(int32_t i,int32_t byteIndex,int32_t length)423 BytesTrieBuilder::writeElementUnits(int32_t i, int32_t byteIndex, int32_t length) {
424     return write(elements[i].getString(*strings).data()+byteIndex, length);
425 }
426 
427 int32_t
writeValueAndFinal(int32_t i,UBool isFinal)428 BytesTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) {
429     if(0<=i && i<=BytesTrie::kMaxOneByteValue) {
430         return write(((BytesTrie::kMinOneByteValueLead+i)<<1)|isFinal);
431     }
432     char intBytes[5];
433     int32_t length=1;
434     if(i<0 || i>0xffffff) {
435         intBytes[0]=(char)BytesTrie::kFiveByteValueLead;
436         intBytes[1]=(char)((uint32_t)i>>24);
437         intBytes[2]=(char)((uint32_t)i>>16);
438         intBytes[3]=(char)((uint32_t)i>>8);
439         intBytes[4]=(char)i;
440         length=5;
441     // } else if(i<=BytesTrie::kMaxOneByteValue) {
442     //     intBytes[0]=(char)(BytesTrie::kMinOneByteValueLead+i);
443     } else {
444         if(i<=BytesTrie::kMaxTwoByteValue) {
445             intBytes[0]=(char)(BytesTrie::kMinTwoByteValueLead+(i>>8));
446         } else {
447             if(i<=BytesTrie::kMaxThreeByteValue) {
448                 intBytes[0]=(char)(BytesTrie::kMinThreeByteValueLead+(i>>16));
449             } else {
450                 intBytes[0]=(char)BytesTrie::kFourByteValueLead;
451                 intBytes[1]=(char)(i>>16);
452                 length=2;
453             }
454             intBytes[length++]=(char)(i>>8);
455         }
456         intBytes[length++]=(char)i;
457     }
458     intBytes[0]=(char)((intBytes[0]<<1)|isFinal);
459     return write(intBytes, length);
460 }
461 
462 int32_t
writeValueAndType(UBool hasValue,int32_t value,int32_t node)463 BytesTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) {
464     int32_t offset=write(node);
465     if(hasValue) {
466         offset=writeValueAndFinal(value, FALSE);
467     }
468     return offset;
469 }
470 
471 int32_t
writeDeltaTo(int32_t jumpTarget)472 BytesTrieBuilder::writeDeltaTo(int32_t jumpTarget) {
473     int32_t i=bytesLength-jumpTarget;
474     U_ASSERT(i>=0);
475     if(i<=BytesTrie::kMaxOneByteDelta) {
476         return write(i);
477     } else {
478         char intBytes[5];
479         return write(intBytes, internalEncodeDelta(i, intBytes));
480     }
481 }
482 
483 int32_t
internalEncodeDelta(int32_t i,char intBytes[])484 BytesTrieBuilder::internalEncodeDelta(int32_t i, char intBytes[]) {
485     U_ASSERT(i>=0);
486     if(i<=BytesTrie::kMaxOneByteDelta) {
487         intBytes[0]=(char)i;
488         return 1;
489     }
490     int32_t length=1;
491     if(i<=BytesTrie::kMaxTwoByteDelta) {
492         intBytes[0]=(char)(BytesTrie::kMinTwoByteDeltaLead+(i>>8));
493     } else {
494         if(i<=BytesTrie::kMaxThreeByteDelta) {
495             intBytes[0]=(char)(BytesTrie::kMinThreeByteDeltaLead+(i>>16));
496         } else {
497             if(i<=0xffffff) {
498                 intBytes[0]=(char)BytesTrie::kFourByteDeltaLead;
499             } else {
500                 intBytes[0]=(char)BytesTrie::kFiveByteDeltaLead;
501                 intBytes[1]=(char)(i>>24);
502                 length=2;
503             }
504             intBytes[length++]=(char)(i>>16);
505         }
506         intBytes[length++]=(char)(i>>8);
507     }
508     intBytes[length++]=(char)i;
509     return length;
510 }
511 
512 U_NAMESPACE_END
513