1 /*
2 *******************************************************************************
3 *   Copyright (C) 2010-2012, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 *******************************************************************************
6 *   file name:  bytestrieiterator.cpp
7 *   encoding:   US-ASCII
8 *   tab size:   8 (not used)
9 *   indentation:4
10 *
11 *   created on: 2010nov03
12 *   created by: Markus W. Scherer
13 */
14 
15 #include "unicode/utypes.h"
16 #include "unicode/bytestrie.h"
17 #include "unicode/stringpiece.h"
18 #include "charstr.h"
19 #include "uvectr32.h"
20 
21 U_NAMESPACE_BEGIN
22 
Iterator(const void * trieBytes,int32_t maxStringLength,UErrorCode & errorCode)23 BytesTrie::Iterator::Iterator(const void *trieBytes, int32_t maxStringLength,
24                               UErrorCode &errorCode)
25         : bytes_(static_cast<const uint8_t *>(trieBytes)),
26           pos_(bytes_), initialPos_(bytes_),
27           remainingMatchLength_(-1), initialRemainingMatchLength_(-1),
28           str_(NULL), maxLength_(maxStringLength), value_(0), stack_(NULL) {
29     if(U_FAILURE(errorCode)) {
30         return;
31     }
32     // str_ and stack_ are pointers so that it's easy to turn bytestrie.h into
33     // a public API header for which we would want it to depend only on
34     // other public headers.
35     // Unlike BytesTrie itself, its Iterator performs memory allocations anyway
36     // via the CharString and UVector32 implementations, so this additional
37     // cost is minimal.
38     str_=new CharString();
39     stack_=new UVector32(errorCode);
40     if(U_SUCCESS(errorCode) && (str_==NULL || stack_==NULL)) {
41         errorCode=U_MEMORY_ALLOCATION_ERROR;
42     }
43 }
44 
Iterator(const BytesTrie & trie,int32_t maxStringLength,UErrorCode & errorCode)45 BytesTrie::Iterator::Iterator(const BytesTrie &trie, int32_t maxStringLength,
46                               UErrorCode &errorCode)
47         : bytes_(trie.bytes_), pos_(trie.pos_), initialPos_(trie.pos_),
48           remainingMatchLength_(trie.remainingMatchLength_),
49           initialRemainingMatchLength_(trie.remainingMatchLength_),
50           str_(NULL), maxLength_(maxStringLength), value_(0), stack_(NULL) {
51     if(U_FAILURE(errorCode)) {
52         return;
53     }
54     str_=new CharString();
55     stack_=new UVector32(errorCode);
56     if(U_FAILURE(errorCode)) {
57         return;
58     }
59     if(str_==NULL || stack_==NULL) {
60         errorCode=U_MEMORY_ALLOCATION_ERROR;
61         return;
62     }
63     int32_t length=remainingMatchLength_;  // Actual remaining match length minus 1.
64     if(length>=0) {
65         // Pending linear-match node, append remaining bytes to str_.
66         ++length;
67         if(maxLength_>0 && length>maxLength_) {
68             length=maxLength_;  // This will leave remainingMatchLength>=0 as a signal.
69         }
70         str_->append(reinterpret_cast<const char *>(pos_), length, errorCode);
71         pos_+=length;
72         remainingMatchLength_-=length;
73     }
74 }
75 
~Iterator()76 BytesTrie::Iterator::~Iterator() {
77     delete str_;
78     delete stack_;
79 }
80 
81 BytesTrie::Iterator &
reset()82 BytesTrie::Iterator::reset() {
83     pos_=initialPos_;
84     remainingMatchLength_=initialRemainingMatchLength_;
85     int32_t length=remainingMatchLength_+1;  // Remaining match length.
86     if(maxLength_>0 && length>maxLength_) {
87         length=maxLength_;
88     }
89     str_->truncate(length);
90     pos_+=length;
91     remainingMatchLength_-=length;
92     stack_->setSize(0);
93     return *this;
94 }
95 
96 UBool
hasNext() const97 BytesTrie::Iterator::hasNext() const { return pos_!=NULL || !stack_->isEmpty(); }
98 
99 UBool
next(UErrorCode & errorCode)100 BytesTrie::Iterator::next(UErrorCode &errorCode) {
101     if(U_FAILURE(errorCode)) {
102         return FALSE;
103     }
104     const uint8_t *pos=pos_;
105     if(pos==NULL) {
106         if(stack_->isEmpty()) {
107             return FALSE;
108         }
109         // Pop the state off the stack and continue with the next outbound edge of
110         // the branch node.
111         int32_t stackSize=stack_->size();
112         int32_t length=stack_->elementAti(stackSize-1);
113         pos=bytes_+stack_->elementAti(stackSize-2);
114         stack_->setSize(stackSize-2);
115         str_->truncate(length&0xffff);
116         length=(int32_t)((uint32_t)length>>16);
117         if(length>1) {
118             pos=branchNext(pos, length, errorCode);
119             if(pos==NULL) {
120                 return TRUE;  // Reached a final value.
121             }
122         } else {
123             str_->append((char)*pos++, errorCode);
124         }
125     }
126     if(remainingMatchLength_>=0) {
127         // We only get here if we started in a pending linear-match node
128         // with more than maxLength remaining bytes.
129         return truncateAndStop();
130     }
131     for(;;) {
132         int32_t node=*pos++;
133         if(node>=kMinValueLead) {
134             // Deliver value for the byte sequence so far.
135             UBool isFinal=(UBool)(node&kValueIsFinal);
136             value_=readValue(pos, node>>1);
137             if(isFinal || (maxLength_>0 && str_->length()==maxLength_)) {
138                 pos_=NULL;
139             } else {
140                 pos_=skipValue(pos, node);
141             }
142             sp_.set(str_->data(), str_->length());
143             return TRUE;
144         }
145         if(maxLength_>0 && str_->length()==maxLength_) {
146             return truncateAndStop();
147         }
148         if(node<kMinLinearMatch) {
149             if(node==0) {
150                 node=*pos++;
151             }
152             pos=branchNext(pos, node+1, errorCode);
153             if(pos==NULL) {
154                 return TRUE;  // Reached a final value.
155             }
156         } else {
157             // Linear-match node, append length bytes to str_.
158             int32_t length=node-kMinLinearMatch+1;
159             if(maxLength_>0 && str_->length()+length>maxLength_) {
160                 str_->append(reinterpret_cast<const char *>(pos),
161                             maxLength_-str_->length(), errorCode);
162                 return truncateAndStop();
163             }
164             str_->append(reinterpret_cast<const char *>(pos), length, errorCode);
165             pos+=length;
166         }
167     }
168 }
169 
170 UBool
truncateAndStop()171 BytesTrie::Iterator::truncateAndStop() {
172     pos_=NULL;
173     sp_.set(str_->data(), str_->length());
174     value_=-1;  // no real value for str
175     return TRUE;
176 }
177 
178 // Branch node, needs to take the first outbound edge and push state for the rest.
179 const uint8_t *
branchNext(const uint8_t * pos,int32_t length,UErrorCode & errorCode)180 BytesTrie::Iterator::branchNext(const uint8_t *pos, int32_t length, UErrorCode &errorCode) {
181     while(length>kMaxBranchLinearSubNodeLength) {
182         ++pos;  // ignore the comparison byte
183         // Push state for the greater-or-equal edge.
184         stack_->addElement((int32_t)(skipDelta(pos)-bytes_), errorCode);
185         stack_->addElement(((length-(length>>1))<<16)|str_->length(), errorCode);
186         // Follow the less-than edge.
187         length>>=1;
188         pos=jumpByDelta(pos);
189     }
190     // List of key-value pairs where values are either final values or jump deltas.
191     // Read the first (key, value) pair.
192     uint8_t trieByte=*pos++;
193     int32_t node=*pos++;
194     UBool isFinal=(UBool)(node&kValueIsFinal);
195     int32_t value=readValue(pos, node>>1);
196     pos=skipValue(pos, node);
197     stack_->addElement((int32_t)(pos-bytes_), errorCode);
198     stack_->addElement(((length-1)<<16)|str_->length(), errorCode);
199     str_->append((char)trieByte, errorCode);
200     if(isFinal) {
201         pos_=NULL;
202         sp_.set(str_->data(), str_->length());
203         value_=value;
204         return NULL;
205     } else {
206         return pos+value;
207     }
208 }
209 
210 U_NAMESPACE_END
211