1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2010-2011, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: ucharstrie.h
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2010nov14
14 * created by: Markus W. Scherer
15 */
16
17 #include "unicode/utypes.h"
18 #include "unicode/appendable.h"
19 #include "unicode/ucharstrie.h"
20 #include "unicode/uobject.h"
21 #include "unicode/utf16.h"
22 #include "cmemory.h"
23 #include "uassert.h"
24
25 U_NAMESPACE_BEGIN
26
~UCharsTrie()27 UCharsTrie::~UCharsTrie() {
28 uprv_free(ownedArray_);
29 }
30
31 UStringTrieResult
current() const32 UCharsTrie::current() const {
33 const UChar *pos=pos_;
34 if(pos==NULL) {
35 return USTRINGTRIE_NO_MATCH;
36 } else {
37 int32_t node;
38 return (remainingMatchLength_<0 && (node=*pos)>=kMinValueLead) ?
39 valueResult(node) : USTRINGTRIE_NO_VALUE;
40 }
41 }
42
43 UStringTrieResult
firstForCodePoint(UChar32 cp)44 UCharsTrie::firstForCodePoint(UChar32 cp) {
45 return cp<=0xffff ?
46 first(cp) :
47 (USTRINGTRIE_HAS_NEXT(first(U16_LEAD(cp))) ?
48 next(U16_TRAIL(cp)) :
49 USTRINGTRIE_NO_MATCH);
50 }
51
52 UStringTrieResult
nextForCodePoint(UChar32 cp)53 UCharsTrie::nextForCodePoint(UChar32 cp) {
54 return cp<=0xffff ?
55 next(cp) :
56 (USTRINGTRIE_HAS_NEXT(next(U16_LEAD(cp))) ?
57 next(U16_TRAIL(cp)) :
58 USTRINGTRIE_NO_MATCH);
59 }
60
61 UStringTrieResult
branchNext(const UChar * pos,int32_t length,int32_t uchar)62 UCharsTrie::branchNext(const UChar *pos, int32_t length, int32_t uchar) {
63 // Branch according to the current unit.
64 if(length==0) {
65 length=*pos++;
66 }
67 ++length;
68 // The length of the branch is the number of units to select from.
69 // The data structure encodes a binary search.
70 while(length>kMaxBranchLinearSubNodeLength) {
71 if(uchar<*pos++) {
72 length>>=1;
73 pos=jumpByDelta(pos);
74 } else {
75 length=length-(length>>1);
76 pos=skipDelta(pos);
77 }
78 }
79 // Drop down to linear search for the last few units.
80 // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3
81 // and divides length by 2.
82 do {
83 if(uchar==*pos++) {
84 UStringTrieResult result;
85 int32_t node=*pos;
86 if(node&kValueIsFinal) {
87 // Leave the final value for getValue() to read.
88 result=USTRINGTRIE_FINAL_VALUE;
89 } else {
90 // Use the non-final value as the jump delta.
91 ++pos;
92 // int32_t delta=readValue(pos, node);
93 int32_t delta;
94 if(node<kMinTwoUnitValueLead) {
95 delta=node;
96 } else if(node<kThreeUnitValueLead) {
97 delta=((node-kMinTwoUnitValueLead)<<16)|*pos++;
98 } else {
99 delta=(pos[0]<<16)|pos[1];
100 pos+=2;
101 }
102 // end readValue()
103 pos+=delta;
104 node=*pos;
105 result= node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE;
106 }
107 pos_=pos;
108 return result;
109 }
110 --length;
111 pos=skipValue(pos);
112 } while(length>1);
113 if(uchar==*pos++) {
114 pos_=pos;
115 int32_t node=*pos;
116 return node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE;
117 } else {
118 stop();
119 return USTRINGTRIE_NO_MATCH;
120 }
121 }
122
123 UStringTrieResult
nextImpl(const UChar * pos,int32_t uchar)124 UCharsTrie::nextImpl(const UChar *pos, int32_t uchar) {
125 int32_t node=*pos++;
126 for(;;) {
127 if(node<kMinLinearMatch) {
128 return branchNext(pos, node, uchar);
129 } else if(node<kMinValueLead) {
130 // Match the first of length+1 units.
131 int32_t length=node-kMinLinearMatch; // Actual match length minus 1.
132 if(uchar==*pos++) {
133 remainingMatchLength_=--length;
134 pos_=pos;
135 return (length<0 && (node=*pos)>=kMinValueLead) ?
136 valueResult(node) : USTRINGTRIE_NO_VALUE;
137 } else {
138 // No match.
139 break;
140 }
141 } else if(node&kValueIsFinal) {
142 // No further matching units.
143 break;
144 } else {
145 // Skip intermediate value.
146 pos=skipNodeValue(pos, node);
147 node&=kNodeTypeMask;
148 }
149 }
150 stop();
151 return USTRINGTRIE_NO_MATCH;
152 }
153
154 UStringTrieResult
next(int32_t uchar)155 UCharsTrie::next(int32_t uchar) {
156 const UChar *pos=pos_;
157 if(pos==NULL) {
158 return USTRINGTRIE_NO_MATCH;
159 }
160 int32_t length=remainingMatchLength_; // Actual remaining match length minus 1.
161 if(length>=0) {
162 // Remaining part of a linear-match node.
163 if(uchar==*pos++) {
164 remainingMatchLength_=--length;
165 pos_=pos;
166 int32_t node;
167 return (length<0 && (node=*pos)>=kMinValueLead) ?
168 valueResult(node) : USTRINGTRIE_NO_VALUE;
169 } else {
170 stop();
171 return USTRINGTRIE_NO_MATCH;
172 }
173 }
174 return nextImpl(pos, uchar);
175 }
176
177 UStringTrieResult
next(const UChar * s,int32_t sLength)178 UCharsTrie::next(const UChar *s, int32_t sLength) {
179 if(sLength<0 ? *s==0 : sLength==0) {
180 // Empty input.
181 return current();
182 }
183 const UChar *pos=pos_;
184 if(pos==NULL) {
185 return USTRINGTRIE_NO_MATCH;
186 }
187 int32_t length=remainingMatchLength_; // Actual remaining match length minus 1.
188 for(;;) {
189 // Fetch the next input unit, if there is one.
190 // Continue a linear-match node without rechecking sLength<0.
191 int32_t uchar;
192 if(sLength<0) {
193 for(;;) {
194 if((uchar=*s++)==0) {
195 remainingMatchLength_=length;
196 pos_=pos;
197 int32_t node;
198 return (length<0 && (node=*pos)>=kMinValueLead) ?
199 valueResult(node) : USTRINGTRIE_NO_VALUE;
200 }
201 if(length<0) {
202 remainingMatchLength_=length;
203 break;
204 }
205 if(uchar!=*pos) {
206 stop();
207 return USTRINGTRIE_NO_MATCH;
208 }
209 ++pos;
210 --length;
211 }
212 } else {
213 for(;;) {
214 if(sLength==0) {
215 remainingMatchLength_=length;
216 pos_=pos;
217 int32_t node;
218 return (length<0 && (node=*pos)>=kMinValueLead) ?
219 valueResult(node) : USTRINGTRIE_NO_VALUE;
220 }
221 uchar=*s++;
222 --sLength;
223 if(length<0) {
224 remainingMatchLength_=length;
225 break;
226 }
227 if(uchar!=*pos) {
228 stop();
229 return USTRINGTRIE_NO_MATCH;
230 }
231 ++pos;
232 --length;
233 }
234 }
235 int32_t node=*pos++;
236 for(;;) {
237 if(node<kMinLinearMatch) {
238 UStringTrieResult result=branchNext(pos, node, uchar);
239 if(result==USTRINGTRIE_NO_MATCH) {
240 return USTRINGTRIE_NO_MATCH;
241 }
242 // Fetch the next input unit, if there is one.
243 if(sLength<0) {
244 if((uchar=*s++)==0) {
245 return result;
246 }
247 } else {
248 if(sLength==0) {
249 return result;
250 }
251 uchar=*s++;
252 --sLength;
253 }
254 if(result==USTRINGTRIE_FINAL_VALUE) {
255 // No further matching units.
256 stop();
257 return USTRINGTRIE_NO_MATCH;
258 }
259 pos=pos_; // branchNext() advanced pos and wrote it to pos_ .
260 node=*pos++;
261 } else if(node<kMinValueLead) {
262 // Match length+1 units.
263 length=node-kMinLinearMatch; // Actual match length minus 1.
264 if(uchar!=*pos) {
265 stop();
266 return USTRINGTRIE_NO_MATCH;
267 }
268 ++pos;
269 --length;
270 break;
271 } else if(node&kValueIsFinal) {
272 // No further matching units.
273 stop();
274 return USTRINGTRIE_NO_MATCH;
275 } else {
276 // Skip intermediate value.
277 pos=skipNodeValue(pos, node);
278 node&=kNodeTypeMask;
279 }
280 }
281 }
282 }
283
284 const UChar *
findUniqueValueFromBranch(const UChar * pos,int32_t length,UBool haveUniqueValue,int32_t & uniqueValue)285 UCharsTrie::findUniqueValueFromBranch(const UChar *pos, int32_t length,
286 UBool haveUniqueValue, int32_t &uniqueValue) {
287 while(length>kMaxBranchLinearSubNodeLength) {
288 ++pos; // ignore the comparison unit
289 if(NULL==findUniqueValueFromBranch(jumpByDelta(pos), length>>1, haveUniqueValue, uniqueValue)) {
290 return NULL;
291 }
292 length=length-(length>>1);
293 pos=skipDelta(pos);
294 }
295 do {
296 ++pos; // ignore a comparison unit
297 // handle its value
298 int32_t node=*pos++;
299 UBool isFinal=(UBool)(node>>15);
300 node&=0x7fff;
301 int32_t value=readValue(pos, node);
302 pos=skipValue(pos, node);
303 if(isFinal) {
304 if(haveUniqueValue) {
305 if(value!=uniqueValue) {
306 return NULL;
307 }
308 } else {
309 uniqueValue=value;
310 haveUniqueValue=TRUE;
311 }
312 } else {
313 if(!findUniqueValue(pos+value, haveUniqueValue, uniqueValue)) {
314 return NULL;
315 }
316 haveUniqueValue=TRUE;
317 }
318 } while(--length>1);
319 return pos+1; // ignore the last comparison unit
320 }
321
322 UBool
findUniqueValue(const UChar * pos,UBool haveUniqueValue,int32_t & uniqueValue)323 UCharsTrie::findUniqueValue(const UChar *pos, UBool haveUniqueValue, int32_t &uniqueValue) {
324 int32_t node=*pos++;
325 for(;;) {
326 if(node<kMinLinearMatch) {
327 if(node==0) {
328 node=*pos++;
329 }
330 pos=findUniqueValueFromBranch(pos, node+1, haveUniqueValue, uniqueValue);
331 if(pos==NULL) {
332 return FALSE;
333 }
334 haveUniqueValue=TRUE;
335 node=*pos++;
336 } else if(node<kMinValueLead) {
337 // linear-match node
338 pos+=node-kMinLinearMatch+1; // Ignore the match units.
339 node=*pos++;
340 } else {
341 UBool isFinal=(UBool)(node>>15);
342 int32_t value;
343 if(isFinal) {
344 value=readValue(pos, node&0x7fff);
345 } else {
346 value=readNodeValue(pos, node);
347 }
348 if(haveUniqueValue) {
349 if(value!=uniqueValue) {
350 return FALSE;
351 }
352 } else {
353 uniqueValue=value;
354 haveUniqueValue=TRUE;
355 }
356 if(isFinal) {
357 return TRUE;
358 }
359 pos=skipNodeValue(pos, node);
360 node&=kNodeTypeMask;
361 }
362 }
363 }
364
365 int32_t
getNextUChars(Appendable & out) const366 UCharsTrie::getNextUChars(Appendable &out) const {
367 const UChar *pos=pos_;
368 if(pos==NULL) {
369 return 0;
370 }
371 if(remainingMatchLength_>=0) {
372 out.appendCodeUnit(*pos); // Next unit of a pending linear-match node.
373 return 1;
374 }
375 int32_t node=*pos++;
376 if(node>=kMinValueLead) {
377 if(node&kValueIsFinal) {
378 return 0;
379 } else {
380 pos=skipNodeValue(pos, node);
381 node&=kNodeTypeMask;
382 }
383 }
384 if(node<kMinLinearMatch) {
385 if(node==0) {
386 node=*pos++;
387 }
388 out.reserveAppendCapacity(++node);
389 getNextBranchUChars(pos, node, out);
390 return node;
391 } else {
392 // First unit of the linear-match node.
393 out.appendCodeUnit(*pos);
394 return 1;
395 }
396 }
397
398 void
getNextBranchUChars(const UChar * pos,int32_t length,Appendable & out)399 UCharsTrie::getNextBranchUChars(const UChar *pos, int32_t length, Appendable &out) {
400 while(length>kMaxBranchLinearSubNodeLength) {
401 ++pos; // ignore the comparison unit
402 getNextBranchUChars(jumpByDelta(pos), length>>1, out);
403 length=length-(length>>1);
404 pos=skipDelta(pos);
405 }
406 do {
407 out.appendCodeUnit(*pos++);
408 pos=skipValue(pos);
409 } while(--length>1);
410 out.appendCodeUnit(*pos);
411 }
412
413 U_NAMESPACE_END
414