1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 *
6 *   Copyright (C) 2007-2012, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 ******************************************************************************
10 *   file name:  unisetspan.cpp
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2007mar01
16 *   created by: Markus W. Scherer
17 */
18 
19 #include "unicode/utypes.h"
20 #include "unicode/uniset.h"
21 #include "unicode/ustring.h"
22 #include "unicode/utf8.h"
23 #include "unicode/utf16.h"
24 #include "cmemory.h"
25 #include "uvector.h"
26 #include "unisetspan.h"
27 
28 U_NAMESPACE_BEGIN
29 
30 /*
31  * List of offsets from the current position from where to try matching
32  * a code point or a string.
33  * Store offsets rather than indexes to simplify the code and use the same list
34  * for both increments (in span()) and decrements (in spanBack()).
35  *
36  * Assumption: The maximum offset is limited, and the offsets that are stored
37  * at any one time are relatively dense, that is, there are normally no gaps of
38  * hundreds or thousands of offset values.
39  *
40  * The implementation uses a circular buffer of byte flags,
41  * each indicating whether the corresponding offset is in the list.
42  * This avoids inserting into a sorted list of offsets (or absolute indexes) and
43  * physically moving part of the list.
44  *
45  * Note: In principle, the caller should setMaxLength() to the maximum of the
46  * max string length and U16_LENGTH/U8_LENGTH to account for
47  * "long" single code points.
48  * However, this implementation uses at least a staticList with more than
49  * U8_LENGTH entries anyway.
50  *
51  * Note: If maxLength were guaranteed to be no more than 32 or 64,
52  * the list could be stored as bit flags in a single integer.
53  * Rather than handling a circular buffer with a start list index,
54  * the integer would simply be shifted when lower offsets are removed.
55  * UnicodeSet does not have a limit on the lengths of strings.
56  */
57 class OffsetList {  // Only ever stack-allocated, does not need to inherit UMemory.
58 public:
OffsetList()59     OffsetList() : list(staticList), capacity(0), length(0), start(0) {}
60 
~OffsetList()61     ~OffsetList() {
62         if(list!=staticList) {
63             uprv_free(list);
64         }
65     }
66 
67     // Call exactly once if the list is to be used.
setMaxLength(int32_t maxLength)68     void setMaxLength(int32_t maxLength) {
69         if(maxLength<=(int32_t)sizeof(staticList)) {
70             capacity=(int32_t)sizeof(staticList);
71         } else {
72             UBool *l=(UBool *)uprv_malloc(maxLength);
73             if(l!=NULL) {
74                 list=l;
75                 capacity=maxLength;
76             }
77         }
78         uprv_memset(list, 0, capacity);
79     }
80 
clear()81     void clear() {
82         uprv_memset(list, 0, capacity);
83         start=length=0;
84     }
85 
isEmpty() const86     UBool isEmpty() const {
87         return (UBool)(length==0);
88     }
89 
90     // Reduce all stored offsets by delta, used when the current position
91     // moves by delta.
92     // There must not be any offsets lower than delta.
93     // If there is an offset equal to delta, it is removed.
94     // delta=[1..maxLength]
shift(int32_t delta)95     void shift(int32_t delta) {
96         int32_t i=start+delta;
97         if(i>=capacity) {
98             i-=capacity;
99         }
100         if(list[i]) {
101             list[i]=FALSE;
102             --length;
103         }
104         start=i;
105     }
106 
107     // Add an offset. The list must not contain it yet.
108     // offset=[1..maxLength]
addOffset(int32_t offset)109     void addOffset(int32_t offset) {
110         int32_t i=start+offset;
111         if(i>=capacity) {
112             i-=capacity;
113         }
114         list[i]=TRUE;
115         ++length;
116     }
117 
118     // offset=[1..maxLength]
containsOffset(int32_t offset) const119     UBool containsOffset(int32_t offset) const {
120         int32_t i=start+offset;
121         if(i>=capacity) {
122             i-=capacity;
123         }
124         return list[i];
125     }
126 
127     // Find the lowest stored offset from a non-empty list, remove it,
128     // and reduce all other offsets by this minimum.
129     // Returns [1..maxLength].
popMinimum()130     int32_t popMinimum() {
131         // Look for the next offset in list[start+1..capacity-1].
132         int32_t i=start, result;
133         while(++i<capacity) {
134             if(list[i]) {
135                 list[i]=FALSE;
136                 --length;
137                 result=i-start;
138                 start=i;
139                 return result;
140             }
141         }
142         // i==capacity
143 
144         // Wrap around and look for the next offset in list[0..start].
145         // Since the list is not empty, there will be one.
146         result=capacity-start;
147         i=0;
148         while(!list[i]) {
149             ++i;
150         }
151         list[i]=FALSE;
152         --length;
153         start=i;
154         return result+=i;
155     }
156 
157 private:
158     UBool *list;
159     int32_t capacity;
160     int32_t length;
161     int32_t start;
162 
163     UBool staticList[16];
164 };
165 
166 // Get the number of UTF-8 bytes for a UTF-16 (sub)string.
167 static int32_t
getUTF8Length(const UChar * s,int32_t length)168 getUTF8Length(const UChar *s, int32_t length) {
169     UErrorCode errorCode=U_ZERO_ERROR;
170     int32_t length8=0;
171     u_strToUTF8(NULL, 0, &length8, s, length, &errorCode);
172     if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) {
173         return length8;
174     } else {
175         // The string contains an unpaired surrogate.
176         // Ignore this string.
177         return 0;
178     }
179 }
180 
181 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
182 static int32_t
appendUTF8(const UChar * s,int32_t length,uint8_t * t,int32_t capacity)183 appendUTF8(const UChar *s, int32_t length, uint8_t *t, int32_t capacity) {
184     UErrorCode errorCode=U_ZERO_ERROR;
185     int32_t length8=0;
186     u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode);
187     if(U_SUCCESS(errorCode)) {
188         return length8;
189     } else {
190         // The string contains an unpaired surrogate.
191         // Ignore this string.
192         return 0;
193     }
194 }
195 
196 static inline uint8_t
makeSpanLengthByte(int32_t spanLength)197 makeSpanLengthByte(int32_t spanLength) {
198     // 0xfe==UnicodeSetStringSpan::LONG_SPAN
199     return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe;
200 }
201 
202 // Construct for all variants of span(), or only for any one variant.
203 // Initialize as little as possible, for single use.
UnicodeSetStringSpan(const UnicodeSet & set,const UVector & setStrings,uint32_t which)204 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,
205                                            const UVector &setStrings,
206                                            uint32_t which)
207         : spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings),
208           utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
209           utf8Length(0),
210           maxLength16(0), maxLength8(0),
211           all((UBool)(which==ALL)) {
212     spanSet.retainAll(set);
213     if(which&NOT_CONTAINED) {
214         // Default to the same sets.
215         // addToSpanNotSet() will create a separate set if necessary.
216         pSpanNotSet=&spanSet;
217     }
218 
219     // Determine if the strings even need to be taken into account at all for span() etc.
220     // If any string is relevant, then all strings need to be used for
221     // span(longest match) but only the relevant ones for span(while contained).
222     // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
223     //   and do not store UTF-8 strings if !thisRelevant and CONTAINED.
224     //   (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
225     // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
226     int32_t stringsLength=strings.size();
227 
228     int32_t i, spanLength;
229     UBool someRelevant=FALSE;
230     for(i=0; i<stringsLength; ++i) {
231         const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
232         const UChar *s16=string.getBuffer();
233         int32_t length16=string.length();
234         if (length16==0) {
235             continue;  // skip the empty string
236         }
237         UBool thisRelevant;
238         spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
239         if(spanLength<length16) {  // Relevant string.
240             someRelevant=thisRelevant=TRUE;
241         } else {
242             thisRelevant=FALSE;
243         }
244         if((which&UTF16) && length16>maxLength16) {
245             maxLength16=length16;
246         }
247         if((which&UTF8) && (thisRelevant || (which&CONTAINED))) {
248             int32_t length8=getUTF8Length(s16, length16);
249             utf8Length+=length8;
250             if(length8>maxLength8) {
251                 maxLength8=length8;
252             }
253         }
254     }
255     if(!someRelevant) {
256         maxLength16=maxLength8=0;
257         return;
258     }
259 
260     // Freeze after checking for the need to use strings at all because freezing
261     // a set takes some time and memory which are wasted if there are no relevant strings.
262     if(all) {
263         spanSet.freeze();
264     }
265 
266     uint8_t *spanBackLengths;
267     uint8_t *spanUTF8Lengths;
268     uint8_t *spanBackUTF8Lengths;
269 
270     // Allocate a block of meta data.
271     int32_t allocSize;
272     if(all) {
273         // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
274         allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
275     } else {
276         allocSize=stringsLength;  // One set of span lengths.
277         if(which&UTF8) {
278             // UTF-8 lengths and UTF-8 strings.
279             allocSize+=stringsLength*4+utf8Length;
280         }
281     }
282     if(allocSize<=(int32_t)sizeof(staticLengths)) {
283         utf8Lengths=staticLengths;
284     } else {
285         utf8Lengths=(int32_t *)uprv_malloc(allocSize);
286         if(utf8Lengths==NULL) {
287             maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
288             return;  // Out of memory.
289         }
290     }
291 
292     if(all) {
293         // Store span lengths for all span() variants.
294         spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
295         spanBackLengths=spanLengths+stringsLength;
296         spanUTF8Lengths=spanBackLengths+stringsLength;
297         spanBackUTF8Lengths=spanUTF8Lengths+stringsLength;
298         utf8=spanBackUTF8Lengths+stringsLength;
299     } else {
300         // Store span lengths for only one span() variant.
301         if(which&UTF8) {
302             spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
303             utf8=spanLengths+stringsLength;
304         } else {
305             spanLengths=(uint8_t *)utf8Lengths;
306         }
307         spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths;
308     }
309 
310     // Set the meta data and pSpanNotSet and write the UTF-8 strings.
311     int32_t utf8Count=0;  // Count UTF-8 bytes written so far.
312 
313     for(i=0; i<stringsLength; ++i) {
314         const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
315         const UChar *s16=string.getBuffer();
316         int32_t length16=string.length();
317         spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
318         if(spanLength<length16 && length16>0) {  // Relevant string.
319             if(which&UTF16) {
320                 if(which&CONTAINED) {
321                     if(which&FWD) {
322                         spanLengths[i]=makeSpanLengthByte(spanLength);
323                     }
324                     if(which&BACK) {
325                         spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED);
326                         spanBackLengths[i]=makeSpanLengthByte(spanLength);
327                     }
328                 } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
329                     spanLengths[i]=spanBackLengths[i]=0;  // Only store a relevant/irrelevant flag.
330                 }
331             }
332             if(which&UTF8) {
333                 uint8_t *s8=utf8+utf8Count;
334                 int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
335                 utf8Count+=utf8Lengths[i]=length8;
336                 if(length8==0) {  // Irrelevant for UTF-8 because not representable in UTF-8.
337                     spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED;
338                 } else {  // Relevant for UTF-8.
339                     if(which&CONTAINED) {
340                         if(which&FWD) {
341                             spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
342                             spanUTF8Lengths[i]=makeSpanLengthByte(spanLength);
343                         }
344                         if(which&BACK) {
345                             spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
346                             spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength);
347                         }
348                     } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
349                         spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0;  // Only store a relevant/irrelevant flag.
350                     }
351                 }
352             }
353             if(which&NOT_CONTAINED) {
354                 // Add string start and end code points to the spanNotSet so that
355                 // a span(while not contained) stops before any string.
356                 UChar32 c;
357                 if(which&FWD) {
358                     int32_t len=0;
359                     U16_NEXT(s16, len, length16, c);
360                     addToSpanNotSet(c);
361                 }
362                 if(which&BACK) {
363                     int32_t len=length16;
364                     U16_PREV(s16, 0, len, c);
365                     addToSpanNotSet(c);
366                 }
367             }
368         } else {  // Irrelevant string. (Also the empty string.)
369             if(which&UTF8) {
370                 if(which&CONTAINED) {  // Only necessary for LONGEST_MATCH.
371                     uint8_t *s8=utf8+utf8Count;
372                     int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
373                     utf8Count+=utf8Lengths[i]=length8;
374                 } else {
375                     utf8Lengths[i]=0;
376                 }
377             }
378             if(all) {
379                 spanLengths[i]=spanBackLengths[i]=
380                     spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=
381                         (uint8_t)ALL_CP_CONTAINED;
382             } else {
383                 // All spanXYZLengths pointers contain the same address.
384                 spanLengths[i]=(uint8_t)ALL_CP_CONTAINED;
385             }
386         }
387     }
388 
389     // Finish.
390     if(all) {
391         pSpanNotSet->freeze();
392     }
393 }
394 
395 // Copy constructor. Assumes which==ALL for a frozen set.
UnicodeSetStringSpan(const UnicodeSetStringSpan & otherStringSpan,const UVector & newParentSetStrings)396 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan,
397                                            const UVector &newParentSetStrings)
398         : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParentSetStrings),
399           utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
400           utf8Length(otherStringSpan.utf8Length),
401           maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8),
402           all(TRUE) {
403     if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) {
404         pSpanNotSet=&spanSet;
405     } else {
406         pSpanNotSet=otherStringSpan.pSpanNotSet->clone();
407     }
408 
409     // Allocate a block of meta data.
410     // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
411     int32_t stringsLength=strings.size();
412     int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
413     if(allocSize<=(int32_t)sizeof(staticLengths)) {
414         utf8Lengths=staticLengths;
415     } else {
416         utf8Lengths=(int32_t *)uprv_malloc(allocSize);
417         if(utf8Lengths==NULL) {
418             maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
419             return;  // Out of memory.
420         }
421     }
422 
423     spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
424     utf8=spanLengths+stringsLength*4;
425     uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize);
426 }
427 
~UnicodeSetStringSpan()428 UnicodeSetStringSpan::~UnicodeSetStringSpan() {
429     if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) {
430         delete pSpanNotSet;
431     }
432     if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) {
433         uprv_free(utf8Lengths);
434     }
435 }
436 
addToSpanNotSet(UChar32 c)437 void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) {
438     if(pSpanNotSet==NULL || pSpanNotSet==&spanSet) {
439         if(spanSet.contains(c)) {
440             return;  // Nothing to do.
441         }
442         UnicodeSet *newSet=spanSet.cloneAsThawed();
443         if(newSet==NULL) {
444             return;  // Out of memory.
445         } else {
446             pSpanNotSet=newSet;
447         }
448     }
449     pSpanNotSet->add(c);
450 }
451 
452 // Compare strings without any argument checks. Requires length>0.
453 static inline UBool
matches16(const UChar * s,const UChar * t,int32_t length)454 matches16(const UChar *s, const UChar *t, int32_t length) {
455     do {
456         if(*s++!=*t++) {
457             return FALSE;
458         }
459     } while(--length>0);
460     return TRUE;
461 }
462 
463 static inline UBool
matches8(const uint8_t * s,const uint8_t * t,int32_t length)464 matches8(const uint8_t *s, const uint8_t *t, int32_t length) {
465     do {
466         if(*s++!=*t++) {
467             return FALSE;
468         }
469     } while(--length>0);
470     return TRUE;
471 }
472 
473 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
474 // at code point boundaries.
475 // That is, each edge of a match must not be in the middle of a surrogate pair.
476 static inline UBool
matches16CPB(const UChar * s,int32_t start,int32_t limit,const UChar * t,int32_t length)477 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32_t length) {
478     s+=start;
479     limit-=start;
480     return matches16(s, t, length) &&
481            !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
482            !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
483 }
484 
485 // Does the set contain the next code point?
486 // If so, return its length; otherwise return its negative length.
487 static inline int32_t
spanOne(const UnicodeSet & set,const UChar * s,int32_t length)488 spanOne(const UnicodeSet &set, const UChar *s, int32_t length) {
489     UChar c=*s, c2;
490     if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) {
491         return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2;
492     }
493     return set.contains(c) ? 1 : -1;
494 }
495 
496 static inline int32_t
spanOneBack(const UnicodeSet & set,const UChar * s,int32_t length)497 spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) {
498     UChar c=s[length-1], c2;
499     if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) {
500         return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2;
501     }
502     return set.contains(c) ? 1 : -1;
503 }
504 
505 static inline int32_t
spanOneUTF8(const UnicodeSet & set,const uint8_t * s,int32_t length)506 spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
507     UChar32 c=*s;
508     if(U8_IS_SINGLE(c)) {
509         return set.contains(c) ? 1 : -1;
510     }
511     // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD().
512     int32_t i=0;
513     U8_NEXT_OR_FFFD(s, i, length, c);
514     return set.contains(c) ? i : -i;
515 }
516 
517 static inline int32_t
spanOneBackUTF8(const UnicodeSet & set,const uint8_t * s,int32_t length)518 spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
519     UChar32 c=s[length-1];
520     if(U8_IS_SINGLE(c)) {
521         return set.contains(c) ? 1 : -1;
522     }
523     int32_t i=length-1;
524     c=utf8_prevCharSafeBody(s, 0, &i, c, -3);
525     length-=i;
526     return set.contains(c) ? length : -length;
527 }
528 
529 /*
530  * Note: In span() when spanLength==0 (after a string match, or at the beginning
531  * after an empty code point span) and in spanNot() and spanNotUTF8(),
532  * string matching could use a binary search
533  * because all string matches are done from the same start index.
534  *
535  * For UTF-8, this would require a comparison function that returns UTF-16 order.
536  *
537  * This optimization should not be necessary for normal UnicodeSets because
538  * most sets have no strings, and most sets with strings have
539  * very few very short strings.
540  * For cases with many strings, it might be better to use a different API
541  * and implementation with a DFA (state machine).
542  */
543 
544 /*
545  * Algorithm for span(USET_SPAN_CONTAINED)
546  *
547  * Theoretical algorithm:
548  * - Iterate through the string, and at each code point boundary:
549  *   + If the code point there is in the set, then remember to continue after it.
550  *   + If a set string matches at the current position, then remember to continue after it.
551  *   + Either recursively span for each code point or string match,
552  *     or recursively span for all but the shortest one and
553  *     iteratively continue the span with the shortest local match.
554  *   + Remember the longest recursive span (the farthest end point).
555  *   + If there is no match at the current position, neither for the code point there
556  *     nor for any set string, then stop and return the longest recursive span length.
557  *
558  * Optimized implementation:
559  *
560  * (We assume that most sets will have very few very short strings.
561  * A span using a string-less set is extremely fast.)
562  *
563  * Create and cache a spanSet which contains all of the single code points
564  * of the original set but none of its strings.
565  *
566  * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
567  * - Loop:
568  *   + Try to match each set string at the end of the spanLength.
569  *     ~ Set strings that start with set-contained code points must be matched
570  *       with a partial overlap because the recursive algorithm would have tried
571  *       to match them at every position.
572  *     ~ Set strings that entirely consist of set-contained code points
573  *       are irrelevant for span(USET_SPAN_CONTAINED) because the
574  *       recursive algorithm would continue after them anyway
575  *       and find the longest recursive match from their end.
576  *     ~ Rather than recursing, note each end point of a set string match.
577  *   + If no set string matched after spanSet.span(), then return
578  *     with where the spanSet.span() ended.
579  *   + If at least one set string matched after spanSet.span(), then
580  *     pop the shortest string match end point and continue
581  *     the loop, trying to match all set strings from there.
582  *   + If at least one more set string matched after a previous string match,
583  *     then test if the code point after the previous string match is also
584  *     contained in the set.
585  *     Continue the loop with the shortest end point of either this code point
586  *     or a matching set string.
587  *   + If no more set string matched after a previous string match,
588  *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
589  *     Stop if spanLength==0, otherwise continue the loop.
590  *
591  * By noting each end point of a set string match,
592  * the function visits each string position at most once and finishes
593  * in linear time.
594  *
595  * The recursive algorithm may visit the same string position many times
596  * if multiple paths lead to it and finishes in exponential time.
597  */
598 
599 /*
600  * Algorithm for span(USET_SPAN_SIMPLE)
601  *
602  * Theoretical algorithm:
603  * - Iterate through the string, and at each code point boundary:
604  *   + If the code point there is in the set, then remember to continue after it.
605  *   + If a set string matches at the current position, then remember to continue after it.
606  *   + Continue from the farthest match position and ignore all others.
607  *   + If there is no match at the current position,
608  *     then stop and return the current position.
609  *
610  * Optimized implementation:
611  *
612  * (Same assumption and spanSet as above.)
613  *
614  * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
615  * - Loop:
616  *   + Try to match each set string at the end of the spanLength.
617  *     ~ Set strings that start with set-contained code points must be matched
618  *       with a partial overlap because the standard algorithm would have tried
619  *       to match them earlier.
620  *     ~ Set strings that entirely consist of set-contained code points
621  *       must be matched with a full overlap because the longest-match algorithm
622  *       would hide set string matches that end earlier.
623  *       Such set strings need not be matched earlier inside the code point span
624  *       because the standard algorithm would then have continued after
625  *       the set string match anyway.
626  *     ~ Remember the longest set string match (farthest end point) from the earliest
627  *       starting point.
628  *   + If no set string matched after spanSet.span(), then return
629  *     with where the spanSet.span() ended.
630  *   + If at least one set string matched, then continue the loop after the
631  *     longest match from the earliest position.
632  *   + If no more set string matched after a previous string match,
633  *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
634  *     Stop if spanLength==0, otherwise continue the loop.
635  */
636 
span(const UChar * s,int32_t length,USetSpanCondition spanCondition) const637 int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
638     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
639         return spanNot(s, length);
640     }
641     int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED);
642     if(spanLength==length) {
643         return length;
644     }
645 
646     // Consider strings; they may overlap with the span.
647     OffsetList offsets;
648     if(spanCondition==USET_SPAN_CONTAINED) {
649         // Use offset list to try all possibilities.
650         offsets.setMaxLength(maxLength16);
651     }
652     int32_t pos=spanLength, rest=length-pos;
653     int32_t i, stringsLength=strings.size();
654     for(;;) {
655         if(spanCondition==USET_SPAN_CONTAINED) {
656             for(i=0; i<stringsLength; ++i) {
657                 int32_t overlap=spanLengths[i];
658                 if(overlap==ALL_CP_CONTAINED) {
659                     continue;  // Irrelevant string. (Also the empty string.)
660                 }
661                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
662                 const UChar *s16=string.getBuffer();
663                 int32_t length16=string.length();
664                 U_ASSERT(length>0);
665 
666                 // Try to match this string at pos-overlap..pos.
667                 if(overlap>=LONG_SPAN) {
668                     overlap=length16;
669                     // While contained: No point matching fully inside the code point span.
670                     U16_BACK_1(s16, 0, overlap);  // Length of the string minus the last code point.
671                 }
672                 if(overlap>spanLength) {
673                     overlap=spanLength;
674                 }
675                 int32_t inc=length16-overlap;  // Keep overlap+inc==length16.
676                 for(;;) {
677                     if(inc>rest) {
678                         break;
679                     }
680                     // Try to match if the increment is not listed already.
681                     if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) {
682                         if(inc==rest) {
683                             return length;  // Reached the end of the string.
684                         }
685                         offsets.addOffset(inc);
686                     }
687                     if(overlap==0) {
688                         break;
689                     }
690                     --overlap;
691                     ++inc;
692                 }
693             }
694         } else /* USET_SPAN_SIMPLE */ {
695             int32_t maxInc=0, maxOverlap=0;
696             for(i=0; i<stringsLength; ++i) {
697                 int32_t overlap=spanLengths[i];
698                 // For longest match, we do need to try to match even an all-contained string
699                 // to find the match from the earliest start.
700 
701                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
702                 const UChar *s16=string.getBuffer();
703                 int32_t length16=string.length();
704                 if (length16==0) {
705                     continue;  // skip the empty string
706                 }
707 
708                 // Try to match this string at pos-overlap..pos.
709                 if(overlap>=LONG_SPAN) {
710                     overlap=length16;
711                     // Longest match: Need to match fully inside the code point span
712                     // to find the match from the earliest start.
713                 }
714                 if(overlap>spanLength) {
715                     overlap=spanLength;
716                 }
717                 int32_t inc=length16-overlap;  // Keep overlap+inc==length16.
718                 for(;;) {
719                     if(inc>rest || overlap<maxOverlap) {
720                         break;
721                     }
722                     // Try to match if the string is longer or starts earlier.
723                     if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
724                         matches16CPB(s, pos-overlap, length, s16, length16)
725                     ) {
726                         maxInc=inc;  // Longest match from earliest start.
727                         maxOverlap=overlap;
728                         break;
729                     }
730                     --overlap;
731                     ++inc;
732                 }
733             }
734 
735             if(maxInc!=0 || maxOverlap!=0) {
736                 // Longest-match algorithm, and there was a string match.
737                 // Simply continue after it.
738                 pos+=maxInc;
739                 rest-=maxInc;
740                 if(rest==0) {
741                     return length;  // Reached the end of the string.
742                 }
743                 spanLength=0;  // Match strings from after a string match.
744                 continue;
745             }
746         }
747         // Finished trying to match all strings at pos.
748 
749         if(spanLength!=0 || pos==0) {
750             // The position is after an unlimited code point span (spanLength!=0),
751             // not after a string match.
752             // The only position where spanLength==0 after a span is pos==0.
753             // Otherwise, an unlimited code point span is only tried again when no
754             // strings match, and if such a non-initial span fails we stop.
755             if(offsets.isEmpty()) {
756                 return pos;  // No strings matched after a span.
757             }
758             // Match strings from after the next string match.
759         } else {
760             // The position is after a string match (or a single code point).
761             if(offsets.isEmpty()) {
762                 // No more strings matched after a previous string match.
763                 // Try another code point span from after the last string match.
764                 spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED);
765                 if( spanLength==rest || // Reached the end of the string, or
766                     spanLength==0       // neither strings nor span progressed.
767                 ) {
768                     return pos+spanLength;
769                 }
770                 pos+=spanLength;
771                 rest-=spanLength;
772                 continue;  // spanLength>0: Match strings from after a span.
773             } else {
774                 // Try to match only one code point from after a string match if some
775                 // string matched beyond it, so that we try all possible positions
776                 // and don't overshoot.
777                 spanLength=spanOne(spanSet, s+pos, rest);
778                 if(spanLength>0) {
779                     if(spanLength==rest) {
780                         return length;  // Reached the end of the string.
781                     }
782                     // Match strings after this code point.
783                     // There cannot be any increments below it because UnicodeSet strings
784                     // contain multiple code points.
785                     pos+=spanLength;
786                     rest-=spanLength;
787                     offsets.shift(spanLength);
788                     spanLength=0;
789                     continue;  // Match strings from after a single code point.
790                 }
791                 // Match strings from after the next string match.
792             }
793         }
794         int32_t minOffset=offsets.popMinimum();
795         pos+=minOffset;
796         rest-=minOffset;
797         spanLength=0;  // Match strings from after a string match.
798     }
799 }
800 
spanBack(const UChar * s,int32_t length,USetSpanCondition spanCondition) const801 int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
802     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
803         return spanNotBack(s, length);
804     }
805     int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED);
806     if(pos==0) {
807         return 0;
808     }
809     int32_t spanLength=length-pos;
810 
811     // Consider strings; they may overlap with the span.
812     OffsetList offsets;
813     if(spanCondition==USET_SPAN_CONTAINED) {
814         // Use offset list to try all possibilities.
815         offsets.setMaxLength(maxLength16);
816     }
817     int32_t i, stringsLength=strings.size();
818     uint8_t *spanBackLengths=spanLengths;
819     if(all) {
820         spanBackLengths+=stringsLength;
821     }
822     for(;;) {
823         if(spanCondition==USET_SPAN_CONTAINED) {
824             for(i=0; i<stringsLength; ++i) {
825                 int32_t overlap=spanBackLengths[i];
826                 if(overlap==ALL_CP_CONTAINED) {
827                     continue;  // Irrelevant string. (Also the empty string.)
828                 }
829                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
830                 const UChar *s16=string.getBuffer();
831                 int32_t length16=string.length();
832                 U_ASSERT(length>0);
833 
834                 // Try to match this string at pos-(length16-overlap)..pos-length16.
835                 if(overlap>=LONG_SPAN) {
836                     overlap=length16;
837                     // While contained: No point matching fully inside the code point span.
838                     int32_t len1=0;
839                     U16_FWD_1(s16, len1, overlap);
840                     overlap-=len1;  // Length of the string minus the first code point.
841                 }
842                 if(overlap>spanLength) {
843                     overlap=spanLength;
844                 }
845                 int32_t dec=length16-overlap;  // Keep dec+overlap==length16.
846                 for(;;) {
847                     if(dec>pos) {
848                         break;
849                     }
850                     // Try to match if the decrement is not listed already.
851                     if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) {
852                         if(dec==pos) {
853                             return 0;  // Reached the start of the string.
854                         }
855                         offsets.addOffset(dec);
856                     }
857                     if(overlap==0) {
858                         break;
859                     }
860                     --overlap;
861                     ++dec;
862                 }
863             }
864         } else /* USET_SPAN_SIMPLE */ {
865             int32_t maxDec=0, maxOverlap=0;
866             for(i=0; i<stringsLength; ++i) {
867                 int32_t overlap=spanBackLengths[i];
868                 // For longest match, we do need to try to match even an all-contained string
869                 // to find the match from the latest end.
870 
871                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
872                 const UChar *s16=string.getBuffer();
873                 int32_t length16=string.length();
874                 if (length16==0) {
875                     continue;  // skip the empty string
876                 }
877 
878                 // Try to match this string at pos-(length16-overlap)..pos-length16.
879                 if(overlap>=LONG_SPAN) {
880                     overlap=length16;
881                     // Longest match: Need to match fully inside the code point span
882                     // to find the match from the latest end.
883                 }
884                 if(overlap>spanLength) {
885                     overlap=spanLength;
886                 }
887                 int32_t dec=length16-overlap;  // Keep dec+overlap==length16.
888                 for(;;) {
889                     if(dec>pos || overlap<maxOverlap) {
890                         break;
891                     }
892                     // Try to match if the string is longer or ends later.
893                     if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
894                         matches16CPB(s, pos-dec, length, s16, length16)
895                     ) {
896                         maxDec=dec;  // Longest match from latest end.
897                         maxOverlap=overlap;
898                         break;
899                     }
900                     --overlap;
901                     ++dec;
902                 }
903             }
904 
905             if(maxDec!=0 || maxOverlap!=0) {
906                 // Longest-match algorithm, and there was a string match.
907                 // Simply continue before it.
908                 pos-=maxDec;
909                 if(pos==0) {
910                     return 0;  // Reached the start of the string.
911                 }
912                 spanLength=0;  // Match strings from before a string match.
913                 continue;
914             }
915         }
916         // Finished trying to match all strings at pos.
917 
918         if(spanLength!=0 || pos==length) {
919             // The position is before an unlimited code point span (spanLength!=0),
920             // not before a string match.
921             // The only position where spanLength==0 before a span is pos==length.
922             // Otherwise, an unlimited code point span is only tried again when no
923             // strings match, and if such a non-initial span fails we stop.
924             if(offsets.isEmpty()) {
925                 return pos;  // No strings matched before a span.
926             }
927             // Match strings from before the next string match.
928         } else {
929             // The position is before a string match (or a single code point).
930             if(offsets.isEmpty()) {
931                 // No more strings matched before a previous string match.
932                 // Try another code point span from before the last string match.
933                 int32_t oldPos=pos;
934                 pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED);
935                 spanLength=oldPos-pos;
936                 if( pos==0 ||           // Reached the start of the string, or
937                     spanLength==0       // neither strings nor span progressed.
938                 ) {
939                     return pos;
940                 }
941                 continue;  // spanLength>0: Match strings from before a span.
942             } else {
943                 // Try to match only one code point from before a string match if some
944                 // string matched beyond it, so that we try all possible positions
945                 // and don't overshoot.
946                 spanLength=spanOneBack(spanSet, s, pos);
947                 if(spanLength>0) {
948                     if(spanLength==pos) {
949                         return 0;  // Reached the start of the string.
950                     }
951                     // Match strings before this code point.
952                     // There cannot be any decrements below it because UnicodeSet strings
953                     // contain multiple code points.
954                     pos-=spanLength;
955                     offsets.shift(spanLength);
956                     spanLength=0;
957                     continue;  // Match strings from before a single code point.
958                 }
959                 // Match strings from before the next string match.
960             }
961         }
962         pos-=offsets.popMinimum();
963         spanLength=0;  // Match strings from before a string match.
964     }
965 }
966 
spanUTF8(const uint8_t * s,int32_t length,USetSpanCondition spanCondition) const967 int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
968     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
969         return spanNotUTF8(s, length);
970     }
971     int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED);
972     if(spanLength==length) {
973         return length;
974     }
975 
976     // Consider strings; they may overlap with the span.
977     OffsetList offsets;
978     if(spanCondition==USET_SPAN_CONTAINED) {
979         // Use offset list to try all possibilities.
980         offsets.setMaxLength(maxLength8);
981     }
982     int32_t pos=spanLength, rest=length-pos;
983     int32_t i, stringsLength=strings.size();
984     uint8_t *spanUTF8Lengths=spanLengths;
985     if(all) {
986         spanUTF8Lengths+=2*stringsLength;
987     }
988     for(;;) {
989         const uint8_t *s8=utf8;
990         int32_t length8;
991         if(spanCondition==USET_SPAN_CONTAINED) {
992             for(i=0; i<stringsLength; ++i) {
993                 length8=utf8Lengths[i];
994                 if(length8==0) {
995                     continue;  // String not representable in UTF-8.
996                 }
997                 int32_t overlap=spanUTF8Lengths[i];
998                 if(overlap==ALL_CP_CONTAINED) {
999                     s8+=length8;
1000                     continue;  // Irrelevant string.
1001                 }
1002 
1003                 // Try to match this string at pos-overlap..pos.
1004                 if(overlap>=LONG_SPAN) {
1005                     overlap=length8;
1006                     // While contained: No point matching fully inside the code point span.
1007                     U8_BACK_1(s8, 0, overlap);  // Length of the string minus the last code point.
1008                 }
1009                 if(overlap>spanLength) {
1010                     overlap=spanLength;
1011                 }
1012                 int32_t inc=length8-overlap;  // Keep overlap+inc==length8.
1013                 for(;;) {
1014                     if(inc>rest) {
1015                         break;
1016                     }
1017                     // Try to match if the increment is not listed already.
1018                     // Match at code point boundaries. (The UTF-8 strings were converted
1019                     // from UTF-16 and are guaranteed to be well-formed.)
1020                     if(!U8_IS_TRAIL(s[pos-overlap]) &&
1021                             !offsets.containsOffset(inc) &&
1022                             matches8(s+pos-overlap, s8, length8)) {
1023                         if(inc==rest) {
1024                             return length;  // Reached the end of the string.
1025                         }
1026                         offsets.addOffset(inc);
1027                     }
1028                     if(overlap==0) {
1029                         break;
1030                     }
1031                     --overlap;
1032                     ++inc;
1033                 }
1034                 s8+=length8;
1035             }
1036         } else /* USET_SPAN_SIMPLE */ {
1037             int32_t maxInc=0, maxOverlap=0;
1038             for(i=0; i<stringsLength; ++i) {
1039                 length8=utf8Lengths[i];
1040                 if(length8==0) {
1041                     continue;  // String not representable in UTF-8.
1042                 }
1043                 int32_t overlap=spanUTF8Lengths[i];
1044                 // For longest match, we do need to try to match even an all-contained string
1045                 // to find the match from the earliest start.
1046 
1047                 // Try to match this string at pos-overlap..pos.
1048                 if(overlap>=LONG_SPAN) {
1049                     overlap=length8;
1050                     // Longest match: Need to match fully inside the code point span
1051                     // to find the match from the earliest start.
1052                 }
1053                 if(overlap>spanLength) {
1054                     overlap=spanLength;
1055                 }
1056                 int32_t inc=length8-overlap;  // Keep overlap+inc==length8.
1057                 for(;;) {
1058                     if(inc>rest || overlap<maxOverlap) {
1059                         break;
1060                     }
1061                     // Try to match if the string is longer or starts earlier.
1062                     // Match at code point boundaries. (The UTF-8 strings were converted
1063                     // from UTF-16 and are guaranteed to be well-formed.)
1064                     if(!U8_IS_TRAIL(s[pos-overlap]) &&
1065                             (overlap>maxOverlap ||
1066                                 /* redundant overlap==maxOverlap && */ inc>maxInc) &&
1067                             matches8(s+pos-overlap, s8, length8)) {
1068                         maxInc=inc;  // Longest match from earliest start.
1069                         maxOverlap=overlap;
1070                         break;
1071                     }
1072                     --overlap;
1073                     ++inc;
1074                 }
1075                 s8+=length8;
1076             }
1077 
1078             if(maxInc!=0 || maxOverlap!=0) {
1079                 // Longest-match algorithm, and there was a string match.
1080                 // Simply continue after it.
1081                 pos+=maxInc;
1082                 rest-=maxInc;
1083                 if(rest==0) {
1084                     return length;  // Reached the end of the string.
1085                 }
1086                 spanLength=0;  // Match strings from after a string match.
1087                 continue;
1088             }
1089         }
1090         // Finished trying to match all strings at pos.
1091 
1092         if(spanLength!=0 || pos==0) {
1093             // The position is after an unlimited code point span (spanLength!=0),
1094             // not after a string match.
1095             // The only position where spanLength==0 after a span is pos==0.
1096             // Otherwise, an unlimited code point span is only tried again when no
1097             // strings match, and if such a non-initial span fails we stop.
1098             if(offsets.isEmpty()) {
1099                 return pos;  // No strings matched after a span.
1100             }
1101             // Match strings from after the next string match.
1102         } else {
1103             // The position is after a string match (or a single code point).
1104             if(offsets.isEmpty()) {
1105                 // No more strings matched after a previous string match.
1106                 // Try another code point span from after the last string match.
1107                 spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED);
1108                 if( spanLength==rest || // Reached the end of the string, or
1109                     spanLength==0       // neither strings nor span progressed.
1110                 ) {
1111                     return pos+spanLength;
1112                 }
1113                 pos+=spanLength;
1114                 rest-=spanLength;
1115                 continue;  // spanLength>0: Match strings from after a span.
1116             } else {
1117                 // Try to match only one code point from after a string match if some
1118                 // string matched beyond it, so that we try all possible positions
1119                 // and don't overshoot.
1120                 spanLength=spanOneUTF8(spanSet, s+pos, rest);
1121                 if(spanLength>0) {
1122                     if(spanLength==rest) {
1123                         return length;  // Reached the end of the string.
1124                     }
1125                     // Match strings after this code point.
1126                     // There cannot be any increments below it because UnicodeSet strings
1127                     // contain multiple code points.
1128                     pos+=spanLength;
1129                     rest-=spanLength;
1130                     offsets.shift(spanLength);
1131                     spanLength=0;
1132                     continue;  // Match strings from after a single code point.
1133                 }
1134                 // Match strings from after the next string match.
1135             }
1136         }
1137         int32_t minOffset=offsets.popMinimum();
1138         pos+=minOffset;
1139         rest-=minOffset;
1140         spanLength=0;  // Match strings from after a string match.
1141     }
1142 }
1143 
spanBackUTF8(const uint8_t * s,int32_t length,USetSpanCondition spanCondition) const1144 int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
1145     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
1146         return spanNotBackUTF8(s, length);
1147     }
1148     int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED);
1149     if(pos==0) {
1150         return 0;
1151     }
1152     int32_t spanLength=length-pos;
1153 
1154     // Consider strings; they may overlap with the span.
1155     OffsetList offsets;
1156     if(spanCondition==USET_SPAN_CONTAINED) {
1157         // Use offset list to try all possibilities.
1158         offsets.setMaxLength(maxLength8);
1159     }
1160     int32_t i, stringsLength=strings.size();
1161     uint8_t *spanBackUTF8Lengths=spanLengths;
1162     if(all) {
1163         spanBackUTF8Lengths+=3*stringsLength;
1164     }
1165     for(;;) {
1166         const uint8_t *s8=utf8;
1167         int32_t length8;
1168         if(spanCondition==USET_SPAN_CONTAINED) {
1169             for(i=0; i<stringsLength; ++i) {
1170                 length8=utf8Lengths[i];
1171                 if(length8==0) {
1172                     continue;  // String not representable in UTF-8.
1173                 }
1174                 int32_t overlap=spanBackUTF8Lengths[i];
1175                 if(overlap==ALL_CP_CONTAINED) {
1176                     s8+=length8;
1177                     continue;  // Irrelevant string.
1178                 }
1179 
1180                 // Try to match this string at pos-(length8-overlap)..pos-length8.
1181                 if(overlap>=LONG_SPAN) {
1182                     overlap=length8;
1183                     // While contained: No point matching fully inside the code point span.
1184                     int32_t len1=0;
1185                     U8_FWD_1(s8, len1, overlap);
1186                     overlap-=len1;  // Length of the string minus the first code point.
1187                 }
1188                 if(overlap>spanLength) {
1189                     overlap=spanLength;
1190                 }
1191                 int32_t dec=length8-overlap;  // Keep dec+overlap==length8.
1192                 for(;;) {
1193                     if(dec>pos) {
1194                         break;
1195                     }
1196                     // Try to match if the decrement is not listed already.
1197                     // Match at code point boundaries. (The UTF-8 strings were converted
1198                     // from UTF-16 and are guaranteed to be well-formed.)
1199                     if( !U8_IS_TRAIL(s[pos-dec]) &&
1200                         !offsets.containsOffset(dec) &&
1201                         matches8(s+pos-dec, s8, length8)
1202                     ) {
1203                         if(dec==pos) {
1204                             return 0;  // Reached the start of the string.
1205                         }
1206                         offsets.addOffset(dec);
1207                     }
1208                     if(overlap==0) {
1209                         break;
1210                     }
1211                     --overlap;
1212                     ++dec;
1213                 }
1214                 s8+=length8;
1215             }
1216         } else /* USET_SPAN_SIMPLE */ {
1217             int32_t maxDec=0, maxOverlap=0;
1218             for(i=0; i<stringsLength; ++i) {
1219                 length8=utf8Lengths[i];
1220                 if(length8==0) {
1221                     continue;  // String not representable in UTF-8.
1222                 }
1223                 int32_t overlap=spanBackUTF8Lengths[i];
1224                 // For longest match, we do need to try to match even an all-contained string
1225                 // to find the match from the latest end.
1226 
1227                 // Try to match this string at pos-(length8-overlap)..pos-length8.
1228                 if(overlap>=LONG_SPAN) {
1229                     overlap=length8;
1230                     // Longest match: Need to match fully inside the code point span
1231                     // to find the match from the latest end.
1232                 }
1233                 if(overlap>spanLength) {
1234                     overlap=spanLength;
1235                 }
1236                 int32_t dec=length8-overlap;  // Keep dec+overlap==length8.
1237                 for(;;) {
1238                     if(dec>pos || overlap<maxOverlap) {
1239                         break;
1240                     }
1241                     // Try to match if the string is longer or ends later.
1242                     // Match at code point boundaries. (The UTF-8 strings were converted
1243                     // from UTF-16 and are guaranteed to be well-formed.)
1244                     if( !U8_IS_TRAIL(s[pos-dec]) &&
1245                         (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
1246                         matches8(s+pos-dec, s8, length8)
1247                     ) {
1248                         maxDec=dec;  // Longest match from latest end.
1249                         maxOverlap=overlap;
1250                         break;
1251                     }
1252                     --overlap;
1253                     ++dec;
1254                 }
1255                 s8+=length8;
1256             }
1257 
1258             if(maxDec!=0 || maxOverlap!=0) {
1259                 // Longest-match algorithm, and there was a string match.
1260                 // Simply continue before it.
1261                 pos-=maxDec;
1262                 if(pos==0) {
1263                     return 0;  // Reached the start of the string.
1264                 }
1265                 spanLength=0;  // Match strings from before a string match.
1266                 continue;
1267             }
1268         }
1269         // Finished trying to match all strings at pos.
1270 
1271         if(spanLength!=0 || pos==length) {
1272             // The position is before an unlimited code point span (spanLength!=0),
1273             // not before a string match.
1274             // The only position where spanLength==0 before a span is pos==length.
1275             // Otherwise, an unlimited code point span is only tried again when no
1276             // strings match, and if such a non-initial span fails we stop.
1277             if(offsets.isEmpty()) {
1278                 return pos;  // No strings matched before a span.
1279             }
1280             // Match strings from before the next string match.
1281         } else {
1282             // The position is before a string match (or a single code point).
1283             if(offsets.isEmpty()) {
1284                 // No more strings matched before a previous string match.
1285                 // Try another code point span from before the last string match.
1286                 int32_t oldPos=pos;
1287                 pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED);
1288                 spanLength=oldPos-pos;
1289                 if( pos==0 ||           // Reached the start of the string, or
1290                     spanLength==0       // neither strings nor span progressed.
1291                 ) {
1292                     return pos;
1293                 }
1294                 continue;  // spanLength>0: Match strings from before a span.
1295             } else {
1296                 // Try to match only one code point from before a string match if some
1297                 // string matched beyond it, so that we try all possible positions
1298                 // and don't overshoot.
1299                 spanLength=spanOneBackUTF8(spanSet, s, pos);
1300                 if(spanLength>0) {
1301                     if(spanLength==pos) {
1302                         return 0;  // Reached the start of the string.
1303                     }
1304                     // Match strings before this code point.
1305                     // There cannot be any decrements below it because UnicodeSet strings
1306                     // contain multiple code points.
1307                     pos-=spanLength;
1308                     offsets.shift(spanLength);
1309                     spanLength=0;
1310                     continue;  // Match strings from before a single code point.
1311                 }
1312                 // Match strings from before the next string match.
1313             }
1314         }
1315         pos-=offsets.popMinimum();
1316         spanLength=0;  // Match strings from before a string match.
1317     }
1318 }
1319 
1320 /*
1321  * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED)
1322  *
1323  * Theoretical algorithm:
1324  * - Iterate through the string, and at each code point boundary:
1325  *   + If the code point there is in the set, then return with the current position.
1326  *   + If a set string matches at the current position, then return with the current position.
1327  *
1328  * Optimized implementation:
1329  *
1330  * (Same assumption as for span() above.)
1331  *
1332  * Create and cache a spanNotSet which contains all of the single code points
1333  * of the original set but none of its strings.
1334  * For each set string add its initial code point to the spanNotSet.
1335  * (Also add its final code point for spanNotBack().)
1336  *
1337  * - Loop:
1338  *   + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED).
1339  *   + If the current code point is in the original set, then
1340  *     return the current position.
1341  *   + If any set string matches at the current position, then
1342  *     return the current position.
1343  *   + If there is no match at the current position, neither for the code point there
1344  *     nor for any set string, then skip this code point and continue the loop.
1345  *     This happens for set-string-initial code points that were added to spanNotSet
1346  *     when there is not actually a match for such a set string.
1347  */
1348 
spanNot(const UChar * s,int32_t length) const1349 int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const {
1350     int32_t pos=0, rest=length;
1351     int32_t i, stringsLength=strings.size();
1352     do {
1353         // Span until we find a code point from the set,
1354         // or a code point that starts or ends some string.
1355         i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED);
1356         if(i==rest) {
1357             return length;  // Reached the end of the string.
1358         }
1359         pos+=i;
1360         rest-=i;
1361 
1362         // Check whether the current code point is in the original set,
1363         // without the string starts and ends.
1364         int32_t cpLength=spanOne(spanSet, s+pos, rest);
1365         if(cpLength>0) {
1366             return pos;  // There is a set element at pos.
1367         }
1368 
1369         // Try to match the strings at pos.
1370         for(i=0; i<stringsLength; ++i) {
1371             if(spanLengths[i]==ALL_CP_CONTAINED) {
1372                 continue;  // Irrelevant string. (Also the empty string.)
1373             }
1374             const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
1375             const UChar *s16=string.getBuffer();
1376             int32_t length16=string.length();
1377             U_ASSERT(length>0);
1378             if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) {
1379                 return pos;  // There is a set element at pos.
1380             }
1381         }
1382 
1383         // The span(while not contained) ended on a string start/end which is
1384         // not in the original set. Skip this code point and continue.
1385         // cpLength<0
1386         pos-=cpLength;
1387         rest+=cpLength;
1388     } while(rest!=0);
1389     return length;  // Reached the end of the string.
1390 }
1391 
spanNotBack(const UChar * s,int32_t length) const1392 int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const {
1393     int32_t pos=length;
1394     int32_t i, stringsLength=strings.size();
1395     do {
1396         // Span until we find a code point from the set,
1397         // or a code point that starts or ends some string.
1398         pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED);
1399         if(pos==0) {
1400             return 0;  // Reached the start of the string.
1401         }
1402 
1403         // Check whether the current code point is in the original set,
1404         // without the string starts and ends.
1405         int32_t cpLength=spanOneBack(spanSet, s, pos);
1406         if(cpLength>0) {
1407             return pos;  // There is a set element at pos.
1408         }
1409 
1410         // Try to match the strings at pos.
1411         for(i=0; i<stringsLength; ++i) {
1412             // Use spanLengths rather than a spanBackLengths pointer because
1413             // it is easier and we only need to know whether the string is irrelevant
1414             // which is the same in either array.
1415             if(spanLengths[i]==ALL_CP_CONTAINED) {
1416                 continue;  // Irrelevant string. (Also the empty string.)
1417             }
1418             const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
1419             const UChar *s16=string.getBuffer();
1420             int32_t length16=string.length();
1421             U_ASSERT(length>0);
1422             if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) {
1423                 return pos;  // There is a set element at pos.
1424             }
1425         }
1426 
1427         // The span(while not contained) ended on a string start/end which is
1428         // not in the original set. Skip this code point and continue.
1429         // cpLength<0
1430         pos+=cpLength;
1431     } while(pos!=0);
1432     return 0;  // Reached the start of the string.
1433 }
1434 
spanNotUTF8(const uint8_t * s,int32_t length) const1435 int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const {
1436     int32_t pos=0, rest=length;
1437     int32_t i, stringsLength=strings.size();
1438     uint8_t *spanUTF8Lengths=spanLengths;
1439     if(all) {
1440         spanUTF8Lengths+=2*stringsLength;
1441     }
1442     do {
1443         // Span until we find a code point from the set,
1444         // or a code point that starts or ends some string.
1445         i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED);
1446         if(i==rest) {
1447             return length;  // Reached the end of the string.
1448         }
1449         pos+=i;
1450         rest-=i;
1451 
1452         // Check whether the current code point is in the original set,
1453         // without the string starts and ends.
1454         int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest);
1455         if(cpLength>0) {
1456             return pos;  // There is a set element at pos.
1457         }
1458 
1459         // Try to match the strings at pos.
1460         const uint8_t *s8=utf8;
1461         int32_t length8;
1462         for(i=0; i<stringsLength; ++i) {
1463             length8=utf8Lengths[i];
1464             // ALL_CP_CONTAINED: Irrelevant string.
1465             if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) {
1466                 return pos;  // There is a set element at pos.
1467             }
1468             s8+=length8;
1469         }
1470 
1471         // The span(while not contained) ended on a string start/end which is
1472         // not in the original set. Skip this code point and continue.
1473         // cpLength<0
1474         pos-=cpLength;
1475         rest+=cpLength;
1476     } while(rest!=0);
1477     return length;  // Reached the end of the string.
1478 }
1479 
spanNotBackUTF8(const uint8_t * s,int32_t length) const1480 int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const {
1481     int32_t pos=length;
1482     int32_t i, stringsLength=strings.size();
1483     uint8_t *spanBackUTF8Lengths=spanLengths;
1484     if(all) {
1485         spanBackUTF8Lengths+=3*stringsLength;
1486     }
1487     do {
1488         // Span until we find a code point from the set,
1489         // or a code point that starts or ends some string.
1490         pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED);
1491         if(pos==0) {
1492             return 0;  // Reached the start of the string.
1493         }
1494 
1495         // Check whether the current code point is in the original set,
1496         // without the string starts and ends.
1497         int32_t cpLength=spanOneBackUTF8(spanSet, s, pos);
1498         if(cpLength>0) {
1499             return pos;  // There is a set element at pos.
1500         }
1501 
1502         // Try to match the strings at pos.
1503         const uint8_t *s8=utf8;
1504         int32_t length8;
1505         for(i=0; i<stringsLength; ++i) {
1506             length8=utf8Lengths[i];
1507             // ALL_CP_CONTAINED: Irrelevant string.
1508             if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) {
1509                 return pos;  // There is a set element at pos.
1510             }
1511             s8+=length8;
1512         }
1513 
1514         // The span(while not contained) ended on a string start/end which is
1515         // not in the original set. Skip this code point and continue.
1516         // cpLength<0
1517         pos+=cpLength;
1518     } while(pos!=0);
1519     return 0;  // Reached the start of the string.
1520 }
1521 
1522 U_NAMESPACE_END
1523