1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 *
6 *   Copyright (C) 2007-2012, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 ******************************************************************************
10 *   file name:  bmpset.cpp
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2007jan29
16 *   created by: Markus W. Scherer
17 */
18 
19 #include "unicode/utypes.h"
20 #include "unicode/uniset.h"
21 #include "unicode/utf8.h"
22 #include "unicode/utf16.h"
23 #include "cmemory.h"
24 #include "bmpset.h"
25 #include "uassert.h"
26 
27 U_NAMESPACE_BEGIN
28 
BMPSet(const int32_t * parentList,int32_t parentListLength)29 BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
30         list(parentList), listLength(parentListLength) {
31     uprv_memset(latin1Contains, 0, sizeof(latin1Contains));
32     uprv_memset(table7FF, 0, sizeof(table7FF));
33     uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits));
34 
35     /*
36      * Set the list indexes for binary searches for
37      * U+0800, U+1000, U+2000, .., U+F000, U+10000.
38      * U+0800 is the first 3-byte-UTF-8 code point. Lower code points are
39      * looked up in the bit tables.
40      * The last pair of indexes is for finding supplementary code points.
41      */
42     list4kStarts[0]=findCodePoint(0x800, 0, listLength-1);
43     int32_t i;
44     for(i=1; i<=0x10; ++i) {
45         list4kStarts[i]=findCodePoint(i<<12, list4kStarts[i-1], listLength-1);
46     }
47     list4kStarts[0x11]=listLength-1;
48     containsFFFD=containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10]);
49 
50     initBits();
51     overrideIllegal();
52 }
53 
BMPSet(const BMPSet & otherBMPSet,const int32_t * newParentList,int32_t newParentListLength)54 BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) :
55         containsFFFD(otherBMPSet.containsFFFD),
56         list(newParentList), listLength(newParentListLength) {
57     uprv_memcpy(latin1Contains, otherBMPSet.latin1Contains, sizeof(latin1Contains));
58     uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF));
59     uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits));
60     uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts));
61 }
62 
~BMPSet()63 BMPSet::~BMPSet() {
64 }
65 
66 /*
67  * Set bits in a bit rectangle in "vertical" bit organization.
68  * start<limit<=0x800
69  */
set32x64Bits(uint32_t table[64],int32_t start,int32_t limit)70 static void set32x64Bits(uint32_t table[64], int32_t start, int32_t limit) {
71     U_ASSERT(start<limit);
72     U_ASSERT(limit<=0x800);
73 
74     int32_t lead=start>>6;  // Named for UTF-8 2-byte lead byte with upper 5 bits.
75     int32_t trail=start&0x3f;  // Named for UTF-8 2-byte trail byte with lower 6 bits.
76 
77     // Set one bit indicating an all-one block.
78     uint32_t bits=(uint32_t)1<<lead;
79     if((start+1)==limit) {  // Single-character shortcut.
80         table[trail]|=bits;
81         return;
82     }
83 
84     int32_t limitLead=limit>>6;
85     int32_t limitTrail=limit&0x3f;
86 
87     if(lead==limitLead) {
88         // Partial vertical bit column.
89         while(trail<limitTrail) {
90             table[trail++]|=bits;
91         }
92     } else {
93         // Partial vertical bit column,
94         // followed by a bit rectangle,
95         // followed by another partial vertical bit column.
96         if(trail>0) {
97             do {
98                 table[trail++]|=bits;
99             } while(trail<64);
100             ++lead;
101         }
102         if(lead<limitLead) {
103             bits=~(((unsigned)1<<lead)-1);
104             if(limitLead<0x20) {
105                 bits&=((unsigned)1<<limitLead)-1;
106             }
107             for(trail=0; trail<64; ++trail) {
108                 table[trail]|=bits;
109             }
110         }
111         // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0.
112         // In that case, bits=1<<limitLead is undefined but the bits value
113         // is not used because trail<limitTrail is already false.
114         bits=(uint32_t)1<<((limitLead == 0x20) ? (limitLead - 1) : limitLead);
115         for(trail=0; trail<limitTrail; ++trail) {
116             table[trail]|=bits;
117         }
118     }
119 }
120 
initBits()121 void BMPSet::initBits() {
122     UChar32 start, limit;
123     int32_t listIndex=0;
124 
125     // Set latin1Contains[].
126     do {
127         start=list[listIndex++];
128         if(listIndex<listLength) {
129             limit=list[listIndex++];
130         } else {
131             limit=0x110000;
132         }
133         if(start>=0x100) {
134             break;
135         }
136         do {
137             latin1Contains[start++]=1;
138         } while(start<limit && start<0x100);
139     } while(limit<=0x100);
140 
141     // Find the first range overlapping with (or after) 80..FF again,
142     // to include them in table7FF as well.
143     for(listIndex=0;;) {
144         start=list[listIndex++];
145         if(listIndex<listLength) {
146             limit=list[listIndex++];
147         } else {
148             limit=0x110000;
149         }
150         if(limit>0x80) {
151             if(start<0x80) {
152                 start=0x80;
153             }
154             break;
155         }
156     }
157 
158     // Set table7FF[].
159     while(start<0x800) {
160         set32x64Bits(table7FF, start, limit<=0x800 ? limit : 0x800);
161         if(limit>0x800) {
162             start=0x800;
163             break;
164         }
165 
166         start=list[listIndex++];
167         if(listIndex<listLength) {
168             limit=list[listIndex++];
169         } else {
170             limit=0x110000;
171         }
172     }
173 
174     // Set bmpBlockBits[].
175     int32_t minStart=0x800;
176     while(start<0x10000) {
177         if(limit>0x10000) {
178             limit=0x10000;
179         }
180 
181         if(start<minStart) {
182             start=minStart;
183         }
184         if(start<limit) {  // Else: Another range entirely in a known mixed-value block.
185             if(start&0x3f) {
186                 // Mixed-value block of 64 code points.
187                 start>>=6;
188                 bmpBlockBits[start&0x3f]|=0x10001<<(start>>6);
189                 start=(start+1)<<6;  // Round up to the next block boundary.
190                 minStart=start;      // Ignore further ranges in this block.
191             }
192             if(start<limit) {
193                 if(start<(limit&~0x3f)) {
194                     // Multiple all-ones blocks of 64 code points each.
195                     set32x64Bits(bmpBlockBits, start>>6, limit>>6);
196                 }
197 
198                 if(limit&0x3f) {
199                     // Mixed-value block of 64 code points.
200                     limit>>=6;
201                     bmpBlockBits[limit&0x3f]|=0x10001<<(limit>>6);
202                     limit=(limit+1)<<6;  // Round up to the next block boundary.
203                     minStart=limit;      // Ignore further ranges in this block.
204                 }
205             }
206         }
207 
208         if(limit==0x10000) {
209             break;
210         }
211 
212         start=list[listIndex++];
213         if(listIndex<listLength) {
214             limit=list[listIndex++];
215         } else {
216             limit=0x110000;
217         }
218     }
219 }
220 
221 /*
222  * Override some bits and bytes to the result of contains(FFFD)
223  * for faster validity checking at runtime.
224  * No need to set 0 values where they were reset to 0 in the constructor
225  * and not modified by initBits().
226  * (table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
227  * Need to set 0 values for surrogates D800..DFFF.
228  */
overrideIllegal()229 void BMPSet::overrideIllegal() {
230     uint32_t bits, mask;
231     int32_t i;
232 
233     if(containsFFFD) {
234         bits=3;                 // Lead bytes 0xC0 and 0xC1.
235         for(i=0; i<64; ++i) {
236             table7FF[i]|=bits;
237         }
238 
239         bits=1;                 // Lead byte 0xE0.
240         for(i=0; i<32; ++i) {   // First half of 4k block.
241             bmpBlockBits[i]|=bits;
242         }
243 
244         mask= static_cast<uint32_t>(~(0x10001<<0xd));   // Lead byte 0xED.
245         bits=1<<0xd;
246         for(i=32; i<64; ++i) {  // Second half of 4k block.
247             bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits;
248         }
249     } else {
250         mask= static_cast<uint32_t>(~(0x10001<<0xd));   // Lead byte 0xED.
251         for(i=32; i<64; ++i) {  // Second half of 4k block.
252             bmpBlockBits[i]&=mask;
253         }
254     }
255 }
256 
findCodePoint(UChar32 c,int32_t lo,int32_t hi) const257 int32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const {
258     /* Examples:
259                                        findCodePoint(c)
260        set              list[]         c=0 1 3 4 7 8
261        ===              ==============   ===========
262        []               [110000]         0 0 0 0 0 0
263        [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
264        [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
265        [:Any:]          [0, 110000]      1 1 1 1 1 1
266      */
267 
268     // Return the smallest i such that c < list[i].  Assume
269     // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
270     if (c < list[lo])
271         return lo;
272     // High runner test.  c is often after the last range, so an
273     // initial check for this condition pays off.
274     if (lo >= hi || c >= list[hi-1])
275         return hi;
276     // invariant: c >= list[lo]
277     // invariant: c < list[hi]
278     for (;;) {
279         int32_t i = (lo + hi) >> 1;
280         if (i == lo) {
281             break; // Found!
282         } else if (c < list[i]) {
283             hi = i;
284         } else {
285             lo = i;
286         }
287     }
288     return hi;
289 }
290 
291 UBool
contains(UChar32 c) const292 BMPSet::contains(UChar32 c) const {
293     if((uint32_t)c<=0xff) {
294         return (UBool)latin1Contains[c];
295     } else if((uint32_t)c<=0x7ff) {
296         return (UBool)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0);
297     } else if((uint32_t)c<0xd800 || (c>=0xe000 && c<=0xffff)) {
298         int lead=c>>12;
299         uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
300         if(twoBits<=1) {
301             // All 64 code points with the same bits 15..6
302             // are either in the set or not.
303             return (UBool)twoBits;
304         } else {
305             // Look up the code point in its 4k block of code points.
306             return containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]);
307         }
308     } else if((uint32_t)c<=0x10ffff) {
309         // surrogate or supplementary code point
310         return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]);
311     } else {
312         // Out-of-range code points get FALSE, consistent with long-standing
313         // behavior of UnicodeSet::contains(c).
314         return FALSE;
315     }
316 }
317 
318 /*
319  * Check for sufficient length for trail unit for each surrogate pair.
320  * Handle single surrogates as surrogate code points as usual in ICU.
321  */
322 const UChar *
span(const UChar * s,const UChar * limit,USetSpanCondition spanCondition) const323 BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const {
324     UChar c, c2;
325 
326     if(spanCondition) {
327         // span
328         do {
329             c=*s;
330             if(c<=0xff) {
331                 if(!latin1Contains[c]) {
332                     break;
333                 }
334             } else if(c<=0x7ff) {
335                 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
336                     break;
337                 }
338             } else if(c<0xd800 || c>=0xe000) {
339                 int lead=c>>12;
340                 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
341                 if(twoBits<=1) {
342                     // All 64 code points with the same bits 15..6
343                     // are either in the set or not.
344                     if(twoBits==0) {
345                         break;
346                     }
347                 } else {
348                     // Look up the code point in its 4k block of code points.
349                     if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
350                         break;
351                     }
352                 }
353             } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
354                 // surrogate code point
355                 if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
356                     break;
357                 }
358             } else {
359                 // surrogate pair
360                 if(!containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
361                     break;
362                 }
363                 ++s;
364             }
365         } while(++s<limit);
366     } else {
367         // span not
368         do {
369             c=*s;
370             if(c<=0xff) {
371                 if(latin1Contains[c]) {
372                     break;
373                 }
374             } else if(c<=0x7ff) {
375                 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
376                     break;
377                 }
378             } else if(c<0xd800 || c>=0xe000) {
379                 int lead=c>>12;
380                 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
381                 if(twoBits<=1) {
382                     // All 64 code points with the same bits 15..6
383                     // are either in the set or not.
384                     if(twoBits!=0) {
385                         break;
386                     }
387                 } else {
388                     // Look up the code point in its 4k block of code points.
389                     if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
390                         break;
391                     }
392                 }
393             } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
394                 // surrogate code point
395                 if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
396                     break;
397                 }
398             } else {
399                 // surrogate pair
400                 if(containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
401                     break;
402                 }
403                 ++s;
404             }
405         } while(++s<limit);
406     }
407     return s;
408 }
409 
410 /* Symmetrical with span(). */
411 const UChar *
spanBack(const UChar * s,const UChar * limit,USetSpanCondition spanCondition) const412 BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const {
413     UChar c, c2;
414 
415     if(spanCondition) {
416         // span
417         for(;;) {
418             c=*(--limit);
419             if(c<=0xff) {
420                 if(!latin1Contains[c]) {
421                     break;
422                 }
423             } else if(c<=0x7ff) {
424                 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
425                     break;
426                 }
427             } else if(c<0xd800 || c>=0xe000) {
428                 int lead=c>>12;
429                 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
430                 if(twoBits<=1) {
431                     // All 64 code points with the same bits 15..6
432                     // are either in the set or not.
433                     if(twoBits==0) {
434                         break;
435                     }
436                 } else {
437                     // Look up the code point in its 4k block of code points.
438                     if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
439                         break;
440                     }
441                 }
442             } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
443                 // surrogate code point
444                 if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
445                     break;
446                 }
447             } else {
448                 // surrogate pair
449                 if(!containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
450                     break;
451                 }
452                 --limit;
453             }
454             if(s==limit) {
455                 return s;
456             }
457         }
458     } else {
459         // span not
460         for(;;) {
461             c=*(--limit);
462             if(c<=0xff) {
463                 if(latin1Contains[c]) {
464                     break;
465                 }
466             } else if(c<=0x7ff) {
467                 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
468                     break;
469                 }
470             } else if(c<0xd800 || c>=0xe000) {
471                 int lead=c>>12;
472                 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
473                 if(twoBits<=1) {
474                     // All 64 code points with the same bits 15..6
475                     // are either in the set or not.
476                     if(twoBits!=0) {
477                         break;
478                     }
479                 } else {
480                     // Look up the code point in its 4k block of code points.
481                     if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
482                         break;
483                     }
484                 }
485             } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
486                 // surrogate code point
487                 if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
488                     break;
489                 }
490             } else {
491                 // surrogate pair
492                 if(containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
493                     break;
494                 }
495                 --limit;
496             }
497             if(s==limit) {
498                 return s;
499             }
500         }
501     }
502     return limit+1;
503 }
504 
505 /*
506  * Precheck for sufficient trail bytes at end of string only once per span.
507  * Check validity.
508  */
509 const uint8_t *
spanUTF8(const uint8_t * s,int32_t length,USetSpanCondition spanCondition) const510 BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
511     const uint8_t *limit=s+length;
512     uint8_t b=*s;
513     if(U8_IS_SINGLE(b)) {
514         // Initial all-ASCII span.
515         if(spanCondition) {
516             do {
517                 if(!latin1Contains[b] || ++s==limit) {
518                     return s;
519                 }
520                 b=*s;
521             } while(U8_IS_SINGLE(b));
522         } else {
523             do {
524                 if(latin1Contains[b] || ++s==limit) {
525                     return s;
526                 }
527                 b=*s;
528             } while(U8_IS_SINGLE(b));
529         }
530         length=(int32_t)(limit-s);
531     }
532 
533     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
534         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
535     }
536 
537     const uint8_t *limit0=limit;
538 
539     /*
540      * Make sure that the last 1/2/3/4-byte sequence before limit is complete
541      * or runs into a lead byte.
542      * In the span loop compare s with limit only once
543      * per multi-byte character.
544      *
545      * Give a trailing illegal sequence the same value as the result of contains(FFFD),
546      * including it if that is part of the span, otherwise set limit0 to before
547      * the truncated sequence.
548      */
549     b=*(limit-1);
550     if((int8_t)b<0) {
551         // b>=0x80: lead or trail byte
552         if(b<0xc0) {
553             // single trail byte, check for preceding 3- or 4-byte lead byte
554             if(length>=2 && (b=*(limit-2))>=0xe0) {
555                 limit-=2;
556                 if(containsFFFD!=spanCondition) {
557                     limit0=limit;
558                 }
559             } else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) {
560                 // 4-byte lead byte with only two trail bytes
561                 limit-=3;
562                 if(containsFFFD!=spanCondition) {
563                     limit0=limit;
564                 }
565             }
566         } else {
567             // lead byte with no trail bytes
568             --limit;
569             if(containsFFFD!=spanCondition) {
570                 limit0=limit;
571             }
572         }
573     }
574 
575     uint8_t t1, t2, t3;
576 
577     while(s<limit) {
578         b=*s;
579         if(U8_IS_SINGLE(b)) {
580             // ASCII
581             if(spanCondition) {
582                 do {
583                     if(!latin1Contains[b]) {
584                         return s;
585                     } else if(++s==limit) {
586                         return limit0;
587                     }
588                     b=*s;
589                 } while(U8_IS_SINGLE(b));
590             } else {
591                 do {
592                     if(latin1Contains[b]) {
593                         return s;
594                     } else if(++s==limit) {
595                         return limit0;
596                     }
597                     b=*s;
598                 } while(U8_IS_SINGLE(b));
599             }
600         }
601         ++s;  // Advance past the lead byte.
602         if(b>=0xe0) {
603             if(b<0xf0) {
604                 if( /* handle U+0000..U+FFFF inline */
605                     (t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
606                     (t2=(uint8_t)(s[1]-0x80)) <= 0x3f
607                 ) {
608                     b&=0xf;
609                     uint32_t twoBits=(bmpBlockBits[t1]>>b)&0x10001;
610                     if(twoBits<=1) {
611                         // All 64 code points with this lead byte and middle trail byte
612                         // are either in the set or not.
613                         if(twoBits!=(uint32_t)spanCondition) {
614                             return s-1;
615                         }
616                     } else {
617                         // Look up the code point in its 4k block of code points.
618                         UChar32 c=(b<<12)|(t1<<6)|t2;
619                         if(containsSlow(c, list4kStarts[b], list4kStarts[b+1]) != spanCondition) {
620                             return s-1;
621                         }
622                     }
623                     s+=2;
624                     continue;
625                 }
626             } else if( /* handle U+10000..U+10FFFF inline */
627                 (t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
628                 (t2=(uint8_t)(s[1]-0x80)) <= 0x3f &&
629                 (t3=(uint8_t)(s[2]-0x80)) <= 0x3f
630             ) {
631                 // Give an illegal sequence the same value as the result of contains(FFFD).
632                 UChar32 c=((UChar32)(b-0xf0)<<18)|((UChar32)t1<<12)|(t2<<6)|t3;
633                 if( (   (0x10000<=c && c<=0x10ffff) ?
634                             containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) :
635                             containsFFFD
636                     ) != spanCondition
637                 ) {
638                     return s-1;
639                 }
640                 s+=3;
641                 continue;
642             }
643         } else {
644             if( /* handle U+0000..U+07FF inline */
645                 b>=0xc0 &&
646                 (t1=(uint8_t)(*s-0x80)) <= 0x3f
647             ) {
648                 if((USetSpanCondition)((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) {
649                     return s-1;
650                 }
651                 ++s;
652                 continue;
653             }
654         }
655 
656         // Give an illegal sequence the same value as the result of contains(FFFD).
657         // Handle each byte of an illegal sequence separately to simplify the code;
658         // no need to optimize error handling.
659         if(containsFFFD!=spanCondition) {
660             return s-1;
661         }
662     }
663 
664     return limit0;
665 }
666 
667 /*
668  * While going backwards through UTF-8 optimize only for ASCII.
669  * Unlike UTF-16, UTF-8 is not forward-backward symmetrical, that is, it is not
670  * possible to tell from the last byte in a multi-byte sequence how many
671  * preceding bytes there should be. Therefore, going backwards through UTF-8
672  * is much harder than going forward.
673  */
674 int32_t
spanBackUTF8(const uint8_t * s,int32_t length,USetSpanCondition spanCondition) const675 BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
676     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
677         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
678     }
679 
680     uint8_t b;
681 
682     do {
683         b=s[--length];
684         if(U8_IS_SINGLE(b)) {
685             // ASCII sub-span
686             if(spanCondition) {
687                 do {
688                     if(!latin1Contains[b]) {
689                         return length+1;
690                     } else if(length==0) {
691                         return 0;
692                     }
693                     b=s[--length];
694                 } while(U8_IS_SINGLE(b));
695             } else {
696                 do {
697                     if(latin1Contains[b]) {
698                         return length+1;
699                     } else if(length==0) {
700                         return 0;
701                     }
702                     b=s[--length];
703                 } while(U8_IS_SINGLE(b));
704             }
705         }
706 
707         int32_t prev=length;
708         UChar32 c;
709         // trail byte: collect a multi-byte character
710         // (or  lead byte in last-trail position)
711         c=utf8_prevCharSafeBody(s, 0, &length, b, -3);
712         // c is a valid code point, not ASCII, not a surrogate
713         if(c<=0x7ff) {
714             if((USetSpanCondition)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) != spanCondition) {
715                 return prev+1;
716             }
717         } else if(c<=0xffff) {
718             int lead=c>>12;
719             uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
720             if(twoBits<=1) {
721                 // All 64 code points with the same bits 15..6
722                 // are either in the set or not.
723                 if(twoBits!=(uint32_t)spanCondition) {
724                     return prev+1;
725                 }
726             } else {
727                 // Look up the code point in its 4k block of code points.
728                 if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]) != spanCondition) {
729                     return prev+1;
730                 }
731             }
732         } else {
733             if(containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) != spanCondition) {
734                 return prev+1;
735             }
736         }
737     } while(length>0);
738     return 0;
739 }
740 
741 U_NAMESPACE_END
742