1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2002-2012, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  uiter.cpp
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2002jan18
16 *   created by: Markus W. Scherer
17 */
18 
19 #include "unicode/utypes.h"
20 #include "unicode/ustring.h"
21 #include "unicode/chariter.h"
22 #include "unicode/rep.h"
23 #include "unicode/uiter.h"
24 #include "unicode/utf.h"
25 #include "unicode/utf8.h"
26 #include "unicode/utf16.h"
27 #include "cstring.h"
28 
29 U_NAMESPACE_USE
30 
/* Evaluates to true if the integer expression n has its lowest bit clear. */
#define IS_EVEN(n) (((n)&1)==0)
/*
 * Evaluates to true if the address produced by the pointer expression p is even.
 * The argument is parenthesized before the cast so that an expression such as
 * "base+1" is evaluated as pointer arithmetic first and only then converted
 * to an integer.
 */
#define IS_POINTER_EVEN(p) IS_EVEN((size_t)(p))
33 
34 U_CDECL_BEGIN
35 
36 /* No-Op UCharIterator implementation for illegal input --------------------- */
37 
38 static int32_t U_CALLCONV
noopGetIndex(UCharIterator *,UCharIteratorOrigin)39 noopGetIndex(UCharIterator * /*iter*/, UCharIteratorOrigin /*origin*/) {
40     return 0;
41 }
42 
43 static int32_t U_CALLCONV
noopMove(UCharIterator *,int32_t,UCharIteratorOrigin)44 noopMove(UCharIterator * /*iter*/, int32_t /*delta*/, UCharIteratorOrigin /*origin*/) {
45     return 0;
46 }
47 
48 static UBool U_CALLCONV
noopHasNext(UCharIterator *)49 noopHasNext(UCharIterator * /*iter*/) {
50     return FALSE;
51 }
52 
53 static UChar32 U_CALLCONV
noopCurrent(UCharIterator *)54 noopCurrent(UCharIterator * /*iter*/) {
55     return U_SENTINEL;
56 }
57 
58 static uint32_t U_CALLCONV
noopGetState(const UCharIterator *)59 noopGetState(const UCharIterator * /*iter*/) {
60     return UITER_NO_STATE;
61 }
62 
63 static void U_CALLCONV
noopSetState(UCharIterator *,uint32_t,UErrorCode * pErrorCode)64 noopSetState(UCharIterator * /*iter*/, uint32_t /*state*/, UErrorCode *pErrorCode) {
65     *pErrorCode=U_UNSUPPORTED_ERROR;
66 }
67 
68 static const UCharIterator noopIterator={
69     0, 0, 0, 0, 0, 0,
70     noopGetIndex,
71     noopMove,
72     noopHasNext,
73     noopHasNext,
74     noopCurrent,
75     noopCurrent,
76     noopCurrent,
77     NULL,
78     noopGetState,
79     noopSetState
80 };
81 
82 /* UCharIterator implementation for simple strings -------------------------- */
83 
84 /*
85  * This is an implementation of a code unit (UChar) iterator
86  * for UChar * strings.
87  *
88  * The UCharIterator.context field holds a pointer to the string.
89  */
90 
91 static int32_t U_CALLCONV
stringIteratorGetIndex(UCharIterator * iter,UCharIteratorOrigin origin)92 stringIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
93     switch(origin) {
94     case UITER_ZERO:
95         return 0;
96     case UITER_START:
97         return iter->start;
98     case UITER_CURRENT:
99         return iter->index;
100     case UITER_LIMIT:
101         return iter->limit;
102     case UITER_LENGTH:
103         return iter->length;
104     default:
105         /* not a valid origin */
106         /* Should never get here! */
107         return -1;
108     }
109 }
110 
111 static int32_t U_CALLCONV
stringIteratorMove(UCharIterator * iter,int32_t delta,UCharIteratorOrigin origin)112 stringIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) {
113     int32_t pos;
114 
115     switch(origin) {
116     case UITER_ZERO:
117         pos=delta;
118         break;
119     case UITER_START:
120         pos=iter->start+delta;
121         break;
122     case UITER_CURRENT:
123         pos=iter->index+delta;
124         break;
125     case UITER_LIMIT:
126         pos=iter->limit+delta;
127         break;
128     case UITER_LENGTH:
129         pos=iter->length+delta;
130         break;
131     default:
132         return -1;  /* Error */
133     }
134 
135     if(pos<iter->start) {
136         pos=iter->start;
137     } else if(pos>iter->limit) {
138         pos=iter->limit;
139     }
140 
141     return iter->index=pos;
142 }
143 
144 static UBool U_CALLCONV
stringIteratorHasNext(UCharIterator * iter)145 stringIteratorHasNext(UCharIterator *iter) {
146     return iter->index<iter->limit;
147 }
148 
149 static UBool U_CALLCONV
stringIteratorHasPrevious(UCharIterator * iter)150 stringIteratorHasPrevious(UCharIterator *iter) {
151     return iter->index>iter->start;
152 }
153 
154 static UChar32 U_CALLCONV
stringIteratorCurrent(UCharIterator * iter)155 stringIteratorCurrent(UCharIterator *iter) {
156     if(iter->index<iter->limit) {
157         return ((const UChar *)(iter->context))[iter->index];
158     } else {
159         return U_SENTINEL;
160     }
161 }
162 
163 static UChar32 U_CALLCONV
stringIteratorNext(UCharIterator * iter)164 stringIteratorNext(UCharIterator *iter) {
165     if(iter->index<iter->limit) {
166         return ((const UChar *)(iter->context))[iter->index++];
167     } else {
168         return U_SENTINEL;
169     }
170 }
171 
172 static UChar32 U_CALLCONV
stringIteratorPrevious(UCharIterator * iter)173 stringIteratorPrevious(UCharIterator *iter) {
174     if(iter->index>iter->start) {
175         return ((const UChar *)(iter->context))[--iter->index];
176     } else {
177         return U_SENTINEL;
178     }
179 }
180 
181 static uint32_t U_CALLCONV
stringIteratorGetState(const UCharIterator * iter)182 stringIteratorGetState(const UCharIterator *iter) {
183     return (uint32_t)iter->index;
184 }
185 
186 static void U_CALLCONV
stringIteratorSetState(UCharIterator * iter,uint32_t state,UErrorCode * pErrorCode)187 stringIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
188     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
189         /* do nothing */
190     } else if(iter==NULL) {
191         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
192     } else if((int32_t)state<iter->start || iter->limit<(int32_t)state) {
193         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
194     } else {
195         iter->index=(int32_t)state;
196     }
197 }
198 
199 static const UCharIterator stringIterator={
200     0, 0, 0, 0, 0, 0,
201     stringIteratorGetIndex,
202     stringIteratorMove,
203     stringIteratorHasNext,
204     stringIteratorHasPrevious,
205     stringIteratorCurrent,
206     stringIteratorNext,
207     stringIteratorPrevious,
208     NULL,
209     stringIteratorGetState,
210     stringIteratorSetState
211 };
212 
213 U_CAPI void U_EXPORT2
uiter_setString(UCharIterator * iter,const UChar * s,int32_t length)214 uiter_setString(UCharIterator *iter, const UChar *s, int32_t length) {
215     if(iter!=0) {
216         if(s!=0 && length>=-1) {
217             *iter=stringIterator;
218             iter->context=s;
219             if(length>=0) {
220                 iter->length=length;
221             } else {
222                 iter->length=u_strlen(s);
223             }
224             iter->limit=iter->length;
225         } else {
226             *iter=noopIterator;
227         }
228     }
229 }
230 
231 /* UCharIterator implementation for UTF-16BE strings ------------------------ */
232 
233 /*
234  * This is an implementation of a code unit (UChar) iterator
235  * for UTF-16BE strings, i.e., strings in byte-vectors where
236  * each UChar is stored as a big-endian pair of bytes.
237  *
238  * The UCharIterator.context field holds a pointer to the string.
239  * Everything works just like with a normal UChar iterator (uiter_setString),
240  * except that UChars are assembled from byte pairs.
241  */
242 
243 /* internal helper function */
244 static inline UChar32
utf16BEIteratorGet(UCharIterator * iter,int32_t index)245 utf16BEIteratorGet(UCharIterator *iter, int32_t index) {
246     const uint8_t *p=(const uint8_t *)iter->context;
247     return ((UChar)p[2*index]<<8)|(UChar)p[2*index+1];
248 }
249 
250 static UChar32 U_CALLCONV
utf16BEIteratorCurrent(UCharIterator * iter)251 utf16BEIteratorCurrent(UCharIterator *iter) {
252     int32_t index;
253 
254     if((index=iter->index)<iter->limit) {
255         return utf16BEIteratorGet(iter, index);
256     } else {
257         return U_SENTINEL;
258     }
259 }
260 
261 static UChar32 U_CALLCONV
utf16BEIteratorNext(UCharIterator * iter)262 utf16BEIteratorNext(UCharIterator *iter) {
263     int32_t index;
264 
265     if((index=iter->index)<iter->limit) {
266         iter->index=index+1;
267         return utf16BEIteratorGet(iter, index);
268     } else {
269         return U_SENTINEL;
270     }
271 }
272 
273 static UChar32 U_CALLCONV
utf16BEIteratorPrevious(UCharIterator * iter)274 utf16BEIteratorPrevious(UCharIterator *iter) {
275     int32_t index;
276 
277     if((index=iter->index)>iter->start) {
278         iter->index=--index;
279         return utf16BEIteratorGet(iter, index);
280     } else {
281         return U_SENTINEL;
282     }
283 }
284 
285 static const UCharIterator utf16BEIterator={
286     0, 0, 0, 0, 0, 0,
287     stringIteratorGetIndex,
288     stringIteratorMove,
289     stringIteratorHasNext,
290     stringIteratorHasPrevious,
291     utf16BEIteratorCurrent,
292     utf16BEIteratorNext,
293     utf16BEIteratorPrevious,
294     NULL,
295     stringIteratorGetState,
296     stringIteratorSetState
297 };
298 
299 /*
300  * Count the number of UChars in a UTF-16BE string before a terminating UChar NUL,
301  * i.e., before a pair of 0 bytes where the first 0 byte is at an even
302  * offset from s.
303  */
304 static int32_t
utf16BE_strlen(const char * s)305 utf16BE_strlen(const char *s) {
306     if(IS_POINTER_EVEN(s)) {
307         /*
308          * even-aligned, call u_strlen(s)
309          * we are probably on a little-endian machine, but searching for UChar NUL
310          * does not care about endianness
311          */
312         return u_strlen((const UChar *)s);
313     } else {
314         /* odd-aligned, search for pair of 0 bytes */
315         const char *p=s;
316 
317         while(!(*p==0 && p[1]==0)) {
318             p+=2;
319         }
320         return (int32_t)((p-s)/2);
321     }
322 }
323 
324 U_CAPI void U_EXPORT2
uiter_setUTF16BE(UCharIterator * iter,const char * s,int32_t length)325 uiter_setUTF16BE(UCharIterator *iter, const char *s, int32_t length) {
326     if(iter!=NULL) {
327         /* allow only even-length strings (the input length counts bytes) */
328         if(s!=NULL && (length==-1 || (length>=0 && IS_EVEN(length)))) {
329             /* length/=2, except that >>=1 also works for -1 (-1/2==0, -1>>1==-1) */
330             length>>=1;
331 
332             if(U_IS_BIG_ENDIAN && IS_POINTER_EVEN(s)) {
333                 /* big-endian machine and 2-aligned UTF-16BE string: use normal UChar iterator */
334                 uiter_setString(iter, (const UChar *)s, length);
335                 return;
336             }
337 
338             *iter=utf16BEIterator;
339             iter->context=s;
340             if(length>=0) {
341                 iter->length=length;
342             } else {
343                 iter->length=utf16BE_strlen(s);
344             }
345             iter->limit=iter->length;
346         } else {
347             *iter=noopIterator;
348         }
349     }
350 }
351 
352 /* UCharIterator wrapper around CharacterIterator --------------------------- */
353 
354 /*
355  * This is wrapper code around a C++ CharacterIterator to
356  * look like a C UCharIterator.
357  *
358  * The UCharIterator.context field holds a pointer to the CharacterIterator.
359  */
360 
361 static int32_t U_CALLCONV
characterIteratorGetIndex(UCharIterator * iter,UCharIteratorOrigin origin)362 characterIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
363     switch(origin) {
364     case UITER_ZERO:
365         return 0;
366     case UITER_START:
367         return ((CharacterIterator *)(iter->context))->startIndex();
368     case UITER_CURRENT:
369         return ((CharacterIterator *)(iter->context))->getIndex();
370     case UITER_LIMIT:
371         return ((CharacterIterator *)(iter->context))->endIndex();
372     case UITER_LENGTH:
373         return ((CharacterIterator *)(iter->context))->getLength();
374     default:
375         /* not a valid origin */
376         /* Should never get here! */
377         return -1;
378     }
379 }
380 
381 static int32_t U_CALLCONV
characterIteratorMove(UCharIterator * iter,int32_t delta,UCharIteratorOrigin origin)382 characterIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) {
383     switch(origin) {
384     case UITER_ZERO:
385         ((CharacterIterator *)(iter->context))->setIndex(delta);
386         return ((CharacterIterator *)(iter->context))->getIndex();
387     case UITER_START:
388     case UITER_CURRENT:
389     case UITER_LIMIT:
390         return ((CharacterIterator *)(iter->context))->move(delta, (CharacterIterator::EOrigin)origin);
391     case UITER_LENGTH:
392         ((CharacterIterator *)(iter->context))->setIndex(((CharacterIterator *)(iter->context))->getLength()+delta);
393         return ((CharacterIterator *)(iter->context))->getIndex();
394     default:
395         /* not a valid origin */
396         /* Should never get here! */
397         return -1;
398     }
399 }
400 
401 static UBool U_CALLCONV
characterIteratorHasNext(UCharIterator * iter)402 characterIteratorHasNext(UCharIterator *iter) {
403     return ((CharacterIterator *)(iter->context))->hasNext();
404 }
405 
406 static UBool U_CALLCONV
characterIteratorHasPrevious(UCharIterator * iter)407 characterIteratorHasPrevious(UCharIterator *iter) {
408     return ((CharacterIterator *)(iter->context))->hasPrevious();
409 }
410 
411 static UChar32 U_CALLCONV
characterIteratorCurrent(UCharIterator * iter)412 characterIteratorCurrent(UCharIterator *iter) {
413     UChar32 c;
414 
415     c=((CharacterIterator *)(iter->context))->current();
416     if(c!=0xffff || ((CharacterIterator *)(iter->context))->hasNext()) {
417         return c;
418     } else {
419         return U_SENTINEL;
420     }
421 }
422 
423 static UChar32 U_CALLCONV
characterIteratorNext(UCharIterator * iter)424 characterIteratorNext(UCharIterator *iter) {
425     if(((CharacterIterator *)(iter->context))->hasNext()) {
426         return ((CharacterIterator *)(iter->context))->nextPostInc();
427     } else {
428         return U_SENTINEL;
429     }
430 }
431 
432 static UChar32 U_CALLCONV
characterIteratorPrevious(UCharIterator * iter)433 characterIteratorPrevious(UCharIterator *iter) {
434     if(((CharacterIterator *)(iter->context))->hasPrevious()) {
435         return ((CharacterIterator *)(iter->context))->previous();
436     } else {
437         return U_SENTINEL;
438     }
439 }
440 
441 static uint32_t U_CALLCONV
characterIteratorGetState(const UCharIterator * iter)442 characterIteratorGetState(const UCharIterator *iter) {
443     return ((CharacterIterator *)(iter->context))->getIndex();
444 }
445 
446 static void U_CALLCONV
characterIteratorSetState(UCharIterator * iter,uint32_t state,UErrorCode * pErrorCode)447 characterIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
448     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
449         /* do nothing */
450     } else if(iter==NULL || iter->context==NULL) {
451         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
452     } else if((int32_t)state<((CharacterIterator *)(iter->context))->startIndex() || ((CharacterIterator *)(iter->context))->endIndex()<(int32_t)state) {
453         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
454     } else {
455         ((CharacterIterator *)(iter->context))->setIndex((int32_t)state);
456     }
457 }
458 
459 static const UCharIterator characterIteratorWrapper={
460     0, 0, 0, 0, 0, 0,
461     characterIteratorGetIndex,
462     characterIteratorMove,
463     characterIteratorHasNext,
464     characterIteratorHasPrevious,
465     characterIteratorCurrent,
466     characterIteratorNext,
467     characterIteratorPrevious,
468     NULL,
469     characterIteratorGetState,
470     characterIteratorSetState
471 };
472 
473 U_CAPI void U_EXPORT2
uiter_setCharacterIterator(UCharIterator * iter,CharacterIterator * charIter)474 uiter_setCharacterIterator(UCharIterator *iter, CharacterIterator *charIter) {
475     if(iter!=0) {
476         if(charIter!=0) {
477             *iter=characterIteratorWrapper;
478             iter->context=charIter;
479         } else {
480             *iter=noopIterator;
481         }
482     }
483 }
484 
485 /* UCharIterator wrapper around Replaceable --------------------------------- */
486 
487 /*
488  * This is an implementation of a code unit (UChar) iterator
489  * based on a Replaceable object.
490  *
491  * The UCharIterator.context field holds a pointer to the Replaceable.
492  * UCharIterator.length and UCharIterator.index hold Replaceable.length()
493  * and the iteration index.
494  */
495 
496 static UChar32 U_CALLCONV
replaceableIteratorCurrent(UCharIterator * iter)497 replaceableIteratorCurrent(UCharIterator *iter) {
498     if(iter->index<iter->limit) {
499         return ((Replaceable *)(iter->context))->charAt(iter->index);
500     } else {
501         return U_SENTINEL;
502     }
503 }
504 
505 static UChar32 U_CALLCONV
replaceableIteratorNext(UCharIterator * iter)506 replaceableIteratorNext(UCharIterator *iter) {
507     if(iter->index<iter->limit) {
508         return ((Replaceable *)(iter->context))->charAt(iter->index++);
509     } else {
510         return U_SENTINEL;
511     }
512 }
513 
514 static UChar32 U_CALLCONV
replaceableIteratorPrevious(UCharIterator * iter)515 replaceableIteratorPrevious(UCharIterator *iter) {
516     if(iter->index>iter->start) {
517         return ((Replaceable *)(iter->context))->charAt(--iter->index);
518     } else {
519         return U_SENTINEL;
520     }
521 }
522 
523 static const UCharIterator replaceableIterator={
524     0, 0, 0, 0, 0, 0,
525     stringIteratorGetIndex,
526     stringIteratorMove,
527     stringIteratorHasNext,
528     stringIteratorHasPrevious,
529     replaceableIteratorCurrent,
530     replaceableIteratorNext,
531     replaceableIteratorPrevious,
532     NULL,
533     stringIteratorGetState,
534     stringIteratorSetState
535 };
536 
537 U_CAPI void U_EXPORT2
uiter_setReplaceable(UCharIterator * iter,const Replaceable * rep)538 uiter_setReplaceable(UCharIterator *iter, const Replaceable *rep) {
539     if(iter!=0) {
540         if(rep!=0) {
541             *iter=replaceableIterator;
542             iter->context=rep;
543             iter->limit=iter->length=rep->length();
544         } else {
545             *iter=noopIterator;
546         }
547     }
548 }
549 
550 /* UCharIterator implementation for UTF-8 strings --------------------------- */
551 
/*
 * Possible extension, probably necessary only for an implementation that
 * supports arbitrary converters:
 * Maintain a buffer (ring buffer?) for a piece of converted 16-bit text.
 * This would require turning reservedFn into a close function and
 * introducing a uiter_close(iter).
 */
559 
/*
 * NOTE(review): not referenced anywhere in the visible part of this file;
 * presumably the capacity of a converter text buffer -- confirm whether it
 * is still needed.
 */
#define UITER_CNV_CAPACITY 16
561 
562 /*
563  * Minimal implementation:
564  * Maintain a single-UChar buffer for an additional surrogate.
565  * The caller must not modify start and limit because they are used internally.
566  *
567  * Use UCharIterator fields as follows:
568  *   context        pointer to UTF-8 string
569  *   length         UTF-16 length of the string; -1 until lazy evaluation
570  *   start          current UTF-8 index
571  *   index          current UTF-16 index; may be -1="unknown" after setState()
572  *   limit          UTF-8 length of the string
573  *   reservedField  supplementary code point
574  *
575  * Since UCharIterator delivers 16-bit code units, the iteration can be
576  * currently in the middle of the byte sequence for a supplementary code point.
577  * In this case, reservedField will contain that code point and start will
578  * point to after the corresponding byte sequence. The UTF-16 index will be
579  * one less than what it would otherwise be corresponding to the UTF-8 index.
580  * Otherwise, reservedField will be 0.
581  */
582 
583 /*
584  * Possible optimization for NUL-terminated UTF-8 and UTF-16 strings:
585  * Add implementations that do not call strlen() for iteration but check for NUL.
586  */
587 
588 static int32_t U_CALLCONV
utf8IteratorGetIndex(UCharIterator * iter,UCharIteratorOrigin origin)589 utf8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
590     switch(origin) {
591     case UITER_ZERO:
592     case UITER_START:
593         return 0;
594     case UITER_CURRENT:
595         if(iter->index<0) {
596             /* the current UTF-16 index is unknown after setState(), count from the beginning */
597             const uint8_t *s;
598             UChar32 c;
599             int32_t i, limit, index;
600 
601             s=(const uint8_t *)iter->context;
602             i=index=0;
603             limit=iter->start; /* count up to the UTF-8 index */
604             while(i<limit) {
605                 U8_NEXT_OR_FFFD(s, i, limit, c);
606                 index+=U16_LENGTH(c);
607             }
608 
609             iter->start=i; /* just in case setState() did not get us to a code point boundary */
610             if(i==iter->limit) {
611                 iter->length=index; /* in case it was <0 or wrong */
612             }
613             if(iter->reservedField!=0) {
614                 --index; /* we are in the middle of a supplementary code point */
615             }
616             iter->index=index;
617         }
618         return iter->index;
619     case UITER_LIMIT:
620     case UITER_LENGTH:
621         if(iter->length<0) {
622             const uint8_t *s;
623             UChar32 c;
624             int32_t i, limit, length;
625 
626             s=(const uint8_t *)iter->context;
627             if(iter->index<0) {
628                 /*
629                  * the current UTF-16 index is unknown after setState(),
630                  * we must first count from the beginning to here
631                  */
632                 i=length=0;
633                 limit=iter->start;
634 
635                 /* count from the beginning to the current index */
636                 while(i<limit) {
637                     U8_NEXT_OR_FFFD(s, i, limit, c);
638                     length+=U16_LENGTH(c);
639                 }
640 
641                 /* assume i==limit==iter->start, set the UTF-16 index */
642                 iter->start=i; /* just in case setState() did not get us to a code point boundary */
643                 iter->index= iter->reservedField!=0 ? length-1 : length;
644             } else {
645                 i=iter->start;
646                 length=iter->index;
647                 if(iter->reservedField!=0) {
648                     ++length;
649                 }
650             }
651 
652             /* count from the current index to the end */
653             limit=iter->limit;
654             while(i<limit) {
655                 U8_NEXT_OR_FFFD(s, i, limit, c);
656                 length+=U16_LENGTH(c);
657             }
658             iter->length=length;
659         }
660         return iter->length;
661     default:
662         /* not a valid origin */
663         /* Should never get here! */
664         return -1;
665     }
666 }
667 
668 static int32_t U_CALLCONV
utf8IteratorMove(UCharIterator * iter,int32_t delta,UCharIteratorOrigin origin)669 utf8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) {
670     const uint8_t *s;
671     UChar32 c;
672     int32_t pos; /* requested UTF-16 index */
673     int32_t i; /* UTF-8 index */
674     UBool havePos;
675 
676     /* calculate the requested UTF-16 index */
677     switch(origin) {
678     case UITER_ZERO:
679     case UITER_START:
680         pos=delta;
681         havePos=TRUE;
682         /* iter->index<0 (unknown) is possible */
683         break;
684     case UITER_CURRENT:
685         if(iter->index>=0) {
686             pos=iter->index+delta;
687             havePos=TRUE;
688         } else {
689             /* the current UTF-16 index is unknown after setState(), use only delta */
690             pos=0;
691             havePos=FALSE;
692         }
693         break;
694     case UITER_LIMIT:
695     case UITER_LENGTH:
696         if(iter->length>=0) {
697             pos=iter->length+delta;
698             havePos=TRUE;
699         } else {
700             /* pin to the end, avoid counting the length */
701             iter->index=-1;
702             iter->start=iter->limit;
703             iter->reservedField=0;
704             if(delta>=0) {
705                 return UITER_UNKNOWN_INDEX;
706             } else {
707                 /* the current UTF-16 index is unknown, use only delta */
708                 pos=0;
709                 havePos=FALSE;
710             }
711         }
712         break;
713     default:
714         return -1;  /* Error */
715     }
716 
717     if(havePos) {
718         /* shortcuts: pinning to the edges of the string */
719         if(pos<=0) {
720             iter->index=iter->start=iter->reservedField=0;
721             return 0;
722         } else if(iter->length>=0 && pos>=iter->length) {
723             iter->index=iter->length;
724             iter->start=iter->limit;
725             iter->reservedField=0;
726             return iter->index;
727         }
728 
729         /* minimize the number of U8_NEXT/PREV operations */
730         if(iter->index<0 || pos<iter->index/2) {
731             /* go forward from the start instead of backward from the current index */
732             iter->index=iter->start=iter->reservedField=0;
733         } else if(iter->length>=0 && (iter->length-pos)<(pos-iter->index)) {
734             /*
735              * if we have the UTF-16 index and length and the new position is
736              * closer to the end than the current index,
737              * then go backward from the end instead of forward from the current index
738              */
739             iter->index=iter->length;
740             iter->start=iter->limit;
741             iter->reservedField=0;
742         }
743 
744         delta=pos-iter->index;
745         if(delta==0) {
746             return iter->index; /* nothing to do */
747         }
748     } else {
749         /* move relative to unknown UTF-16 index */
750         if(delta==0) {
751             return UITER_UNKNOWN_INDEX; /* nothing to do */
752         } else if(-delta>=iter->start) {
753             /* moving backwards by more UChars than there are UTF-8 bytes, pin to 0 */
754             iter->index=iter->start=iter->reservedField=0;
755             return 0;
756         } else if(delta>=(iter->limit-iter->start)) {
757             /* moving forward by more UChars than the remaining UTF-8 bytes, pin to the end */
758             iter->index=iter->length; /* may or may not be <0 (unknown) */
759             iter->start=iter->limit;
760             iter->reservedField=0;
761             return iter->index>=0 ? iter->index : (int32_t)UITER_UNKNOWN_INDEX;
762         }
763     }
764 
765     /* delta!=0 */
766 
767     /* move towards the requested position, pin to the edges of the string */
768     s=(const uint8_t *)iter->context;
769     pos=iter->index; /* could be <0 (unknown) */
770     i=iter->start;
771     if(delta>0) {
772         /* go forward */
773         int32_t limit=iter->limit;
774         if(iter->reservedField!=0) {
775             iter->reservedField=0;
776             ++pos;
777             --delta;
778         }
779         while(delta>0 && i<limit) {
780             U8_NEXT_OR_FFFD(s, i, limit, c);
781             if(c<=0xffff) {
782                 ++pos;
783                 --delta;
784             } else if(delta>=2) {
785                 pos+=2;
786                 delta-=2;
787             } else /* delta==1 */ {
788                 /* stop in the middle of a supplementary code point */
789                 iter->reservedField=c;
790                 ++pos;
791                 break; /* delta=0; */
792             }
793         }
794         if(i==limit) {
795             if(iter->length<0 && iter->index>=0) {
796                 iter->length= iter->reservedField==0 ? pos : pos+1;
797             } else if(iter->index<0 && iter->length>=0) {
798                 iter->index= iter->reservedField==0 ? iter->length : iter->length-1;
799             }
800         }
801     } else /* delta<0 */ {
802         /* go backward */
803         if(iter->reservedField!=0) {
804             iter->reservedField=0;
805             i-=4; /* we stayed behind the supplementary code point; go before it now */
806             --pos;
807             ++delta;
808         }
809         while(delta<0 && i>0) {
810             U8_PREV_OR_FFFD(s, 0, i, c);
811             if(c<=0xffff) {
812                 --pos;
813                 ++delta;
814             } else if(delta<=-2) {
815                 pos-=2;
816                 delta+=2;
817             } else /* delta==-1 */ {
818                 /* stop in the middle of a supplementary code point */
819                 i+=4; /* back to behind this supplementary code point for consistent state */
820                 iter->reservedField=c;
821                 --pos;
822                 break; /* delta=0; */
823             }
824         }
825     }
826 
827     iter->start=i;
828     if(iter->index>=0) {
829         return iter->index=pos;
830     } else {
831         /* we started with index<0 (unknown) so pos is bogus */
832         if(i<=1) {
833             return iter->index=i; /* reached the beginning */
834         } else {
835             /* we still don't know the UTF-16 index */
836             return UITER_UNKNOWN_INDEX;
837         }
838     }
839 }
840 
841 static UBool U_CALLCONV
utf8IteratorHasNext(UCharIterator * iter)842 utf8IteratorHasNext(UCharIterator *iter) {
843     return iter->start<iter->limit || iter->reservedField!=0;
844 }
845 
846 static UBool U_CALLCONV
utf8IteratorHasPrevious(UCharIterator * iter)847 utf8IteratorHasPrevious(UCharIterator *iter) {
848     return iter->start>0;
849 }
850 
851 static UChar32 U_CALLCONV
utf8IteratorCurrent(UCharIterator * iter)852 utf8IteratorCurrent(UCharIterator *iter) {
853     if(iter->reservedField!=0) {
854         return U16_TRAIL(iter->reservedField);
855     } else if(iter->start<iter->limit) {
856         const uint8_t *s=(const uint8_t *)iter->context;
857         UChar32 c;
858         int32_t i=iter->start;
859 
860         U8_NEXT_OR_FFFD(s, i, iter->limit, c);
861         if(c<=0xffff) {
862             return c;
863         } else {
864             return U16_LEAD(c);
865         }
866     } else {
867         return U_SENTINEL;
868     }
869 }
870 
871 static UChar32 U_CALLCONV
utf8IteratorNext(UCharIterator * iter)872 utf8IteratorNext(UCharIterator *iter) {
873     int32_t index;
874 
875     if(iter->reservedField!=0) {
876         UChar trail=U16_TRAIL(iter->reservedField);
877         iter->reservedField=0;
878         if((index=iter->index)>=0) {
879             iter->index=index+1;
880         }
881         return trail;
882     } else if(iter->start<iter->limit) {
883         const uint8_t *s=(const uint8_t *)iter->context;
884         UChar32 c;
885 
886         U8_NEXT_OR_FFFD(s, iter->start, iter->limit, c);
887         if((index=iter->index)>=0) {
888             iter->index=++index;
889             if(iter->length<0 && iter->start==iter->limit) {
890                 iter->length= c<=0xffff ? index : index+1;
891             }
892         } else if(iter->start==iter->limit && iter->length>=0) {
893             iter->index= c<=0xffff ? iter->length : iter->length-1;
894         }
895         if(c<=0xffff) {
896             return c;
897         } else {
898             iter->reservedField=c;
899             return U16_LEAD(c);
900         }
901     } else {
902         return U_SENTINEL;
903     }
904 }
905 
906 static UChar32 U_CALLCONV
utf8IteratorPrevious(UCharIterator * iter)907 utf8IteratorPrevious(UCharIterator *iter) {
908     int32_t index;
909 
910     if(iter->reservedField!=0) {
911         UChar lead=U16_LEAD(iter->reservedField);
912         iter->reservedField=0;
913         iter->start-=4; /* we stayed behind the supplementary code point; go before it now */
914         if((index=iter->index)>0) {
915             iter->index=index-1;
916         }
917         return lead;
918     } else if(iter->start>0) {
919         const uint8_t *s=(const uint8_t *)iter->context;
920         UChar32 c;
921 
922         U8_PREV_OR_FFFD(s, 0, iter->start, c);
923         if((index=iter->index)>0) {
924             iter->index=index-1;
925         } else if(iter->start<=1) {
926             iter->index= c<=0xffff ? iter->start : iter->start+1;
927         }
928         if(c<=0xffff) {
929             return c;
930         } else {
931             iter->start+=4; /* back to behind this supplementary code point for consistent state */
932             iter->reservedField=c;
933             return U16_TRAIL(c);
934         }
935     } else {
936         return U_SENTINEL;
937     }
938 }
939 
940 static uint32_t U_CALLCONV
utf8IteratorGetState(const UCharIterator * iter)941 utf8IteratorGetState(const UCharIterator *iter) {
942     uint32_t state=(uint32_t)(iter->start<<1);
943     if(iter->reservedField!=0) {
944         state|=1;
945     }
946     return state;
947 }
948 
949 static void U_CALLCONV
utf8IteratorSetState(UCharIterator * iter,uint32_t state,UErrorCode * pErrorCode)950 utf8IteratorSetState(UCharIterator *iter,
951                      uint32_t state,
952                      UErrorCode *pErrorCode)
953 {
954     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
955         /* do nothing */
956     } else if(iter==NULL) {
957         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
958     } else if(state==utf8IteratorGetState(iter)) {
959         /* setting to the current state: no-op */
960     } else {
961         int32_t index=(int32_t)(state>>1); /* UTF-8 index */
962         state&=1; /* 1 if in surrogate pair, must be index>=4 */
963 
964         if((state==0 ? index<0 : index<4) || iter->limit<index) {
965             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
966         } else {
967             iter->start=index; /* restore UTF-8 byte index */
968             if(index<=1) {
969                 iter->index=index;
970             } else {
971                 iter->index=-1; /* unknown UTF-16 index */
972             }
973             if(state==0) {
974                 iter->reservedField=0;
975             } else {
976                 /* verified index>=4 above */
977                 UChar32 c;
978                 U8_PREV_OR_FFFD((const uint8_t *)iter->context, 0, index, c);
979                 if(c<=0xffff) {
980                     *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
981                 } else {
982                     iter->reservedField=c;
983                 }
984             }
985         }
986     }
987 }
988 
989 static const UCharIterator utf8Iterator={
990     0, 0, 0, 0, 0, 0,
991     utf8IteratorGetIndex,
992     utf8IteratorMove,
993     utf8IteratorHasNext,
994     utf8IteratorHasPrevious,
995     utf8IteratorCurrent,
996     utf8IteratorNext,
997     utf8IteratorPrevious,
998     NULL,
999     utf8IteratorGetState,
1000     utf8IteratorSetState
1001 };
1002 
1003 U_CAPI void U_EXPORT2
uiter_setUTF8(UCharIterator * iter,const char * s,int32_t length)1004 uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length) {
1005     if(iter!=0) {
1006         if(s!=0 && length>=-1) {
1007             *iter=utf8Iterator;
1008             iter->context=s;
1009             if(length>=0) {
1010                 iter->limit=length;
1011             } else {
1012                 iter->limit=(int32_t)uprv_strlen(s);
1013             }
1014             iter->length= iter->limit<=1 ? iter->limit : -1;
1015         } else {
1016             *iter=noopIterator;
1017         }
1018     }
1019 }
1020 
1021 /* Helper functions --------------------------------------------------------- */
1022 
1023 U_CAPI UChar32 U_EXPORT2
uiter_current32(UCharIterator * iter)1024 uiter_current32(UCharIterator *iter) {
1025     UChar32 c, c2;
1026 
1027     c=iter->current(iter);
1028     if(U16_IS_SURROGATE(c)) {
1029         if(U16_IS_SURROGATE_LEAD(c)) {
1030             /*
1031              * go to the next code unit
1032              * we know that we are not at the limit because c!=U_SENTINEL
1033              */
1034             iter->move(iter, 1, UITER_CURRENT);
1035             if(U16_IS_TRAIL(c2=iter->current(iter))) {
1036                 c=U16_GET_SUPPLEMENTARY(c, c2);
1037             }
1038 
1039             /* undo index movement */
1040             iter->move(iter, -1, UITER_CURRENT);
1041         } else {
1042             if(U16_IS_LEAD(c2=iter->previous(iter))) {
1043                 c=U16_GET_SUPPLEMENTARY(c2, c);
1044             }
1045             if(c2>=0) {
1046                 /* undo index movement */
1047                 iter->move(iter, 1, UITER_CURRENT);
1048             }
1049         }
1050     }
1051     return c;
1052 }
1053 
1054 U_CAPI UChar32 U_EXPORT2
uiter_next32(UCharIterator * iter)1055 uiter_next32(UCharIterator *iter) {
1056     UChar32 c, c2;
1057 
1058     c=iter->next(iter);
1059     if(U16_IS_LEAD(c)) {
1060         if(U16_IS_TRAIL(c2=iter->next(iter))) {
1061             c=U16_GET_SUPPLEMENTARY(c, c2);
1062         } else if(c2>=0) {
1063             /* unmatched first surrogate, undo index movement */
1064             iter->move(iter, -1, UITER_CURRENT);
1065         }
1066     }
1067     return c;
1068 }
1069 
1070 U_CAPI UChar32 U_EXPORT2
uiter_previous32(UCharIterator * iter)1071 uiter_previous32(UCharIterator *iter) {
1072     UChar32 c, c2;
1073 
1074     c=iter->previous(iter);
1075     if(U16_IS_TRAIL(c)) {
1076         if(U16_IS_LEAD(c2=iter->previous(iter))) {
1077             c=U16_GET_SUPPLEMENTARY(c2, c);
1078         } else if(c2>=0) {
1079             /* unmatched second surrogate, undo index movement */
1080             iter->move(iter, 1, UITER_CURRENT);
1081         }
1082     }
1083     return c;
1084 }
1085 
1086 U_CAPI uint32_t U_EXPORT2
uiter_getState(const UCharIterator * iter)1087 uiter_getState(const UCharIterator *iter) {
1088     if(iter==NULL || iter->getState==NULL) {
1089         return UITER_NO_STATE;
1090     } else {
1091         return iter->getState(iter);
1092     }
1093 }
1094 
1095 U_CAPI void U_EXPORT2
uiter_setState(UCharIterator * iter,uint32_t state,UErrorCode * pErrorCode)1096 uiter_setState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
1097     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1098         /* do nothing */
1099     } else if(iter==NULL) {
1100         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1101     } else if(iter->setState==NULL) {
1102         *pErrorCode=U_UNSUPPORTED_ERROR;
1103     } else {
1104         iter->setState(iter, state, pErrorCode);
1105     }
1106 }
1107 
1108 U_CDECL_END
1109