1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 2002-2012, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: uiter.cpp
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2002jan18
16 * created by: Markus W. Scherer
17 */
18
19 #include "unicode/utypes.h"
20 #include "unicode/ustring.h"
21 #include "unicode/chariter.h"
22 #include "unicode/rep.h"
23 #include "unicode/uiter.h"
24 #include "unicode/utf.h"
25 #include "unicode/utf8.h"
26 #include "unicode/utf16.h"
27 #include "cstring.h"
28
29 U_NAMESPACE_USE
30
31 #define IS_EVEN(n) (((n)&1)==0)
32 #define IS_POINTER_EVEN(p) IS_EVEN((size_t)p)
33
34 U_CDECL_BEGIN
35
36 /* No-Op UCharIterator implementation for illegal input --------------------- */
37
38 static int32_t U_CALLCONV
noopGetIndex(UCharIterator *,UCharIteratorOrigin)39 noopGetIndex(UCharIterator * /*iter*/, UCharIteratorOrigin /*origin*/) {
40 return 0;
41 }
42
43 static int32_t U_CALLCONV
noopMove(UCharIterator *,int32_t,UCharIteratorOrigin)44 noopMove(UCharIterator * /*iter*/, int32_t /*delta*/, UCharIteratorOrigin /*origin*/) {
45 return 0;
46 }
47
48 static UBool U_CALLCONV
noopHasNext(UCharIterator *)49 noopHasNext(UCharIterator * /*iter*/) {
50 return FALSE;
51 }
52
53 static UChar32 U_CALLCONV
noopCurrent(UCharIterator *)54 noopCurrent(UCharIterator * /*iter*/) {
55 return U_SENTINEL;
56 }
57
58 static uint32_t U_CALLCONV
noopGetState(const UCharIterator *)59 noopGetState(const UCharIterator * /*iter*/) {
60 return UITER_NO_STATE;
61 }
62
63 static void U_CALLCONV
noopSetState(UCharIterator *,uint32_t,UErrorCode * pErrorCode)64 noopSetState(UCharIterator * /*iter*/, uint32_t /*state*/, UErrorCode *pErrorCode) {
65 *pErrorCode=U_UNSUPPORTED_ERROR;
66 }
67
/*
 * Iterator template for illegal input: all data fields are zero and every
 * callback is one of the no-op functions above.
 * The uiter_setXyz() functions in this file copy it into the caller's
 * iterator when the text source is NULL or the length is invalid.
 */
static const UCharIterator noopIterator={
    0, 0, 0, 0, 0, 0,   /* six leading data fields, all zero */
    noopGetIndex,
    noopMove,
    noopHasNext,        /* slot filled by the ...HasNext functions in the other tables */
    noopHasNext,        /* slot filled by the ...HasPrevious functions in the other tables */
    noopCurrent,        /* slot filled by the ...Current functions in the other tables */
    noopCurrent,        /* slot filled by the ...Next functions in the other tables */
    noopCurrent,        /* slot filled by the ...Previous functions in the other tables */
    NULL,               /* this slot is NULL in every table in this file */
    noopGetState,
    noopSetState
};
81
82 /* UCharIterator implementation for simple strings -------------------------- */
83
84 /*
85 * This is an implementation of a code unit (UChar) iterator
86 * for UChar * strings.
87 *
88 * The UCharIterator.context field holds a pointer to the string.
89 */
90
91 static int32_t U_CALLCONV
stringIteratorGetIndex(UCharIterator * iter,UCharIteratorOrigin origin)92 stringIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
93 switch(origin) {
94 case UITER_ZERO:
95 return 0;
96 case UITER_START:
97 return iter->start;
98 case UITER_CURRENT:
99 return iter->index;
100 case UITER_LIMIT:
101 return iter->limit;
102 case UITER_LENGTH:
103 return iter->length;
104 default:
105 /* not a valid origin */
106 /* Should never get here! */
107 return -1;
108 }
109 }
110
111 static int32_t U_CALLCONV
stringIteratorMove(UCharIterator * iter,int32_t delta,UCharIteratorOrigin origin)112 stringIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) {
113 int32_t pos;
114
115 switch(origin) {
116 case UITER_ZERO:
117 pos=delta;
118 break;
119 case UITER_START:
120 pos=iter->start+delta;
121 break;
122 case UITER_CURRENT:
123 pos=iter->index+delta;
124 break;
125 case UITER_LIMIT:
126 pos=iter->limit+delta;
127 break;
128 case UITER_LENGTH:
129 pos=iter->length+delta;
130 break;
131 default:
132 return -1; /* Error */
133 }
134
135 if(pos<iter->start) {
136 pos=iter->start;
137 } else if(pos>iter->limit) {
138 pos=iter->limit;
139 }
140
141 return iter->index=pos;
142 }
143
144 static UBool U_CALLCONV
stringIteratorHasNext(UCharIterator * iter)145 stringIteratorHasNext(UCharIterator *iter) {
146 return iter->index<iter->limit;
147 }
148
149 static UBool U_CALLCONV
stringIteratorHasPrevious(UCharIterator * iter)150 stringIteratorHasPrevious(UCharIterator *iter) {
151 return iter->index>iter->start;
152 }
153
154 static UChar32 U_CALLCONV
stringIteratorCurrent(UCharIterator * iter)155 stringIteratorCurrent(UCharIterator *iter) {
156 if(iter->index<iter->limit) {
157 return ((const UChar *)(iter->context))[iter->index];
158 } else {
159 return U_SENTINEL;
160 }
161 }
162
163 static UChar32 U_CALLCONV
stringIteratorNext(UCharIterator * iter)164 stringIteratorNext(UCharIterator *iter) {
165 if(iter->index<iter->limit) {
166 return ((const UChar *)(iter->context))[iter->index++];
167 } else {
168 return U_SENTINEL;
169 }
170 }
171
172 static UChar32 U_CALLCONV
stringIteratorPrevious(UCharIterator * iter)173 stringIteratorPrevious(UCharIterator *iter) {
174 if(iter->index>iter->start) {
175 return ((const UChar *)(iter->context))[--iter->index];
176 } else {
177 return U_SENTINEL;
178 }
179 }
180
181 static uint32_t U_CALLCONV
stringIteratorGetState(const UCharIterator * iter)182 stringIteratorGetState(const UCharIterator *iter) {
183 return (uint32_t)iter->index;
184 }
185
186 static void U_CALLCONV
stringIteratorSetState(UCharIterator * iter,uint32_t state,UErrorCode * pErrorCode)187 stringIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
188 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
189 /* do nothing */
190 } else if(iter==NULL) {
191 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
192 } else if((int32_t)state<iter->start || iter->limit<(int32_t)state) {
193 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
194 } else {
195 iter->index=(int32_t)state;
196 }
197 }
198
/*
 * Iterator template for UChar * strings.
 * uiter_setString() copies it and then fills in context, length and limit.
 */
static const UCharIterator stringIterator={
    0, 0, 0, 0, 0, 0,   /* six leading data fields, zero until set by uiter_setString() */
    stringIteratorGetIndex,
    stringIteratorMove,
    stringIteratorHasNext,
    stringIteratorHasPrevious,
    stringIteratorCurrent,
    stringIteratorNext,
    stringIteratorPrevious,
    NULL,               /* this slot is NULL in every table in this file */
    stringIteratorGetState,
    stringIteratorSetState
};
212
213 U_CAPI void U_EXPORT2
uiter_setString(UCharIterator * iter,const UChar * s,int32_t length)214 uiter_setString(UCharIterator *iter, const UChar *s, int32_t length) {
215 if(iter!=0) {
216 if(s!=0 && length>=-1) {
217 *iter=stringIterator;
218 iter->context=s;
219 if(length>=0) {
220 iter->length=length;
221 } else {
222 iter->length=u_strlen(s);
223 }
224 iter->limit=iter->length;
225 } else {
226 *iter=noopIterator;
227 }
228 }
229 }
230
231 /* UCharIterator implementation for UTF-16BE strings ------------------------ */
232
233 /*
234 * This is an implementation of a code unit (UChar) iterator
235 * for UTF-16BE strings, i.e., strings in byte-vectors where
236 * each UChar is stored as a big-endian pair of bytes.
237 *
238 * The UCharIterator.context field holds a pointer to the string.
239 * Everything works just like with a normal UChar iterator (uiter_setString),
240 * except that UChars are assembled from byte pairs.
241 */
242
243 /* internal helper function */
244 static inline UChar32
utf16BEIteratorGet(UCharIterator * iter,int32_t index)245 utf16BEIteratorGet(UCharIterator *iter, int32_t index) {
246 const uint8_t *p=(const uint8_t *)iter->context;
247 return ((UChar)p[2*index]<<8)|(UChar)p[2*index+1];
248 }
249
250 static UChar32 U_CALLCONV
utf16BEIteratorCurrent(UCharIterator * iter)251 utf16BEIteratorCurrent(UCharIterator *iter) {
252 int32_t index;
253
254 if((index=iter->index)<iter->limit) {
255 return utf16BEIteratorGet(iter, index);
256 } else {
257 return U_SENTINEL;
258 }
259 }
260
261 static UChar32 U_CALLCONV
utf16BEIteratorNext(UCharIterator * iter)262 utf16BEIteratorNext(UCharIterator *iter) {
263 int32_t index;
264
265 if((index=iter->index)<iter->limit) {
266 iter->index=index+1;
267 return utf16BEIteratorGet(iter, index);
268 } else {
269 return U_SENTINEL;
270 }
271 }
272
273 static UChar32 U_CALLCONV
utf16BEIteratorPrevious(UCharIterator * iter)274 utf16BEIteratorPrevious(UCharIterator *iter) {
275 int32_t index;
276
277 if((index=iter->index)>iter->start) {
278 iter->index=--index;
279 return utf16BEIteratorGet(iter, index);
280 } else {
281 return U_SENTINEL;
282 }
283 }
284
/*
 * Iterator template for UTF-16BE byte strings.
 * Index bookkeeping reuses the plain string iterator functions; only the
 * three functions that read text are specific to the byte-pair storage.
 */
static const UCharIterator utf16BEIterator={
    0, 0, 0, 0, 0, 0,   /* six leading data fields, zero until set by uiter_setUTF16BE() */
    stringIteratorGetIndex,
    stringIteratorMove,
    stringIteratorHasNext,
    stringIteratorHasPrevious,
    utf16BEIteratorCurrent,
    utf16BEIteratorNext,
    utf16BEIteratorPrevious,
    NULL,               /* this slot is NULL in every table in this file */
    stringIteratorGetState,
    stringIteratorSetState
};
298
299 /*
300 * Count the number of UChars in a UTF-16BE string before a terminating UChar NUL,
301 * i.e., before a pair of 0 bytes where the first 0 byte is at an even
302 * offset from s.
303 */
304 static int32_t
utf16BE_strlen(const char * s)305 utf16BE_strlen(const char *s) {
306 if(IS_POINTER_EVEN(s)) {
307 /*
308 * even-aligned, call u_strlen(s)
309 * we are probably on a little-endian machine, but searching for UChar NUL
310 * does not care about endianness
311 */
312 return u_strlen((const UChar *)s);
313 } else {
314 /* odd-aligned, search for pair of 0 bytes */
315 const char *p=s;
316
317 while(!(*p==0 && p[1]==0)) {
318 p+=2;
319 }
320 return (int32_t)((p-s)/2);
321 }
322 }
323
324 U_CAPI void U_EXPORT2
uiter_setUTF16BE(UCharIterator * iter,const char * s,int32_t length)325 uiter_setUTF16BE(UCharIterator *iter, const char *s, int32_t length) {
326 if(iter!=NULL) {
327 /* allow only even-length strings (the input length counts bytes) */
328 if(s!=NULL && (length==-1 || (length>=0 && IS_EVEN(length)))) {
329 /* length/=2, except that >>=1 also works for -1 (-1/2==0, -1>>1==-1) */
330 length>>=1;
331
332 if(U_IS_BIG_ENDIAN && IS_POINTER_EVEN(s)) {
333 /* big-endian machine and 2-aligned UTF-16BE string: use normal UChar iterator */
334 uiter_setString(iter, (const UChar *)s, length);
335 return;
336 }
337
338 *iter=utf16BEIterator;
339 iter->context=s;
340 if(length>=0) {
341 iter->length=length;
342 } else {
343 iter->length=utf16BE_strlen(s);
344 }
345 iter->limit=iter->length;
346 } else {
347 *iter=noopIterator;
348 }
349 }
350 }
351
352 /* UCharIterator wrapper around CharacterIterator --------------------------- */
353
354 /*
355 * This is wrapper code around a C++ CharacterIterator to
356 * look like a C UCharIterator.
357 *
358 * The UCharIterator.context field holds a pointer to the CharacterIterator.
359 */
360
361 static int32_t U_CALLCONV
characterIteratorGetIndex(UCharIterator * iter,UCharIteratorOrigin origin)362 characterIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
363 switch(origin) {
364 case UITER_ZERO:
365 return 0;
366 case UITER_START:
367 return ((CharacterIterator *)(iter->context))->startIndex();
368 case UITER_CURRENT:
369 return ((CharacterIterator *)(iter->context))->getIndex();
370 case UITER_LIMIT:
371 return ((CharacterIterator *)(iter->context))->endIndex();
372 case UITER_LENGTH:
373 return ((CharacterIterator *)(iter->context))->getLength();
374 default:
375 /* not a valid origin */
376 /* Should never get here! */
377 return -1;
378 }
379 }
380
381 static int32_t U_CALLCONV
characterIteratorMove(UCharIterator * iter,int32_t delta,UCharIteratorOrigin origin)382 characterIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) {
383 switch(origin) {
384 case UITER_ZERO:
385 ((CharacterIterator *)(iter->context))->setIndex(delta);
386 return ((CharacterIterator *)(iter->context))->getIndex();
387 case UITER_START:
388 case UITER_CURRENT:
389 case UITER_LIMIT:
390 return ((CharacterIterator *)(iter->context))->move(delta, (CharacterIterator::EOrigin)origin);
391 case UITER_LENGTH:
392 ((CharacterIterator *)(iter->context))->setIndex(((CharacterIterator *)(iter->context))->getLength()+delta);
393 return ((CharacterIterator *)(iter->context))->getIndex();
394 default:
395 /* not a valid origin */
396 /* Should never get here! */
397 return -1;
398 }
399 }
400
401 static UBool U_CALLCONV
characterIteratorHasNext(UCharIterator * iter)402 characterIteratorHasNext(UCharIterator *iter) {
403 return ((CharacterIterator *)(iter->context))->hasNext();
404 }
405
406 static UBool U_CALLCONV
characterIteratorHasPrevious(UCharIterator * iter)407 characterIteratorHasPrevious(UCharIterator *iter) {
408 return ((CharacterIterator *)(iter->context))->hasPrevious();
409 }
410
411 static UChar32 U_CALLCONV
characterIteratorCurrent(UCharIterator * iter)412 characterIteratorCurrent(UCharIterator *iter) {
413 UChar32 c;
414
415 c=((CharacterIterator *)(iter->context))->current();
416 if(c!=0xffff || ((CharacterIterator *)(iter->context))->hasNext()) {
417 return c;
418 } else {
419 return U_SENTINEL;
420 }
421 }
422
423 static UChar32 U_CALLCONV
characterIteratorNext(UCharIterator * iter)424 characterIteratorNext(UCharIterator *iter) {
425 if(((CharacterIterator *)(iter->context))->hasNext()) {
426 return ((CharacterIterator *)(iter->context))->nextPostInc();
427 } else {
428 return U_SENTINEL;
429 }
430 }
431
432 static UChar32 U_CALLCONV
characterIteratorPrevious(UCharIterator * iter)433 characterIteratorPrevious(UCharIterator *iter) {
434 if(((CharacterIterator *)(iter->context))->hasPrevious()) {
435 return ((CharacterIterator *)(iter->context))->previous();
436 } else {
437 return U_SENTINEL;
438 }
439 }
440
441 static uint32_t U_CALLCONV
characterIteratorGetState(const UCharIterator * iter)442 characterIteratorGetState(const UCharIterator *iter) {
443 return ((CharacterIterator *)(iter->context))->getIndex();
444 }
445
446 static void U_CALLCONV
characterIteratorSetState(UCharIterator * iter,uint32_t state,UErrorCode * pErrorCode)447 characterIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
448 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
449 /* do nothing */
450 } else if(iter==NULL || iter->context==NULL) {
451 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
452 } else if((int32_t)state<((CharacterIterator *)(iter->context))->startIndex() || ((CharacterIterator *)(iter->context))->endIndex()<(int32_t)state) {
453 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
454 } else {
455 ((CharacterIterator *)(iter->context))->setIndex((int32_t)state);
456 }
457 }
458
/*
 * Iterator template that forwards every operation to a C++ CharacterIterator.
 * uiter_setCharacterIterator() copies it and stores the CharacterIterator
 * pointer in the context field; the other data fields stay zero.
 */
static const UCharIterator characterIteratorWrapper={
    0, 0, 0, 0, 0, 0,   /* six leading data fields, all zero */
    characterIteratorGetIndex,
    characterIteratorMove,
    characterIteratorHasNext,
    characterIteratorHasPrevious,
    characterIteratorCurrent,
    characterIteratorNext,
    characterIteratorPrevious,
    NULL,               /* this slot is NULL in every table in this file */
    characterIteratorGetState,
    characterIteratorSetState
};
472
473 U_CAPI void U_EXPORT2
uiter_setCharacterIterator(UCharIterator * iter,CharacterIterator * charIter)474 uiter_setCharacterIterator(UCharIterator *iter, CharacterIterator *charIter) {
475 if(iter!=0) {
476 if(charIter!=0) {
477 *iter=characterIteratorWrapper;
478 iter->context=charIter;
479 } else {
480 *iter=noopIterator;
481 }
482 }
483 }
484
485 /* UCharIterator wrapper around Replaceable --------------------------------- */
486
487 /*
488 * This is an implementation of a code unit (UChar) iterator
489 * based on a Replaceable object.
490 *
491 * The UCharIterator.context field holds a pointer to the Replaceable.
492 * UCharIterator.length and UCharIterator.index hold Replaceable.length()
493 * and the iteration index.
494 */
495
496 static UChar32 U_CALLCONV
replaceableIteratorCurrent(UCharIterator * iter)497 replaceableIteratorCurrent(UCharIterator *iter) {
498 if(iter->index<iter->limit) {
499 return ((Replaceable *)(iter->context))->charAt(iter->index);
500 } else {
501 return U_SENTINEL;
502 }
503 }
504
505 static UChar32 U_CALLCONV
replaceableIteratorNext(UCharIterator * iter)506 replaceableIteratorNext(UCharIterator *iter) {
507 if(iter->index<iter->limit) {
508 return ((Replaceable *)(iter->context))->charAt(iter->index++);
509 } else {
510 return U_SENTINEL;
511 }
512 }
513
514 static UChar32 U_CALLCONV
replaceableIteratorPrevious(UCharIterator * iter)515 replaceableIteratorPrevious(UCharIterator *iter) {
516 if(iter->index>iter->start) {
517 return ((Replaceable *)(iter->context))->charAt(--iter->index);
518 } else {
519 return U_SENTINEL;
520 }
521 }
522
/*
 * Iterator template for Replaceable objects.
 * Index bookkeeping reuses the plain string iterator functions; only the
 * three functions that read text go through Replaceable::charAt().
 */
static const UCharIterator replaceableIterator={
    0, 0, 0, 0, 0, 0,   /* six leading data fields, zero until set by uiter_setReplaceable() */
    stringIteratorGetIndex,
    stringIteratorMove,
    stringIteratorHasNext,
    stringIteratorHasPrevious,
    replaceableIteratorCurrent,
    replaceableIteratorNext,
    replaceableIteratorPrevious,
    NULL,               /* this slot is NULL in every table in this file */
    stringIteratorGetState,
    stringIteratorSetState
};
536
537 U_CAPI void U_EXPORT2
uiter_setReplaceable(UCharIterator * iter,const Replaceable * rep)538 uiter_setReplaceable(UCharIterator *iter, const Replaceable *rep) {
539 if(iter!=0) {
540 if(rep!=0) {
541 *iter=replaceableIterator;
542 iter->context=rep;
543 iter->limit=iter->length=rep->length();
544 } else {
545 *iter=noopIterator;
546 }
547 }
548 }
549
550 /* UCharIterator implementation for UTF-8 strings --------------------------- */
551
552 /*
553 * Possible, probably necessary only for an implementation for arbitrary
554 * converters:
555 * Maintain a buffer (ring buffer?) for a piece of converted 16-bit text.
556 * This would require to turn reservedFn into a close function and
557 * to introduce a uiter_close(iter).
558 */
559
/*
 * Presumably the capacity intended for the converter buffer described in the
 * comment above. NOTE(review): no code visible in this file references it -
 * confirm before relying on or removing it.
 */
#define UITER_CNV_CAPACITY 16
561
562 /*
563 * Minimal implementation:
564 * Maintain a single-UChar buffer for an additional surrogate.
565 * The caller must not modify start and limit because they are used internally.
566 *
567 * Use UCharIterator fields as follows:
568 * context pointer to UTF-8 string
569 * length UTF-16 length of the string; -1 until lazy evaluation
570 * start current UTF-8 index
571 * index current UTF-16 index; may be -1="unknown" after setState()
572 * limit UTF-8 length of the string
573 * reservedField supplementary code point
574 *
575 * Since UCharIterator delivers 16-bit code units, the iteration can be
576 * currently in the middle of the byte sequence for a supplementary code point.
577 * In this case, reservedField will contain that code point and start will
578 * point to after the corresponding byte sequence. The UTF-16 index will be
579 * one less than what it would otherwise be corresponding to the UTF-8 index.
580 * Otherwise, reservedField will be 0.
581 */
582
583 /*
584 * Possible optimization for NUL-terminated UTF-8 and UTF-16 strings:
585 * Add implementations that do not call strlen() for iteration but check for NUL.
586 */
587
588 static int32_t U_CALLCONV
utf8IteratorGetIndex(UCharIterator * iter,UCharIteratorOrigin origin)589 utf8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
590 switch(origin) {
591 case UITER_ZERO:
592 case UITER_START:
593 return 0;
594 case UITER_CURRENT:
595 if(iter->index<0) {
596 /* the current UTF-16 index is unknown after setState(), count from the beginning */
597 const uint8_t *s;
598 UChar32 c;
599 int32_t i, limit, index;
600
601 s=(const uint8_t *)iter->context;
602 i=index=0;
603 limit=iter->start; /* count up to the UTF-8 index */
604 while(i<limit) {
605 U8_NEXT_OR_FFFD(s, i, limit, c);
606 index+=U16_LENGTH(c);
607 }
608
609 iter->start=i; /* just in case setState() did not get us to a code point boundary */
610 if(i==iter->limit) {
611 iter->length=index; /* in case it was <0 or wrong */
612 }
613 if(iter->reservedField!=0) {
614 --index; /* we are in the middle of a supplementary code point */
615 }
616 iter->index=index;
617 }
618 return iter->index;
619 case UITER_LIMIT:
620 case UITER_LENGTH:
621 if(iter->length<0) {
622 const uint8_t *s;
623 UChar32 c;
624 int32_t i, limit, length;
625
626 s=(const uint8_t *)iter->context;
627 if(iter->index<0) {
628 /*
629 * the current UTF-16 index is unknown after setState(),
630 * we must first count from the beginning to here
631 */
632 i=length=0;
633 limit=iter->start;
634
635 /* count from the beginning to the current index */
636 while(i<limit) {
637 U8_NEXT_OR_FFFD(s, i, limit, c);
638 length+=U16_LENGTH(c);
639 }
640
641 /* assume i==limit==iter->start, set the UTF-16 index */
642 iter->start=i; /* just in case setState() did not get us to a code point boundary */
643 iter->index= iter->reservedField!=0 ? length-1 : length;
644 } else {
645 i=iter->start;
646 length=iter->index;
647 if(iter->reservedField!=0) {
648 ++length;
649 }
650 }
651
652 /* count from the current index to the end */
653 limit=iter->limit;
654 while(i<limit) {
655 U8_NEXT_OR_FFFD(s, i, limit, c);
656 length+=U16_LENGTH(c);
657 }
658 iter->length=length;
659 }
660 return iter->length;
661 default:
662 /* not a valid origin */
663 /* Should never get here! */
664 return -1;
665 }
666 }
667
668 static int32_t U_CALLCONV
utf8IteratorMove(UCharIterator * iter,int32_t delta,UCharIteratorOrigin origin)669 utf8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) {
670 const uint8_t *s;
671 UChar32 c;
672 int32_t pos; /* requested UTF-16 index */
673 int32_t i; /* UTF-8 index */
674 UBool havePos;
675
676 /* calculate the requested UTF-16 index */
677 switch(origin) {
678 case UITER_ZERO:
679 case UITER_START:
680 pos=delta;
681 havePos=TRUE;
682 /* iter->index<0 (unknown) is possible */
683 break;
684 case UITER_CURRENT:
685 if(iter->index>=0) {
686 pos=iter->index+delta;
687 havePos=TRUE;
688 } else {
689 /* the current UTF-16 index is unknown after setState(), use only delta */
690 pos=0;
691 havePos=FALSE;
692 }
693 break;
694 case UITER_LIMIT:
695 case UITER_LENGTH:
696 if(iter->length>=0) {
697 pos=iter->length+delta;
698 havePos=TRUE;
699 } else {
700 /* pin to the end, avoid counting the length */
701 iter->index=-1;
702 iter->start=iter->limit;
703 iter->reservedField=0;
704 if(delta>=0) {
705 return UITER_UNKNOWN_INDEX;
706 } else {
707 /* the current UTF-16 index is unknown, use only delta */
708 pos=0;
709 havePos=FALSE;
710 }
711 }
712 break;
713 default:
714 return -1; /* Error */
715 }
716
717 if(havePos) {
718 /* shortcuts: pinning to the edges of the string */
719 if(pos<=0) {
720 iter->index=iter->start=iter->reservedField=0;
721 return 0;
722 } else if(iter->length>=0 && pos>=iter->length) {
723 iter->index=iter->length;
724 iter->start=iter->limit;
725 iter->reservedField=0;
726 return iter->index;
727 }
728
729 /* minimize the number of U8_NEXT/PREV operations */
730 if(iter->index<0 || pos<iter->index/2) {
731 /* go forward from the start instead of backward from the current index */
732 iter->index=iter->start=iter->reservedField=0;
733 } else if(iter->length>=0 && (iter->length-pos)<(pos-iter->index)) {
734 /*
735 * if we have the UTF-16 index and length and the new position is
736 * closer to the end than the current index,
737 * then go backward from the end instead of forward from the current index
738 */
739 iter->index=iter->length;
740 iter->start=iter->limit;
741 iter->reservedField=0;
742 }
743
744 delta=pos-iter->index;
745 if(delta==0) {
746 return iter->index; /* nothing to do */
747 }
748 } else {
749 /* move relative to unknown UTF-16 index */
750 if(delta==0) {
751 return UITER_UNKNOWN_INDEX; /* nothing to do */
752 } else if(-delta>=iter->start) {
753 /* moving backwards by more UChars than there are UTF-8 bytes, pin to 0 */
754 iter->index=iter->start=iter->reservedField=0;
755 return 0;
756 } else if(delta>=(iter->limit-iter->start)) {
757 /* moving forward by more UChars than the remaining UTF-8 bytes, pin to the end */
758 iter->index=iter->length; /* may or may not be <0 (unknown) */
759 iter->start=iter->limit;
760 iter->reservedField=0;
761 return iter->index>=0 ? iter->index : (int32_t)UITER_UNKNOWN_INDEX;
762 }
763 }
764
765 /* delta!=0 */
766
767 /* move towards the requested position, pin to the edges of the string */
768 s=(const uint8_t *)iter->context;
769 pos=iter->index; /* could be <0 (unknown) */
770 i=iter->start;
771 if(delta>0) {
772 /* go forward */
773 int32_t limit=iter->limit;
774 if(iter->reservedField!=0) {
775 iter->reservedField=0;
776 ++pos;
777 --delta;
778 }
779 while(delta>0 && i<limit) {
780 U8_NEXT_OR_FFFD(s, i, limit, c);
781 if(c<=0xffff) {
782 ++pos;
783 --delta;
784 } else if(delta>=2) {
785 pos+=2;
786 delta-=2;
787 } else /* delta==1 */ {
788 /* stop in the middle of a supplementary code point */
789 iter->reservedField=c;
790 ++pos;
791 break; /* delta=0; */
792 }
793 }
794 if(i==limit) {
795 if(iter->length<0 && iter->index>=0) {
796 iter->length= iter->reservedField==0 ? pos : pos+1;
797 } else if(iter->index<0 && iter->length>=0) {
798 iter->index= iter->reservedField==0 ? iter->length : iter->length-1;
799 }
800 }
801 } else /* delta<0 */ {
802 /* go backward */
803 if(iter->reservedField!=0) {
804 iter->reservedField=0;
805 i-=4; /* we stayed behind the supplementary code point; go before it now */
806 --pos;
807 ++delta;
808 }
809 while(delta<0 && i>0) {
810 U8_PREV_OR_FFFD(s, 0, i, c);
811 if(c<=0xffff) {
812 --pos;
813 ++delta;
814 } else if(delta<=-2) {
815 pos-=2;
816 delta+=2;
817 } else /* delta==-1 */ {
818 /* stop in the middle of a supplementary code point */
819 i+=4; /* back to behind this supplementary code point for consistent state */
820 iter->reservedField=c;
821 --pos;
822 break; /* delta=0; */
823 }
824 }
825 }
826
827 iter->start=i;
828 if(iter->index>=0) {
829 return iter->index=pos;
830 } else {
831 /* we started with index<0 (unknown) so pos is bogus */
832 if(i<=1) {
833 return iter->index=i; /* reached the beginning */
834 } else {
835 /* we still don't know the UTF-16 index */
836 return UITER_UNKNOWN_INDEX;
837 }
838 }
839 }
840
841 static UBool U_CALLCONV
utf8IteratorHasNext(UCharIterator * iter)842 utf8IteratorHasNext(UCharIterator *iter) {
843 return iter->start<iter->limit || iter->reservedField!=0;
844 }
845
846 static UBool U_CALLCONV
utf8IteratorHasPrevious(UCharIterator * iter)847 utf8IteratorHasPrevious(UCharIterator *iter) {
848 return iter->start>0;
849 }
850
851 static UChar32 U_CALLCONV
utf8IteratorCurrent(UCharIterator * iter)852 utf8IteratorCurrent(UCharIterator *iter) {
853 if(iter->reservedField!=0) {
854 return U16_TRAIL(iter->reservedField);
855 } else if(iter->start<iter->limit) {
856 const uint8_t *s=(const uint8_t *)iter->context;
857 UChar32 c;
858 int32_t i=iter->start;
859
860 U8_NEXT_OR_FFFD(s, i, iter->limit, c);
861 if(c<=0xffff) {
862 return c;
863 } else {
864 return U16_LEAD(c);
865 }
866 } else {
867 return U_SENTINEL;
868 }
869 }
870
871 static UChar32 U_CALLCONV
utf8IteratorNext(UCharIterator * iter)872 utf8IteratorNext(UCharIterator *iter) {
873 int32_t index;
874
875 if(iter->reservedField!=0) {
876 UChar trail=U16_TRAIL(iter->reservedField);
877 iter->reservedField=0;
878 if((index=iter->index)>=0) {
879 iter->index=index+1;
880 }
881 return trail;
882 } else if(iter->start<iter->limit) {
883 const uint8_t *s=(const uint8_t *)iter->context;
884 UChar32 c;
885
886 U8_NEXT_OR_FFFD(s, iter->start, iter->limit, c);
887 if((index=iter->index)>=0) {
888 iter->index=++index;
889 if(iter->length<0 && iter->start==iter->limit) {
890 iter->length= c<=0xffff ? index : index+1;
891 }
892 } else if(iter->start==iter->limit && iter->length>=0) {
893 iter->index= c<=0xffff ? iter->length : iter->length-1;
894 }
895 if(c<=0xffff) {
896 return c;
897 } else {
898 iter->reservedField=c;
899 return U16_LEAD(c);
900 }
901 } else {
902 return U_SENTINEL;
903 }
904 }
905
906 static UChar32 U_CALLCONV
utf8IteratorPrevious(UCharIterator * iter)907 utf8IteratorPrevious(UCharIterator *iter) {
908 int32_t index;
909
910 if(iter->reservedField!=0) {
911 UChar lead=U16_LEAD(iter->reservedField);
912 iter->reservedField=0;
913 iter->start-=4; /* we stayed behind the supplementary code point; go before it now */
914 if((index=iter->index)>0) {
915 iter->index=index-1;
916 }
917 return lead;
918 } else if(iter->start>0) {
919 const uint8_t *s=(const uint8_t *)iter->context;
920 UChar32 c;
921
922 U8_PREV_OR_FFFD(s, 0, iter->start, c);
923 if((index=iter->index)>0) {
924 iter->index=index-1;
925 } else if(iter->start<=1) {
926 iter->index= c<=0xffff ? iter->start : iter->start+1;
927 }
928 if(c<=0xffff) {
929 return c;
930 } else {
931 iter->start+=4; /* back to behind this supplementary code point for consistent state */
932 iter->reservedField=c;
933 return U16_TRAIL(c);
934 }
935 } else {
936 return U_SENTINEL;
937 }
938 }
939
940 static uint32_t U_CALLCONV
utf8IteratorGetState(const UCharIterator * iter)941 utf8IteratorGetState(const UCharIterator *iter) {
942 uint32_t state=(uint32_t)(iter->start<<1);
943 if(iter->reservedField!=0) {
944 state|=1;
945 }
946 return state;
947 }
948
949 static void U_CALLCONV
utf8IteratorSetState(UCharIterator * iter,uint32_t state,UErrorCode * pErrorCode)950 utf8IteratorSetState(UCharIterator *iter,
951 uint32_t state,
952 UErrorCode *pErrorCode)
953 {
954 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
955 /* do nothing */
956 } else if(iter==NULL) {
957 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
958 } else if(state==utf8IteratorGetState(iter)) {
959 /* setting to the current state: no-op */
960 } else {
961 int32_t index=(int32_t)(state>>1); /* UTF-8 index */
962 state&=1; /* 1 if in surrogate pair, must be index>=4 */
963
964 if((state==0 ? index<0 : index<4) || iter->limit<index) {
965 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
966 } else {
967 iter->start=index; /* restore UTF-8 byte index */
968 if(index<=1) {
969 iter->index=index;
970 } else {
971 iter->index=-1; /* unknown UTF-16 index */
972 }
973 if(state==0) {
974 iter->reservedField=0;
975 } else {
976 /* verified index>=4 above */
977 UChar32 c;
978 U8_PREV_OR_FFFD((const uint8_t *)iter->context, 0, index, c);
979 if(c<=0xffff) {
980 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
981 } else {
982 iter->reservedField=c;
983 }
984 }
985 }
986 }
987 }
988
/*
 * Iterator template for UTF-8 strings.
 * uiter_setUTF8() copies it and then fills in context, limit and length.
 */
static const UCharIterator utf8Iterator={
    0, 0, 0, 0, 0, 0,   /* six leading data fields, zero until set by uiter_setUTF8() */
    utf8IteratorGetIndex,
    utf8IteratorMove,
    utf8IteratorHasNext,
    utf8IteratorHasPrevious,
    utf8IteratorCurrent,
    utf8IteratorNext,
    utf8IteratorPrevious,
    NULL,               /* this slot is NULL in every table in this file */
    utf8IteratorGetState,
    utf8IteratorSetState
};
1002
1003 U_CAPI void U_EXPORT2
uiter_setUTF8(UCharIterator * iter,const char * s,int32_t length)1004 uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length) {
1005 if(iter!=0) {
1006 if(s!=0 && length>=-1) {
1007 *iter=utf8Iterator;
1008 iter->context=s;
1009 if(length>=0) {
1010 iter->limit=length;
1011 } else {
1012 iter->limit=(int32_t)uprv_strlen(s);
1013 }
1014 iter->length= iter->limit<=1 ? iter->limit : -1;
1015 } else {
1016 *iter=noopIterator;
1017 }
1018 }
1019 }
1020
1021 /* Helper functions --------------------------------------------------------- */
1022
1023 U_CAPI UChar32 U_EXPORT2
uiter_current32(UCharIterator * iter)1024 uiter_current32(UCharIterator *iter) {
1025 UChar32 c, c2;
1026
1027 c=iter->current(iter);
1028 if(U16_IS_SURROGATE(c)) {
1029 if(U16_IS_SURROGATE_LEAD(c)) {
1030 /*
1031 * go to the next code unit
1032 * we know that we are not at the limit because c!=U_SENTINEL
1033 */
1034 iter->move(iter, 1, UITER_CURRENT);
1035 if(U16_IS_TRAIL(c2=iter->current(iter))) {
1036 c=U16_GET_SUPPLEMENTARY(c, c2);
1037 }
1038
1039 /* undo index movement */
1040 iter->move(iter, -1, UITER_CURRENT);
1041 } else {
1042 if(U16_IS_LEAD(c2=iter->previous(iter))) {
1043 c=U16_GET_SUPPLEMENTARY(c2, c);
1044 }
1045 if(c2>=0) {
1046 /* undo index movement */
1047 iter->move(iter, 1, UITER_CURRENT);
1048 }
1049 }
1050 }
1051 return c;
1052 }
1053
1054 U_CAPI UChar32 U_EXPORT2
uiter_next32(UCharIterator * iter)1055 uiter_next32(UCharIterator *iter) {
1056 UChar32 c, c2;
1057
1058 c=iter->next(iter);
1059 if(U16_IS_LEAD(c)) {
1060 if(U16_IS_TRAIL(c2=iter->next(iter))) {
1061 c=U16_GET_SUPPLEMENTARY(c, c2);
1062 } else if(c2>=0) {
1063 /* unmatched first surrogate, undo index movement */
1064 iter->move(iter, -1, UITER_CURRENT);
1065 }
1066 }
1067 return c;
1068 }
1069
1070 U_CAPI UChar32 U_EXPORT2
uiter_previous32(UCharIterator * iter)1071 uiter_previous32(UCharIterator *iter) {
1072 UChar32 c, c2;
1073
1074 c=iter->previous(iter);
1075 if(U16_IS_TRAIL(c)) {
1076 if(U16_IS_LEAD(c2=iter->previous(iter))) {
1077 c=U16_GET_SUPPLEMENTARY(c2, c);
1078 } else if(c2>=0) {
1079 /* unmatched second surrogate, undo index movement */
1080 iter->move(iter, 1, UITER_CURRENT);
1081 }
1082 }
1083 return c;
1084 }
1085
1086 U_CAPI uint32_t U_EXPORT2
uiter_getState(const UCharIterator * iter)1087 uiter_getState(const UCharIterator *iter) {
1088 if(iter==NULL || iter->getState==NULL) {
1089 return UITER_NO_STATE;
1090 } else {
1091 return iter->getState(iter);
1092 }
1093 }
1094
1095 U_CAPI void U_EXPORT2
uiter_setState(UCharIterator * iter,uint32_t state,UErrorCode * pErrorCode)1096 uiter_setState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
1097 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1098 /* do nothing */
1099 } else if(iter==NULL) {
1100 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1101 } else if(iter->setState==NULL) {
1102 *pErrorCode=U_UNSUPPORTED_ERROR;
1103 } else {
1104 iter->setState(iter, state, pErrorCode);
1105 }
1106 }
1107
1108 U_CDECL_END
1109