1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2002-2011 International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  uiter.h
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2002jan18
16 *   created by: Markus W. Scherer
17 */
18 
19 #ifndef __UITER_H__
20 #define __UITER_H__
21 
22 /**
23  * \file
24  * \brief C API: Unicode Character Iteration
25  *
26  * @see UCharIterator
27  */
28 
29 #include "unicode/utypes.h"
30 
31 #if U_SHOW_CPLUSPLUS_API
32     U_NAMESPACE_BEGIN
33 
34     class CharacterIterator;  /* forward declaration; used by uiter_setCharacterIterator() */
35     class Replaceable;        /* forward declaration; used by uiter_setReplaceable() */
36 
37     U_NAMESPACE_END
38 #endif  /* U_SHOW_CPLUSPLUS_API */
39 
40 U_CDECL_BEGIN
41 
42 struct UCharIterator; /* forward declaration so that the function type declarations below can take UCharIterator pointers */
43 typedef struct UCharIterator UCharIterator; /**< C typedef for struct UCharIterator. @stable ICU 2.1 */
44 
45 /**
46  * Origin constants for UCharIterator.getIndex() and UCharIterator.move(): they select the start, the current index, the limit, index 0, or the length as the reference point.
47  * @see UCharIteratorMove
48  * @see UCharIterator
49  * @stable ICU 2.1
50  */
51 typedef enum UCharIteratorOrigin {
52     UITER_START, UITER_CURRENT, UITER_LIMIT, UITER_ZERO, UITER_LENGTH /* implicit values 0..4, in this order */
53 } UCharIteratorOrigin;
54 
55 /** Special return value constants for UCharIterator functions. @stable ICU 2.6 */
56 enum {
57     /**
58      * Constant value that may be returned by UCharIteratorMove
59      * indicating that the final UTF-16 index is not known, but that the move succeeded.
60      * This can occur when moving relative to limit or length, or
61      * when moving relative to the current index after a setState()
62      * when the current UTF-16 index is not known.
63      *
64      * It would be very inefficient to have to count from the beginning of the text
65      * just to get the current/limit/length index after moving relative to it.
66      * The actual index can be determined with getIndex(UITER_CURRENT)
67      * which will count the UChars if necessary.
68      *
69      * @stable ICU 2.6
70      */
71     UITER_UNKNOWN_INDEX=-2
72 };
73 
74 
75 /**
76  * Constant for UCharIterator getState() indicating an error or
77  * an unknown state.
78  * Returned by uiter_getState()/UCharIteratorGetState
79  * when an error occurs, or by uiter_getState() when iter->getState is NULL.
80  * Also, some UCharIterator implementations may not be able to return
81  * a valid state for each position. This will be clearly documented
82  * for each such iterator (none of the public ones here).
83  *
84  * @stable ICU 2.6
85  */
86 #define UITER_NO_STATE ((uint32_t)0xffffffff)
87 
88 /**
89  * Function type declaration for UCharIterator.getIndex().
90  *
91  * Gets the current position, or the start or limit of the
92  * iteration range.
93  *
94  * This function may perform slowly for UITER_CURRENT after setState() was called,
95  * or for UITER_LENGTH, because an iterator implementation may have to count
96  * UChars if the underlying storage is not UTF-16.
97  *
98  * @param iter the UCharIterator structure ("this pointer")
99  * @param origin which index to get: UITER_ZERO, UITER_START, UITER_LIMIT, UITER_LENGTH, or UITER_CURRENT
100  * @return the requested index, or U_SENTINEL in an error condition
101  *
102  * @see UCharIteratorOrigin
103  * @see UCharIterator
104  * @stable ICU 2.1
105  */
106 typedef int32_t U_CALLCONV
107 UCharIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin);
108 
109 /**
110  * Function type declaration for UCharIterator.move().
111  *
112  * Use iter->move(iter, index, UITER_ZERO) like CharacterIterator::setIndex(index).
113  *
114  * Moves the current position relative to the start or limit of the
115  * iteration range, or relative to the current position itself.
116  * The movement is expressed in numbers of code units forward
117  * or backward by specifying a positive or negative delta.
118  * Out of bounds movement will be pinned to the start or limit.
119  *
120  * This function may perform slowly for moving relative to UITER_LENGTH
121  * because an iterator implementation may have to count the rest of the
122  * UChars if the native storage is not UTF-16.
123  *
124  * When moving relative to the limit or length, or
125  * relative to the current position after setState() was called,
126  * move() may return UITER_UNKNOWN_INDEX (-2) to avoid an inefficient
127  * determination of the actual UTF-16 index.
128  * The actual index can be determined with getIndex(UITER_CURRENT)
129  * which will count the UChars if necessary.
130  * See UITER_UNKNOWN_INDEX for details.
131  *
132  * @param iter the UCharIterator structure ("this pointer")
133  * @param delta number of code units to move; can be positive (forward), zero, or negative (backward)
134  * @param origin move relative to UITER_ZERO, UITER_START, UITER_LIMIT, UITER_LENGTH, or UITER_CURRENT
135  * @return the new index, or U_SENTINEL on an error condition,
136  *         or UITER_UNKNOWN_INDEX when the index is not known.
137  *
138  * @see UCharIteratorOrigin
139  * @see UCharIterator
140  * @see UITER_UNKNOWN_INDEX
141  * @stable ICU 2.1
142  */
143 typedef int32_t U_CALLCONV
144 UCharIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin);
145 
146 /**
147  * Function type declaration for UCharIterator.hasNext().
148  *
149  * Check if current() and next() can still
150  * return another code unit.
151  *
152  * @param iter the UCharIterator structure ("this pointer")
153  * @return boolean value for whether current() and next() can still return another code unit (i.e., the index has not reached the limit)
154  *
155  * @see UCharIterator
156  * @stable ICU 2.1
157  */
158 typedef UBool U_CALLCONV
159 UCharIteratorHasNext(UCharIterator *iter);
160 
161 /**
162  * Function type declaration for UCharIterator.hasPrevious().
163  *
164  * Check if previous() can still return another code unit.
165  *
166  * @param iter the UCharIterator structure ("this pointer")
167  * @return boolean value for whether previous() can still return another code unit (i.e., the index is not at the start)
168  *
169  * @see UCharIterator
170  * @stable ICU 2.1
171  */
172 typedef UBool U_CALLCONV
173 UCharIteratorHasPrevious(UCharIterator *iter);
174 
175 /**
176  * Function type declaration for UCharIterator.current().
177  *
178  * Return the code unit at the current position,
179  * or U_SENTINEL if there is none (index is at the limit).
180  *
181  * @param iter the UCharIterator structure ("this pointer")
182  * @return the current code unit, or U_SENTINEL if the index is at the limit
183  *
184  * @see UCharIterator
185  * @stable ICU 2.1
186  */
187 typedef UChar32 U_CALLCONV
188 UCharIteratorCurrent(UCharIterator *iter);
189 
190 /**
191  * Function type declaration for UCharIterator.next().
192  *
193  * Return the code unit at the current index and increment
194  * the index (post-increment, like s[i++]),
195  * or return U_SENTINEL if there is none (index is at the limit).
196  *
197  * @param iter the UCharIterator structure ("this pointer")
198  * @return the current code unit (and post-increment the current index), or U_SENTINEL if the index is at the limit
199  *
200  * @see UCharIterator
201  * @stable ICU 2.1
202  */
203 typedef UChar32 U_CALLCONV
204 UCharIteratorNext(UCharIterator *iter);
205 
206 /**
207  * Function type declaration for UCharIterator.previous().
208  *
209  * Decrement the index and return the code unit from there
210  * (pre-decrement, like s[--i]),
211  * or return U_SENTINEL if there is none (index is at the start).
212  *
213  * @param iter the UCharIterator structure ("this pointer")
214  * @return the previous code unit (after pre-decrementing the current index), or U_SENTINEL if the index is at the start
215  *
216  * @see UCharIterator
217  * @stable ICU 2.1
218  */
219 typedef UChar32 U_CALLCONV
220 UCharIteratorPrevious(UCharIterator *iter);
221 
222 /**
223  * Function type declaration for UCharIterator.reservedFn().
224  * Reserved for future use; the UCharIterator.reservedFn pointer is currently NULL.
225  *
226  * @param iter the UCharIterator structure ("this pointer")
227  * @param something some integer argument
228  * @return some integer
229  *
230  * @see UCharIterator
231  * @stable ICU 2.1
232  */
233 typedef int32_t U_CALLCONV
234 UCharIteratorReserved(UCharIterator *iter, int32_t something);
235 
236 /**
237  * Function type declaration for UCharIterator.getState().
238  *
239  * Get the "state" of the iterator in the form of a single 32-bit word.
240  * It is recommended that the state value be calculated to be as small as
241  * is feasible. For strings with limited lengths, fewer than 32 bits may
242  * be sufficient.
243  *
244  * This is used together with setState()/UCharIteratorSetState
245  * to save and restore the iterator position more efficiently than with
246  * getIndex()/move().
247  *
248  * The iterator state is defined as a uint32_t value because it is designed
249  * for use in ucol_nextSortKeyPart() which provides 32 bits to store the state
250  * of the character iterator.
251  *
252  * With some UCharIterator implementations (e.g., UTF-8),
253  * getting and setting the UTF-16 index with existing functions
254  * (getIndex(UITER_CURRENT) followed by move(pos, UITER_ZERO)) is possible but
255  * relatively slow because the iterator has to "walk" from a known index
256  * to the requested one.
257  * This takes more time the farther it needs to go.
258  *
259  * An opaque state value allows an iterator implementation to provide
260  * an internal index (UTF-8: the source byte array index) for
261  * fast, constant-time restoration.
262  *
263  * After calling setState(), a getIndex(UITER_CURRENT) may be slow because
264  * the UTF-16 index may not be restored as well, but the iterator can deliver
265  * the correct text contents and move relative to the current position
266  * without performance degradation.
267  *
268  * Some UCharIterator implementations may not be able to return
269  * a valid state for each position, in which case they return UITER_NO_STATE instead.
270  * This will be clearly documented for each such iterator (none of the public ones here).
271  *
272  * @param iter the UCharIterator structure ("this pointer")
273  * @return the state word, or UITER_NO_STATE if no valid state can be returned
274  *
275  * @see UCharIterator
276  * @see UCharIteratorSetState
277  * @see UITER_NO_STATE
278  * @stable ICU 2.6
279  */
280 typedef uint32_t U_CALLCONV
281 UCharIteratorGetState(const UCharIterator *iter);
282 
283 /**
284  * Function type declaration for UCharIterator.setState().
285  *
286  * Restore the "state" of the iterator using a state word from a getState() call.
287  * The iterator object need not be the same one as for which getState() was called,
288  * but it must be of the same type (set up using the same uiter_setXYZ function)
289  * and it must iterate over the same string
290  * (binary identical regardless of memory address).
291  * For more about the state word see UCharIteratorGetState.
292  *
293  * After calling setState(), a getIndex(UITER_CURRENT) may be slow because
294  * the UTF-16 index may not be restored as well, but the iterator can deliver
295  * the correct text contents and move relative to the current position
296  * without performance degradation.
297  *
298  * @param iter the UCharIterator structure ("this pointer"); it need not be the iterator that getState() was called on
299  * @param state the state word from a getState() call
300  *              on a same-type, same-string iterator
301  * @param pErrorCode Must be a valid pointer to an error code value,
302  *                   which must not indicate a failure before the function call.
303  *
304  * @see UCharIterator
305  * @see UCharIteratorGetState
306  * @stable ICU 2.6
307  */
308 typedef void U_CALLCONV
309 UCharIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode);
310 
311 
312 /**
313  * C API for code unit iteration.
314  * This can be used as a C wrapper around
315  * CharacterIterator, Replaceable, or implemented using simple strings, etc.
316  *
317  * There are two roles for using UCharIterator:
318  *
319  * A "provider" sets the necessary function pointers and controls the "protected"
320  * fields of the UCharIterator structure. A "provider" passes a UCharIterator
321  * into C APIs that need a UCharIterator as an abstract, flexible string interface.
322  *
323  * Implementations of such C APIs are "callers" of UCharIterator functions;
324  * they only use the "public" function pointers and never access the "protected"
325  * fields directly.
326  *
327  * The current() and next() functions only check the current index against the
328  * limit, and previous() only checks the current index against the start,
329  * to see if the iterator already reached the end of the iteration range.
330  *
331  * The assumption - in all iterators - is that the index is moved via the API,
332  * which means it won't go out of bounds, or the index is modified by
333  * user code that knows enough about the iterator implementation to set valid
334  * index values.
335  *
336  * UCharIterator functions return code unit values 0..0xffff,
337  * or U_SENTINEL if the iteration bounds are reached.
338  *
339  * @stable ICU 2.1
340  */
341 struct UCharIterator {
342     /**
343      * (protected) Pointer to string or wrapped object or similar.
344      * Not used by caller.
345      * @stable ICU 2.1
346      */
347     const void *context;
348 
349     /**
350      * (protected) Length of string or similar.
351      * Not used by caller.
352      * @stable ICU 2.1
353      */
354     int32_t length;
355 
356     /**
357      * (protected) Start index or similar.
358      * Not used by caller.
359      * @stable ICU 2.1
360      */
361     int32_t start;
362 
363     /**
364      * (protected) Current index or similar.
365      * Not used by caller.
366      * @stable ICU 2.1
367      */
368     int32_t index;
369 
370     /**
371      * (protected) Limit index or similar.
372      * Not used by caller.
373      * @stable ICU 2.1
374      */
375     int32_t limit;
376 
377     /**
378      * (protected) Used by UTF-8 iterators and possibly others. Not used by caller.
379      * @stable ICU 2.1
380      */
381     int32_t reservedField;
382 
383     /**
384      * (public) Returns the current position or the
385      * start or limit index of the iteration range.
386      *
387      * @see UCharIteratorGetIndex
388      * @stable ICU 2.1
389      */
390     UCharIteratorGetIndex *getIndex;
391 
392     /**
393      * (public) Moves the current position relative to the start or limit of the
394      * iteration range, or relative to the current position itself.
395      * The movement is expressed in numbers of code units forward
396      * or backward by specifying a positive or negative delta.
397      *
398      * @see UCharIteratorMove
399      * @stable ICU 2.1
400      */
401     UCharIteratorMove *move;
402 
403     /**
404      * (public) Check if current() and next() can still
405      * return another code unit.
406      *
407      * @see UCharIteratorHasNext
408      * @stable ICU 2.1
409      */
410     UCharIteratorHasNext *hasNext;
411 
412     /**
413      * (public) Check if previous() can still return another code unit.
414      *
415      * @see UCharIteratorHasPrevious
416      * @stable ICU 2.1
417      */
418     UCharIteratorHasPrevious *hasPrevious;
419 
420     /**
421      * (public) Return the code unit at the current position,
422      * or U_SENTINEL if there is none (index is at the limit).
423      *
424      * @see UCharIteratorCurrent
425      * @stable ICU 2.1
426      */
427     UCharIteratorCurrent *current;
428 
429     /**
430      * (public) Return the code unit at the current index and increment
431      * the index (post-increment, like s[i++]),
432      * or return U_SENTINEL if there is none (index is at the limit).
433      *
434      * @see UCharIteratorNext
435      * @stable ICU 2.1
436      */
437     UCharIteratorNext *next;
438 
439     /**
440      * (public) Decrement the index and return the code unit from there
441      * (pre-decrement, like s[--i]),
442      * or return U_SENTINEL if there is none (index is at the start).
443      *
444      * @see UCharIteratorPrevious
445      * @stable ICU 2.1
446      */
447     UCharIteratorPrevious *previous;
448 
449     /**
450      * (public) Reserved for future use. Currently NULL.
451      *
452      * @see UCharIteratorReserved
453      * @stable ICU 2.1
454      */
455     UCharIteratorReserved *reservedFn;
456 
457     /**
458      * (public) Return the state of the iterator, to be restored later with setState().
459      * This function pointer is NULL if the iterator does not implement it.
460      *
461      * @see UCharIteratorGetState
462      * @stable ICU 2.6
463      */
464     UCharIteratorGetState *getState;
465 
466     /**
467      * (public) Restore the iterator state from the state word from a call
468      * to getState().
469      * This function pointer is NULL if the iterator does not implement it.
470      *
471      * @see UCharIteratorSetState
472      * @stable ICU 2.6
473      */
474     UCharIteratorSetState *setState;
475 };
476 
477 /**
478  * Helper function for UCharIterator to get the code point
479  * at the current index.
480  *
481  * Return the code point that includes the code unit at the current position,
482  * or U_SENTINEL if there is none (index is at the limit).
483  * If the current code unit is a lead or trail surrogate,
484  * then the following or preceding surrogate is used to form
485  * the code point value.
486  *
487  * @param iter the UCharIterator structure ("this pointer")
488  * @return the current code point, or U_SENTINEL if the index is at the limit
489  *
490  * @see UCharIterator
491  * @see U16_GET
492  * @see UnicodeString::char32At()
493  * @stable ICU 2.1
494  */
495 U_STABLE UChar32 U_EXPORT2
496 uiter_current32(UCharIterator *iter);
497 
498 /**
499  * Helper function for UCharIterator to get the next code point.
500  *
501  * Return the code point at the current index and increment
502  * the index (post-increment, like s[i++]),
503  * or return U_SENTINEL if there is none (index is at the limit).
504  *
505  * @param iter the UCharIterator structure ("this pointer")
506  * @return the current code point (and post-increment the current index), or U_SENTINEL if the index is at the limit
507  *
508  * @see UCharIterator
509  * @see U16_NEXT
510  * @stable ICU 2.1
511  */
512 U_STABLE UChar32 U_EXPORT2
513 uiter_next32(UCharIterator *iter);
514 
515 /**
516  * Helper function for UCharIterator to get the previous code point.
517  *
518  * Decrement the index and return the code point from there
519  * (pre-decrement, like s[--i]),
520  * or return U_SENTINEL if there is none (index is at the start).
521  *
522  * @param iter the UCharIterator structure ("this pointer")
523  * @return the previous code point (after pre-decrementing the current index), or U_SENTINEL if the index is at the start
524  *
525  * @see UCharIterator
526  * @see U16_PREV
527  * @stable ICU 2.1
528  */
529 U_STABLE UChar32 U_EXPORT2
530 uiter_previous32(UCharIterator *iter);
531 
532 /**
533  * Get the "state" of the iterator in the form of a single 32-bit word.
534  * This is a convenience function that calls iter->getState(iter)
535  * if iter->getState is not NULL;
536  * if it is NULL or any other error occurs, then UITER_NO_STATE is returned.
537  *
538  * Some UCharIterator implementations may not be able to return
539  * a valid state for each position, in which case they return UITER_NO_STATE instead.
540  * This will be clearly documented for each such iterator (none of the public ones here).
541  *
542  * @param iter the UCharIterator structure ("this pointer")
543  * @return the state word, or UITER_NO_STATE if iter->getState is NULL or an error occurs
544  *
545  * @see UCharIterator
546  * @see UCharIteratorGetState
547  * @see UITER_NO_STATE
548  * @stable ICU 2.6
549  */
550 U_STABLE uint32_t U_EXPORT2
551 uiter_getState(const UCharIterator *iter);
552 
553 /**
554  * Restore the "state" of the iterator using a state word from a getState() call.
555  * This is a convenience function that calls iter->setState(iter, state, pErrorCode)
556  * if iter->setState is not NULL; if it is NULL, then U_UNSUPPORTED_ERROR is set.
557  *
558  * @param iter the UCharIterator structure ("this pointer")
559  * @param state the state word from a getState() call
560  *              on a same-type, same-string iterator
561  * @param pErrorCode Must be a valid pointer to an error code value,
562  *                   which must not indicate a failure before the function call. Set to U_UNSUPPORTED_ERROR if iter->setState is NULL.
563  *
564  * @see UCharIterator
565  * @see UCharIteratorSetState
566  * @stable ICU 2.6
567  */
568 U_STABLE void U_EXPORT2
569 uiter_setState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode);
570 
571 /**
572  * Set up a UCharIterator to iterate over a string.
573  *
574  * Sets the UCharIterator function pointers for iteration over the string s
575  * with iteration boundaries start=index=0 and length=limit=string length.
576  * The "provider" may set the start, index, and limit values at any time
577  * within the range 0..length.
578  * The length field will be ignored.
579  *
580  * The string pointer s is set into UCharIterator.context without copying
581  * or reallocating the string contents.
582  *
583  * getState() simply returns the current index.
584  * move() will always return the final index.
585  *
586  * @param iter UCharIterator structure to be set for iteration
587  * @param s String to iterate over (the pointer is stored, the contents are not copied)
588  * @param length Length of s in UChars, or -1 if NUL-terminated
589  *
590  * @see UCharIterator
591  * @stable ICU 2.1
592  */
593 U_STABLE void U_EXPORT2
594 uiter_setString(UCharIterator *iter, const UChar *s, int32_t length);
595 
596 /**
597  * Set up a UCharIterator to iterate over a UTF-16BE string
598  * (byte vector with a big-endian pair of bytes per UChar).
599  *
600  * Everything works just like with a normal UChar iterator (uiter_setString),
601  * except that UChars are assembled from byte pairs,
602  * and that the length argument here indicates an even number of bytes.
603  *
604  * getState() simply returns the current index.
605  * move() will always return the final index.
606  *
607  * @param iter UCharIterator structure to be set for iteration
608  * @param s UTF-16BE string (byte vector, two big-endian bytes per UChar) to iterate over
609  * @param length Length of s as an even number of bytes, or -1 if NUL-terminated
610  *               (NUL means pair of 0 bytes at even index from s)
611  *
612  * @see UCharIterator
613  * @see uiter_setString
614  * @stable ICU 2.6
615  */
616 U_STABLE void U_EXPORT2
617 uiter_setUTF16BE(UCharIterator *iter, const char *s, int32_t length);
618 
619 /**
620  * Set up a UCharIterator to iterate over a UTF-8 string.
621  *
622  * Sets the UCharIterator function pointers for iteration over the UTF-8 string s
623  * with UTF-8 iteration boundaries 0 and length.
624  * The implementation counts the UTF-16 index on the fly and
625  * lazily evaluates the UTF-16 length of the text.
626  *
627  * The start field is used as the UTF-8 offset, the limit field as the UTF-8 length.
628  * When the reservedField is not 0, then it contains a supplementary code point
629  * and the UTF-16 index is between the two corresponding surrogates.
630  * At that point, the UTF-8 index is behind that code point.
631  *
632  * The UTF-8 string pointer s is set into UCharIterator.context without copying
633  * or reallocating the string contents.
634  *
635  * getState() returns a state value consisting of
636  * - the current UTF-8 source byte index (bits 31..1)
637  * - a flag (bit 0) that indicates whether the UChar position is in the middle
638  *   of a surrogate pair
639  *   (from a 4-byte UTF-8 sequence for the corresponding supplementary code point)
640  *
641  * getState() cannot also encode the UTF-16 index in the state value.
642  * move(relative to limit or length), or
643  * move(relative to current) after setState(), may return UITER_UNKNOWN_INDEX.
644  *
645  * @param iter UCharIterator structure to be set for iteration
646  * @param s UTF-8 string to iterate over (the pointer is stored, the contents are not copied)
647  * @param length Length of s in bytes, or -1 if NUL-terminated
648  *
649  * @see UCharIterator
650  * @stable ICU 2.6
651  */
652 U_STABLE void U_EXPORT2
653 uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length);
654 
655 #if U_SHOW_CPLUSPLUS_API
656 
657 /**
658  * Set up a UCharIterator to wrap around a C++ CharacterIterator.
659  *
660  * Sets the UCharIterator function pointers for iteration using the
661  * CharacterIterator charIter.
662  *
663  * The CharacterIterator pointer charIter is set into UCharIterator.context
664  * without copying or cloning the CharacterIterator object.
665  * The other "protected" UCharIterator fields are set to 0 and will be ignored.
666  * The iteration index and boundaries are controlled by the CharacterIterator.
667  *
668  * getState() simply returns the current index.
669  * move() will always return the final index.
670  *
671  * @param iter UCharIterator structure to be set for iteration
672  * @param charIter CharacterIterator to wrap (the pointer is stored, the object is not copied or cloned)
673  *
674  * @see UCharIterator
675  * @stable ICU 2.1
676  */
677 U_STABLE void U_EXPORT2
678 uiter_setCharacterIterator(UCharIterator *iter, icu::CharacterIterator *charIter);
679 
680 /**
681  * Set up a UCharIterator to iterate over a C++ Replaceable.
682  *
683  * Sets the UCharIterator function pointers for iteration over the
684  * Replaceable rep with iteration boundaries start=index=0 and
685  * length=limit=rep->length().
686  * The "provider" may set the start, index, and limit values at any time
687  * within the range 0..length=rep->length().
688  * The length field will be ignored.
689  *
690  * The Replaceable pointer rep is set into UCharIterator.context without copying
691  * or cloning/reallocating the Replaceable object.
692  *
693  * getState() simply returns the current index.
694  * move() will always return the final index.
695  *
696  * @param iter UCharIterator structure to be set for iteration
697  * @param rep Replaceable to iterate over (the pointer is stored, the object is not copied or cloned)
698  *
699  * @see UCharIterator
700  * @stable ICU 2.1
701  */
702 U_STABLE void U_EXPORT2
703 uiter_setReplaceable(UCharIterator *iter, const icu::Replaceable *rep);
704 
705 #endif
706 
707 U_CDECL_END
708 
709 #endif
710