1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 2002-2011 International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  uiter.h
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2002jan18
14 *   created by: Markus W. Scherer
15 */
16 
17 #ifndef __UITER_H__
18 #define __UITER_H__
19 
20 /**
21  * \file
22  * \brief C API: Unicode Character Iteration
23  *
24  * @see UCharIterator
25  */
26 
27 #include "unicode/utypes.h"
28 
29 #if U_SHOW_CPLUSPLUS_API
30     U_NAMESPACE_BEGIN
31     /* Forward declarations of the C++ types that uiter_setCharacterIterator() and uiter_setReplaceable() take by pointer. */
32     class CharacterIterator;
33     class Replaceable;
34 
35     U_NAMESPACE_END
36 #endif /* U_SHOW_CPLUSPLUS_API */
37 
38 U_CDECL_BEGIN
39 
40 struct UCharIterator; /* forward declaration: the function types below take UCharIterator pointers before the struct is defined */
41 typedef struct UCharIterator UCharIterator; /**< C typedef for struct UCharIterator. @stable ICU 2.1 */
42 
43 /**
44  * Origin constants for UCharIterator.getIndex() and UCharIterator.move(); each selects the index (start, current, limit, 0, or length) to report or to move relative to.
45  * @see UCharIteratorGetIndex, UCharIteratorMove
46  * @see UCharIterator
47  * @stable ICU 2.1
48  */
49 typedef enum UCharIteratorOrigin {
50     UITER_START, UITER_CURRENT, UITER_LIMIT, UITER_ZERO, UITER_LENGTH /* implicit values 0, 1, 2, 3, 4 in this order */
51 } UCharIteratorOrigin;
52 
53 /** Constants for UCharIterator. @stable ICU 2.6 */
54 enum {
55     /**
56      * Constant value that may be returned by UCharIteratorMove (UCharIterator.move())
57      * indicating that the final UTF-16 index is not known, but that the move succeeded.
58      * This can occur when moving relative to limit or length, or
59      * when moving relative to the current index after a setState()
60      * when the current UTF-16 index is not known.
61      *
62      * It would be very inefficient to have to count from the beginning of the text
63      * just to get the current/limit/length index after moving relative to it.
64      * The actual index can be determined with getIndex(UITER_CURRENT)
65      * which will count the UChars if necessary.
66      * @see UCharIteratorMove
67      * @stable ICU 2.6
68      */
69     UITER_UNKNOWN_INDEX=-2
70 };
71 
72 
73 /**
74  * Constant for UCharIterator getState() indicating an error or
75  * an unknown state.
76  * Returned by uiter_getState()/UCharIteratorGetState
77  * when an error occurs; uiter_getState() also returns it when iter->getState is NULL.
78  * Also, some UCharIterator implementations may not be able to return
79  * a valid state for each position. This will be clearly documented
80  * for each such iterator (none of the public ones here).
81  * @see uiter_getState
82  * @stable ICU 2.6
83  */
84 #define UITER_NO_STATE ((uint32_t)0xffffffff)
85 
86 /**
87  * Function type declaration for UCharIterator.getIndex().
88  *
89  * Gets the current position, or the start or limit of the
90  * iteration range.
91  *
92  * This function may perform slowly for UITER_CURRENT after setState() was called,
93  * or for UITER_LENGTH, because an iterator implementation may have to count
94  * UChars if the underlying storage is not UTF-16.
95  *
96  * @param iter the UCharIterator structure ("this pointer")
97  * @param origin which index to get: UITER_ZERO, UITER_START, UITER_LIMIT, UITER_LENGTH, or UITER_CURRENT
98  * @return the requested index, or U_SENTINEL in an error condition
99  *
100  * @see UCharIteratorOrigin
101  * @see UCharIterator
102  * @stable ICU 2.1
103  */
104 typedef int32_t U_CALLCONV
105 UCharIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin);
106 
107 /**
108  * Function type declaration for UCharIterator.move().
109  *
110  * Use iter->move(iter, index, UITER_ZERO) like CharacterIterator::setIndex(index).
111  *
112  * Moves the current position relative to the start or limit of the
113  * iteration range, or relative to the current position itself.
114  * The movement is expressed in numbers of code units forward
115  * or backward by specifying a positive or negative delta.
116  * Out of bounds movement will be pinned to the start or limit.
117  *
118  * This function may perform slowly for moving relative to UITER_LENGTH
119  * because an iterator implementation may have to count the rest of the
120  * UChars if the native storage is not UTF-16.
121  *
122  * When moving relative to the limit or length, or
123  * relative to the current position after setState() was called,
124  * move() may return UITER_UNKNOWN_INDEX (-2) to avoid an inefficient
125  * determination of the actual UTF-16 index.
126  * The actual index can be determined with getIndex(UITER_CURRENT)
127  * which will count the UChars if necessary.
128  * See UITER_UNKNOWN_INDEX for details.
129  *
130  * @param iter the UCharIterator structure ("this pointer")
131  * @param delta number of code units to move: positive (forward), zero, or negative (backward)
132  * @param origin move relative to the 0, start, limit, length, or current index (a UCharIteratorOrigin constant)
133  * @return the new index, or U_SENTINEL on an error condition,
134  *         or UITER_UNKNOWN_INDEX when the index is not known.
135  *
136  * @see UCharIteratorOrigin
137  * @see UCharIterator
138  * @see UITER_UNKNOWN_INDEX
139  * @stable ICU 2.1
140  */
141 typedef int32_t U_CALLCONV
142 UCharIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin);
143 
144 /**
145  * Function type declaration for UCharIterator.hasNext().
146  *
147  * Check if current() and next() can still
148  * return another code unit.
149  *
150  * @param iter the UCharIterator structure ("this pointer")
151  * @return whether current() and next() can still return another code unit (rather than U_SENTINEL)
152  *
153  * @see UCharIterator
154  * @stable ICU 2.1
155  */
156 typedef UBool U_CALLCONV
157 UCharIteratorHasNext(UCharIterator *iter);
158 
159 /**
160  * Function type declaration for UCharIterator.hasPrevious().
161  *
162  * Check if previous() can still return another code unit.
163  *
164  * @param iter the UCharIterator structure ("this pointer")
165  * @return whether previous() can still return another code unit (rather than U_SENTINEL)
166  *
167  * @see UCharIterator
168  * @stable ICU 2.1
169  */
170 typedef UBool U_CALLCONV
171 UCharIteratorHasPrevious(UCharIterator *iter);
172 
173 /**
174  * Function type declaration for UCharIterator.current().
175  *
176  * Return the code unit at the current position,
177  * or U_SENTINEL if there is none (index is at the limit).
178  *
179  * @param iter the UCharIterator structure ("this pointer")
180  * @return the current code unit, or U_SENTINEL if the index is at the limit
181  *
182  * @see UCharIterator
183  * @stable ICU 2.1
184  */
185 typedef UChar32 U_CALLCONV
186 UCharIteratorCurrent(UCharIterator *iter);
187 
188 /**
189  * Function type declaration for UCharIterator.next().
190  *
191  * Return the code unit at the current index and increment
192  * the index (post-increment, like s[i++]),
193  * or return U_SENTINEL if there is none (index is at the limit).
194  *
195  * @param iter the UCharIterator structure ("this pointer")
196  * @return the current code unit (and post-increment the current index), or U_SENTINEL if the index is at the limit
197  *
198  * @see UCharIterator
199  * @stable ICU 2.1
200  */
201 typedef UChar32 U_CALLCONV
202 UCharIteratorNext(UCharIterator *iter);
203 
204 /**
205  * Function type declaration for UCharIterator.previous().
206  *
207  * Decrement the index and return the code unit from there
208  * (pre-decrement, like s[--i]),
209  * or return U_SENTINEL if there is none (index is at the start).
210  *
211  * @param iter the UCharIterator structure ("this pointer")
212  * @return the previous code unit (after pre-decrementing the current index), or U_SENTINEL if the index is at the start
213  *
214  * @see UCharIterator
215  * @stable ICU 2.1
216  */
217 typedef UChar32 U_CALLCONV
218 UCharIteratorPrevious(UCharIterator *iter);
219 
220 /**
221  * Function type declaration for UCharIterator.reservedFn().
222  * Reserved for future use; the UCharIterator.reservedFn pointer is documented as currently NULL.
223  *
224  * @param iter the UCharIterator structure ("this pointer")
225  * @param something some integer argument
226  * @return some integer
227  *
228  * @see UCharIterator
229  * @stable ICU 2.1
230  */
231 typedef int32_t U_CALLCONV
232 UCharIteratorReserved(UCharIterator *iter, int32_t something);
233 
234 /**
235  * Function type declaration for UCharIterator.getState().
236  *
237  * Get the "state" of the iterator in the form of a single 32-bit word.
238  * It is recommended that the state value be calculated to be as small as
239  * is feasible. For strings with limited lengths, fewer than 32 bits may
240  * be sufficient.
241  *
242  * This is used together with setState()/UCharIteratorSetState
243  * to save and restore the iterator position more efficiently than with
244  * getIndex()/move().
245  *
246  * The iterator state is defined as a uint32_t value because it is designed
247  * for use in ucol_nextSortKeyPart() which provides 32 bits to store the state
248  * of the character iterator.
249  *
250  * With some UCharIterator implementations (e.g., UTF-8),
251  * getting and setting the UTF-16 index with existing functions
252  * (getIndex(UITER_CURRENT) followed by move(pos, UITER_ZERO)) is possible but
253  * relatively slow because the iterator has to "walk" from a known index
254  * to the requested one.
255  * This takes more time the farther it needs to go.
256  *
257  * An opaque state value allows an iterator implementation to provide
258  * an internal index (UTF-8: the source byte array index) for
259  * fast, constant-time restoration.
260  *
261  * After calling setState(), a getIndex(UITER_CURRENT) may be slow because
262  * the UTF-16 index may not be restored as well, but the iterator can deliver
263  * the correct text contents and move relative to the current position
264  * without performance degradation.
265  *
266  * Some UCharIterator implementations may not be able to return
267  * a valid state for each position, in which case they return UITER_NO_STATE instead.
268  * This will be clearly documented for each such iterator (none of the public ones here).
269  *
270  * @param iter the UCharIterator structure ("this pointer")
271  * @return the state word, or UITER_NO_STATE if no valid state is available
272  *
273  * @see UCharIterator
274  * @see UCharIteratorSetState
275  * @see UITER_NO_STATE
276  * @stable ICU 2.6
277  */
278 typedef uint32_t U_CALLCONV
279 UCharIteratorGetState(const UCharIterator *iter);
280 
281 /**
282  * Function type declaration for UCharIterator.setState().
283  *
284  * Restore the "state" of the iterator using a state word from a getState() call.
285  * The iterator object need not be the same one on which getState() was called,
286  * but it must be of the same type (set up using the same uiter_setXYZ function)
287  * and it must iterate over the same string
288  * (binary identical regardless of memory address).
289  * For more about the state word see UCharIteratorGetState.
290  *
291  * After calling setState(), a getIndex(UITER_CURRENT) may be slow because
292  * the UTF-16 index may not be restored as well, but the iterator can deliver
293  * the correct text contents and move relative to the current position
294  * without performance degradation.
295  *
296  * @param iter the UCharIterator structure ("this pointer")
297  * @param state the state word from a getState() call
298  *              on a same-type, same-string iterator
299  * @param pErrorCode Must be a valid pointer to an error code value,
300  *                   which must not indicate a failure before the function call.
301  *
302  * @see UCharIterator
303  * @see UCharIteratorGetState
304  * @stable ICU 2.6
305  */
306 typedef void U_CALLCONV
307 UCharIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode);
308 
309 
310 /**
311  * C API for code unit iteration.
312  * This can be used as a C wrapper around
313  * CharacterIterator, Replaceable, or implemented using simple strings, etc.
314  *
315  * There are two roles for using UCharIterator:
316  *
317  * A "provider" sets the necessary function pointers and controls the "protected"
318  * fields of the UCharIterator structure. A "provider" passes a UCharIterator
319  * into C APIs that need a UCharIterator as an abstract, flexible string interface.
320  *
321  * Implementations of such C APIs are "callers" of UCharIterator functions;
322  * they only use the "public" function pointers and never access the "protected"
323  * fields directly.
324  *
325  * The current() and next() functions only check the current index against the
326  * limit, and previous() only checks the current index against the start,
327  * to see if the iterator already reached the end of the iteration range.
328  *
329  * The assumption - in all iterators - is that the index is moved via the API,
330  * which means it won't go out of bounds, or the index is modified by
331  * user code that knows enough about the iterator implementation to set valid
332  * index values.
333  *
334  * UCharIterator functions return code unit values 0..0xffff,
335  * or U_SENTINEL if the iteration bounds are reached.
336  *
337  * @stable ICU 2.1
338  */
339 struct UCharIterator {
340     /**
341      * (protected) Pointer to string or wrapped object or similar.
342      * Not used by caller.
343      * @stable ICU 2.1
344      */
345     const void *context;
346 
347     /**
348      * (protected) Length of string or similar.
349      * Not used by caller.
350      * @stable ICU 2.1
351      */
352     int32_t length;
353 
354     /**
355      * (protected) Start index or similar.
356      * Not used by caller.
357      * @stable ICU 2.1
358      */
359     int32_t start;
360 
361     /**
362      * (protected) Current index or similar.
363      * Not used by caller.
364      * @stable ICU 2.1
365      */
366     int32_t index;
367 
368     /**
369      * (protected) Limit index or similar.
370      * Not used by caller.
371      * @stable ICU 2.1
372      */
373     int32_t limit;
374 
375     /**
376      * (protected) Used by UTF-8 iterators and possibly others.
377      * @stable ICU 2.1
378      */
379     int32_t reservedField;
380 
381     /**
382      * (public) Returns the current position or the
383      * start or limit index of the iteration range.
384      *
385      * @see UCharIteratorGetIndex
386      * @stable ICU 2.1
387      */
388     UCharIteratorGetIndex *getIndex;
389 
390     /**
391      * (public) Moves the current position relative to the start or limit of the
392      * iteration range, or relative to the current position itself.
393      * The movement is expressed in numbers of code units forward
394      * or backward by specifying a positive or negative delta.
395      *
396      * @see UCharIteratorMove
397      * @stable ICU 2.1
398      */
399     UCharIteratorMove *move;
400 
401     /**
402      * (public) Check if current() and next() can still
403      * return another code unit.
404      *
405      * @see UCharIteratorHasNext
406      * @stable ICU 2.1
407      */
408     UCharIteratorHasNext *hasNext;
409 
410     /**
411      * (public) Check if previous() can still return another code unit.
412      *
413      * @see UCharIteratorHasPrevious
414      * @stable ICU 2.1
415      */
416     UCharIteratorHasPrevious *hasPrevious;
417 
418     /**
419      * (public) Return the code unit at the current position,
420      * or U_SENTINEL if there is none (index is at the limit).
421      *
422      * @see UCharIteratorCurrent
423      * @stable ICU 2.1
424      */
425     UCharIteratorCurrent *current;
426 
427     /**
428      * (public) Return the code unit at the current index and increment
429      * the index (post-increment, like s[i++]),
430      * or return U_SENTINEL if there is none (index is at the limit).
431      *
432      * @see UCharIteratorNext
433      * @stable ICU 2.1
434      */
435     UCharIteratorNext *next;
436 
437     /**
438      * (public) Decrement the index and return the code unit from there
439      * (pre-decrement, like s[--i]),
440      * or return U_SENTINEL if there is none (index is at the start).
441      *
442      * @see UCharIteratorPrevious
443      * @stable ICU 2.1
444      */
445     UCharIteratorPrevious *previous;
446 
447     /**
448      * (public) Reserved for future use. Currently NULL.
449      *
450      * @see UCharIteratorReserved
451      * @stable ICU 2.1
452      */
453     UCharIteratorReserved *reservedFn;
454 
455     /**
456      * (public) Return the state of the iterator, to be restored later with setState().
457      * This function pointer is NULL if the iterator does not implement it.
458      *
459      * @see UCharIteratorGetState
460      * @stable ICU 2.6
461      */
462     UCharIteratorGetState *getState;
463 
464     /**
465      * (public) Restore the iterator state from the state word from a call
466      * to getState().
467      * This function pointer is NULL if the iterator does not implement it.
468      *
469      * @see UCharIteratorSetState
470      * @stable ICU 2.6
471      */
472     UCharIteratorSetState *setState;
473 };
474 
475 /**
476  * Helper function for UCharIterator to get the code point
477  * at the current index.
478  *
479  * Return the code point that includes the code unit at the current position,
480  * or U_SENTINEL if there is none (index is at the limit).
481  * If the current code unit is a lead or trail surrogate,
482  * then the following or preceding surrogate is used to form
483  * the code point value.
484  *
485  * @param iter the UCharIterator structure ("this pointer")
486  * @return the current code point, or U_SENTINEL if the index is at the limit
487  *
488  * @see UCharIterator
489  * @see U16_GET
490  * @see UnicodeString::char32At()
491  * @stable ICU 2.1
492  */
493 U_STABLE UChar32 U_EXPORT2
494 uiter_current32(UCharIterator *iter);
495 
496 /**
497  * Helper function for UCharIterator to get the next code point.
498  *
499  * Return the code point at the current index and increment
500  * the index (post-increment, like s[i++]),
501  * or return U_SENTINEL if there is none (index is at the limit).
502  *
503  * @param iter the UCharIterator structure ("this pointer")
504  * @return the current code point (and post-increment the current index), or U_SENTINEL if the index is at the limit
505  *
506  * @see UCharIterator
507  * @see U16_NEXT
508  * @stable ICU 2.1
509  */
510 U_STABLE UChar32 U_EXPORT2
511 uiter_next32(UCharIterator *iter);
512 
513 /**
514  * Helper function for UCharIterator to get the previous code point.
515  *
516  * Decrement the index and return the code point from there
517  * (pre-decrement, like s[--i]),
518  * or return U_SENTINEL if there is none (index is at the start).
519  *
520  * @param iter the UCharIterator structure ("this pointer")
521  * @return the previous code point (after pre-decrementing the current index), or U_SENTINEL if the index is at the start
522  *
523  * @see UCharIterator
524  * @see U16_PREV
525  * @stable ICU 2.1
526  */
527 U_STABLE UChar32 U_EXPORT2
528 uiter_previous32(UCharIterator *iter);
529 
530 /**
531  * Get the "state" of the iterator in the form of a single 32-bit word.
532  * This is a convenience function that calls iter->getState(iter)
533  * if iter->getState is not NULL;
534  * if it is NULL or any other error occurs, then UITER_NO_STATE is returned.
535  *
536  * Some UCharIterator implementations may not be able to return
537  * a valid state for each position, in which case they return UITER_NO_STATE instead.
538  * This will be clearly documented for each such iterator (none of the public ones here).
539  *
540  * @param iter the UCharIterator structure ("this pointer")
541  * @return the state word, or UITER_NO_STATE if iter->getState is NULL or an error occurs
542  *
543  * @see UCharIterator
544  * @see UCharIteratorGetState
545  * @see UITER_NO_STATE
546  * @stable ICU 2.6
547  */
548 U_STABLE uint32_t U_EXPORT2
549 uiter_getState(const UCharIterator *iter);
550 
551 /**
552  * Restore the "state" of the iterator using a state word from a getState() call.
553  * This is a convenience function that calls iter->setState(iter, state, pErrorCode)
554  * if iter->setState is not NULL; if it is NULL, then *pErrorCode is set to U_UNSUPPORTED_ERROR.
555  *
556  * @param iter the UCharIterator structure ("this pointer")
557  * @param state the state word from a getState() call
558  *              on a same-type, same-string iterator
559  * @param pErrorCode Must be a valid pointer to an error code value,
560  *                   which must not indicate a failure before the function call.
561  *
562  * @see UCharIterator
563  * @see UCharIteratorSetState
564  * @stable ICU 2.6
565  */
566 U_STABLE void U_EXPORT2
567 uiter_setState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode);
568 
569 /**
570  * Set up a UCharIterator to iterate over a string.
571  *
572  * Sets the UCharIterator function pointers for iteration over the string s
573  * with iteration boundaries start=index=0 and length=limit=string length.
574  * The "provider" may set the start, index, and limit values at any time
575  * within the range 0..length.
576  * The length field will be ignored.
577  *
578  * The string pointer s is set into UCharIterator.context without copying
579  * or reallocating the string contents.
580  *
581  * getState() simply returns the current index.
582  * move() will always return the final index.
583  *
584  * @param iter UCharIterator structure to be set for iteration
585  * @param s String to iterate over (referenced, not copied)
586  * @param length Length of s in UChars, or -1 if NUL-terminated
587  *
588  * @see UCharIterator
589  * @stable ICU 2.1
590  */
591 U_STABLE void U_EXPORT2
592 uiter_setString(UCharIterator *iter, const UChar *s, int32_t length);
593 
594 /**
595  * Set up a UCharIterator to iterate over a UTF-16BE string
596  * (byte vector with a big-endian pair of bytes per UChar).
597  *
598  * Everything works just like with a normal UChar iterator (uiter_setString),
599  * except that UChars are assembled from byte pairs,
600  * and that the length argument here indicates an even number of bytes.
601  *
602  * getState() simply returns the current index.
603  * move() will always return the final index.
604  *
605  * @param iter UCharIterator structure to be set for iteration
606  * @param s UTF-16BE string (byte vector) to iterate over
607  * @param length Length of s as an even number of bytes, or -1 if NUL-terminated
608  *               (NUL means pair of 0 bytes at even index from s)
609  *
610  * @see UCharIterator
611  * @see uiter_setString
612  * @stable ICU 2.6
613  */
614 U_STABLE void U_EXPORT2
615 uiter_setUTF16BE(UCharIterator *iter, const char *s, int32_t length);
616 
617 /**
618  * Set up a UCharIterator to iterate over a UTF-8 string.
619  *
620  * Sets the UCharIterator function pointers for iteration over the UTF-8 string s
621  * with UTF-8 iteration boundaries 0 and length.
622  * The implementation counts the UTF-16 index on the fly and
623  * lazily evaluates the UTF-16 length of the text.
624  *
625  * The start field is used as the UTF-8 offset, the limit field as the UTF-8 length.
626  * When the reservedField is not 0, then it contains a supplementary code point
627  * and the UTF-16 index is between the two corresponding surrogates.
628  * At that point, the UTF-8 index is behind that code point.
629  *
630  * The UTF-8 string pointer s is set into UCharIterator.context without copying
631  * or reallocating the string contents.
632  *
633  * getState() returns a state value consisting of
634  * - the current UTF-8 source byte index (bits 31..1)
635  * - a flag (bit 0) that indicates whether the UChar position is in the middle
636  *   of a surrogate pair
637  *   (from a 4-byte UTF-8 sequence for the corresponding supplementary code point)
638  *
639  * getState() cannot also encode the UTF-16 index in the state value.
640  * move(relative to limit or length), or
641  * move(relative to current) after setState(), may return UITER_UNKNOWN_INDEX.
642  *
643  * @param iter UCharIterator structure to be set for iteration
644  * @param s UTF-8 string to iterate over (referenced, not copied)
645  * @param length Length of s in bytes, or -1 if NUL-terminated
646  *
647  * @see UCharIterator
648  * @stable ICU 2.6
649  */
650 U_STABLE void U_EXPORT2
651 uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length);
652 
653 #if U_SHOW_CPLUSPLUS_API
654 
655 /**
656  * Set up a UCharIterator to wrap around a C++ CharacterIterator.
657  *
658  * Sets the UCharIterator function pointers for iteration using the
659  * CharacterIterator charIter.
660  *
661  * The CharacterIterator pointer charIter is set into UCharIterator.context
662  * without copying or cloning the CharacterIterator object.
663  * The other "protected" UCharIterator fields are set to 0 and will be ignored.
664  * The iteration index and boundaries are controlled by the CharacterIterator.
665  *
666  * getState() simply returns the current index.
667  * move() will always return the final index.
668  *
669  * @param iter UCharIterator structure to be set for iteration
670  * @param charIter CharacterIterator to wrap (referenced, not copied or cloned)
671  *
672  * @see UCharIterator
673  * @stable ICU 2.1
674  */
675 U_STABLE void U_EXPORT2
676 uiter_setCharacterIterator(UCharIterator *iter, icu::CharacterIterator *charIter);
677 
678 /**
679  * Set up a UCharIterator to iterate over a C++ Replaceable.
680  *
681  * Sets the UCharIterator function pointers for iteration over the
682  * Replaceable rep with iteration boundaries start=index=0 and
683  * length=limit=rep->length().
684  * The "provider" may set the start, index, and limit values at any time
685  * within the range 0..length=rep->length().
686  * The length field will be ignored.
687  *
688  * The Replaceable pointer rep is set into UCharIterator.context without copying
689  * or cloning/reallocating the Replaceable object.
690  *
691  * getState() simply returns the current index.
692  * move() will always return the final index.
693  *
694  * @param iter UCharIterator structure to be set for iteration
695  * @param rep Replaceable to iterate over (referenced, not copied or cloned)
696  *
697  * @see UCharIterator
698  * @stable ICU 2.1
699  */
700 U_STABLE void U_EXPORT2
701 uiter_setReplaceable(UCharIterator *iter, const icu::Replaceable *rep);
702 
703 #endif
704 
705 U_CDECL_END
706 
707 #endif
708