1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ********************************************************************
5 *
6 *   Copyright (C) 1997-2011, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 ********************************************************************
10 */
11 
12 #ifndef CHARITER_H
13 #define CHARITER_H
14 
15 #include "unicode/utypes.h"
16 #include "unicode/uobject.h"
17 #include "unicode/unistr.h"
18 /**
19  * \file
20  * \brief C++ API: Character Iterator
21  */
22 
23 U_NAMESPACE_BEGIN
24 /**
25  * Abstract class that defines an API for forward-only iteration
26  * on text objects.
27  * This is a minimal interface for iteration without random access
28  * or backwards iteration. It is especially useful for wrapping
29  * streams with converters into an object for collation or
30  * normalization.
31  *
32  * <p>Characters can be accessed in two ways: as code units or as
33  * code points.
34  * Unicode code points are 21-bit integers and are the scalar values
35  * of Unicode characters. ICU uses the type UChar32 for them.
36  * Unicode code units are the storage units of a given
37  * Unicode/UCS Transformation Format (a character encoding scheme).
38  * With UTF-16, all code points can be represented with either one
39  * or two code units ("surrogates").
40  * String storage is typically based on code units, while properties
41  * of characters are typically determined using code point values.
42  * Some processes may be designed to work with sequences of code units,
43  * or it may be known that all characters that are important to an
44  * algorithm can be represented with single code units.
45  * Other processes will need to use the code point access functions.</p>
46  *
47  * <p>ForwardCharacterIterator provides nextPostInc() to access
48  * a code unit and advance an internal position into the text object,
49  * similar to a <code>return text[position++]</code>.<br>
50  * It provides next32PostInc() to access a code point and advance an internal
51  * position.</p>
52  *
53  * <p>next32PostInc() assumes that the current position is that of
54  * the beginning of a code point, i.e., of its first code unit.
55  * After next32PostInc(), this will be true again.
56  * In general, access to code units and code points in the same
57  * iteration loop should not be mixed. In UTF-16, if the current position
58  * is on a second code unit (Low Surrogate), then only that code unit
59  * is returned even by next32PostInc().</p>
60  *
61  * <p>For iteration with either function, there are two ways to
62  * check for the end of the iteration. When there are no more
63  * characters in the text object:
64  * <ul>
65  * <li>The hasNext() function returns FALSE.</li>
66  * <li>nextPostInc() and next32PostInc() return DONE
67  *     when one attempts to read beyond the end of the text object.</li>
68  * </ul>
69  *
70  * Example:
71  * \code
72  * void function1(ForwardCharacterIterator &it) {
73  *     UChar32 c;
74  *     while(it.hasNext()) {
75  *         c=it.next32PostInc();
76  *         // use c
77  *     }
78  * }
79  *
80  * void function1(ForwardCharacterIterator &it) {
81  *     char16_t c;
82  *     while((c=it.nextPostInc())!=ForwardCharacterIterator::DONE) {
83  *         // use c
84  *      }
85  *  }
86  * \endcode
87  * </p>
88  *
89  * @stable ICU 2.0
90  */
91 class U_COMMON_API ForwardCharacterIterator : public UObject {
92 public:
93     /**
94      * Value returned by most of ForwardCharacterIterator's functions
95      * when the iterator has reached the limits of its iteration.
96      * @stable ICU 2.0
97      */
98     enum { DONE = 0xffff };
99 
100     /**
101      * Destructor.
102      * @stable ICU 2.0
103      */
104     virtual ~ForwardCharacterIterator();
105 
106     /**
107      * Returns true when both iterators refer to the same
108      * character in the same character-storage object.
109      * @param that The ForwardCharacterIterator to be compared for equality
110      * @return true when both iterators refer to the same
111      * character in the same character-storage object
112      * @stable ICU 2.0
113      */
114     virtual UBool operator==(const ForwardCharacterIterator& that) const = 0;
115 
116     /**
117      * Returns true when the iterators refer to different
118      * text-storage objects, or to different characters in the
119      * same text-storage object.
120      * @param that The ForwardCharacterIterator to be compared for inequality
121      * @return true when the iterators refer to different
122      * text-storage objects, or to different characters in the
123      * same text-storage object
124      * @stable ICU 2.0
125      */
126     inline UBool operator!=(const ForwardCharacterIterator& that) const;
127 
128     /**
129      * Generates a hash code for this iterator.
130      * @return the hash code.
131      * @stable ICU 2.0
132      */
133     virtual int32_t hashCode(void) const = 0;
134 
135     /**
136      * Returns a UClassID for this ForwardCharacterIterator ("poor man's
137      * RTTI").<P> Despite the fact that this function is public,
138      * DO NOT CONSIDER IT PART OF CHARACTERITERATOR'S API!
139      * @return a UClassID for this ForwardCharacterIterator
140      * @stable ICU 2.0
141      */
142     virtual UClassID getDynamicClassID(void) const = 0;
143 
144     /**
145      * Gets the current code unit for returning and advances to the next code unit
146      * in the iteration range
147      * (toward endIndex()).  If there are
148      * no more code units to return, returns DONE.
149      * @return the current code unit.
150      * @stable ICU 2.0
151      */
152     virtual char16_t         nextPostInc(void) = 0;
153 
154     /**
155      * Gets the current code point for returning and advances to the next code point
156      * in the iteration range
157      * (toward endIndex()).  If there are
158      * no more code points to return, returns DONE.
159      * @return the current code point.
160      * @stable ICU 2.0
161      */
162     virtual UChar32       next32PostInc(void) = 0;
163 
164     /**
165      * Returns FALSE if there are no more code units or code points
166      * at or after the current position in the iteration range.
167      * This is used with nextPostInc() or next32PostInc() in forward
168      * iteration.
169      * @returns FALSE if there are no more code units or code points
170      * at or after the current position in the iteration range.
171      * @stable ICU 2.0
172      */
173     virtual UBool        hasNext() = 0;
174 
175 protected:
176     /** Default constructor to be overridden in the implementing class. @stable ICU 2.0*/
177     ForwardCharacterIterator();
178 
179     /** Copy constructor to be overridden in the implementing class. @stable ICU 2.0*/
180     ForwardCharacterIterator(const ForwardCharacterIterator &other);
181 
182     /**
183      * Assignment operator to be overridden in the implementing class.
184      * @stable ICU 2.0
185      */
186     ForwardCharacterIterator &operator=(const ForwardCharacterIterator&) { return *this; }
187 };
188 
189 /**
190  * Abstract class that defines an API for iteration
191  * on text objects.
192  * This is an interface for forward and backward iteration
193  * and random access into a text object.
194  *
195  * <p>The API provides backward compatibility to the Java and older ICU
196  * CharacterIterator classes but extends them significantly:
197  * <ol>
198  * <li>CharacterIterator is now a subclass of ForwardCharacterIterator.</li>
199  * <li>While the old API functions provided forward iteration with
200  *     "pre-increment" semantics, the new one also provides functions
201  *     with "post-increment" semantics. They are more efficient and should
202  *     be the preferred iterator functions for new implementations.
203  *     The backward iteration always had "pre-decrement" semantics, which
204  *     are efficient.</li>
205  * <li>Just like ForwardCharacterIterator, it provides access to
206  *     both code units and code points. Code point access versions are available
207  *     for the old and the new iteration semantics.</li>
208  * <li>There are new functions for setting and moving the current position
209  *     without returning a character, for efficiency.</li>
210  * </ol>
211  *
212  * See ForwardCharacterIterator for examples for using the new forward iteration
213  * functions. For backward iteration, there is also a hasPrevious() function
214  * that can be used analogously to hasNext().
215  * The old functions work as before and are shown below.</p>
216  *
217  * <p>Examples for some of the new functions:</p>
218  *
219  * Forward iteration with hasNext():
220  * \code
221  * void forward1(CharacterIterator &it) {
222  *     UChar32 c;
223  *     for(it.setToStart(); it.hasNext();) {
224  *         c=it.next32PostInc();
225  *         // use c
226  *     }
227  *  }
228  * \endcode
229  * Forward iteration more similar to loops with the old forward iteration,
230  * showing a way to convert simple for() loops:
231  * \code
232  * void forward2(CharacterIterator &it) {
233  *     char16_t c;
234  *     for(c=it.firstPostInc(); c!=CharacterIterator::DONE; c=it.nextPostInc()) {
235  *          // use c
236  *      }
237  * }
238  * \endcode
239  * Backward iteration with setToEnd() and hasPrevious():
240  * \code
241  *  void backward1(CharacterIterator &it) {
242  *      UChar32 c;
243  *      for(it.setToEnd(); it.hasPrevious();) {
244  *         c=it.previous32();
245  *          // use c
246  *      }
247  *  }
248  * \endcode
249  * Backward iteration with a more traditional for() loop:
250  * \code
251  * void backward2(CharacterIterator &it) {
252  *     char16_t c;
253  *     for(c=it.last(); c!=CharacterIterator::DONE; c=it.previous()) {
254  *         // use c
255  *      }
256  *  }
257  * \endcode
258  *
259  * Example for random access:
260  * \code
261  *  void random(CharacterIterator &it) {
262  *      // set to the third code point from the beginning
263  *      it.move32(3, CharacterIterator::kStart);
264  *      // get a code point from here without moving the position
265  *      UChar32 c=it.current32();
266  *      // get the position
267  *      int32_t pos=it.getIndex();
268  *      // get the previous code unit
269  *      char16_t u=it.previous();
270  *      // move back one more code unit
271  *      it.move(-1, CharacterIterator::kCurrent);
272  *      // set the position back to where it was
273  *      // and read the same code point c and move beyond it
274  *      it.setIndex(pos);
275  *      if(c!=it.next32PostInc()) {
276  *          exit(1); // CharacterIterator inconsistent
277  *      }
278  *  }
279  * \endcode
280  *
281  * <p>Examples, especially for the old API:</p>
282  *
283  * Function processing characters, in this example simple output
284  * <pre>
285  * \code
286  *  void processChar( char16_t c )
287  *  {
288  *      cout << " " << c;
289  *  }
290  * \endcode
291  * </pre>
292  * Traverse the text from start to finish
293  * <pre>
294  * \code
295  *  void traverseForward(CharacterIterator& iter)
296  *  {
297  *      for(char16_t c = iter.first(); c != CharacterIterator.DONE; c = iter.next()) {
298  *          processChar(c);
299  *      }
300  *  }
301  * \endcode
302  * </pre>
303  * Traverse the text backwards, from end to start
304  * <pre>
305  * \code
306  *  void traverseBackward(CharacterIterator& iter)
307  *  {
308  *      for(char16_t c = iter.last(); c != CharacterIterator.DONE; c = iter.previous()) {
309  *          processChar(c);
310  *      }
311  *  }
312  * \endcode
313  * </pre>
314  * Traverse both forward and backward from a given position in the text.
315  * Calls to notBoundary() in this example represents some additional stopping criteria.
316  * <pre>
317  * \code
318  * void traverseOut(CharacterIterator& iter, int32_t pos)
319  * {
320  *      char16_t c;
321  *      for (c = iter.setIndex(pos);
322  *      c != CharacterIterator.DONE && (Unicode::isLetter(c) || Unicode::isDigit(c));
323  *          c = iter.next()) {}
324  *      int32_t end = iter.getIndex();
325  *      for (c = iter.setIndex(pos);
326  *          c != CharacterIterator.DONE && (Unicode::isLetter(c) || Unicode::isDigit(c));
327  *          c = iter.previous()) {}
328  *      int32_t start = iter.getIndex() + 1;
329  *
330  *      cout << "start: " << start << " end: " << end << endl;
331  *      for (c = iter.setIndex(start); iter.getIndex() < end; c = iter.next() ) {
332  *          processChar(c);
333  *     }
334  *  }
335  * \endcode
336  * </pre>
337  * Creating a StringCharacterIterator and calling the test functions
338  * <pre>
339  * \code
340  *  void CharacterIterator_Example( void )
341  *   {
342  *       cout << endl << "===== CharacterIterator_Example: =====" << endl;
343  *       UnicodeString text("Ein kleiner Satz.");
344  *       StringCharacterIterator iterator(text);
345  *       cout << "----- traverseForward: -----------" << endl;
346  *       traverseForward( iterator );
347  *       cout << endl << endl << "----- traverseBackward: ----------" << endl;
348  *       traverseBackward( iterator );
349  *       cout << endl << endl << "----- traverseOut: ---------------" << endl;
350  *       traverseOut( iterator, 7 );
351  *       cout << endl << endl << "-----" << endl;
352  *   }
353  * \endcode
354  * </pre>
355  *
356  * @stable ICU 2.0
357  */
358 class U_COMMON_API CharacterIterator : public ForwardCharacterIterator {
359 public:
360     /**
361      * Origin enumeration for the move() and move32() functions.
362      * @stable ICU 2.0
363      */
364     enum EOrigin { kStart, kCurrent, kEnd };
365 
366     /**
367      * Destructor.
368      * @stable ICU 2.0
369      */
370     virtual ~CharacterIterator();
371 
372     /**
373      * Returns a pointer to a new CharacterIterator of the same
374      * concrete class as this one, and referring to the same
375      * character in the same text-storage object as this one.  The
376      * caller is responsible for deleting the new clone.
377      * @return a pointer to a new CharacterIterator
378      * @stable ICU 2.0
379      */
380     virtual CharacterIterator* clone(void) const = 0;
381 
382     /**
383      * Sets the iterator to refer to the first code unit in its
384      * iteration range, and returns that code unit.
385      * This can be used to begin an iteration with next().
386      * @return the first code unit in its iteration range.
387      * @stable ICU 2.0
388      */
389     virtual char16_t         first(void) = 0;
390 
391     /**
392      * Sets the iterator to refer to the first code unit in its
393      * iteration range, returns that code unit, and moves the position
394      * to the second code unit. This is an alternative to setToStart()
395      * for forward iteration with nextPostInc().
396      * @return the first code unit in its iteration range.
397      * @stable ICU 2.0
398      */
399     virtual char16_t         firstPostInc(void);
400 
401     /**
402      * Sets the iterator to refer to the first code point in its
403      * iteration range, and returns that code unit,
404      * This can be used to begin an iteration with next32().
405      * Note that an iteration with next32PostInc(), beginning with,
406      * e.g., setToStart() or firstPostInc(), is more efficient.
407      * @return the first code point in its iteration range.
408      * @stable ICU 2.0
409      */
410     virtual UChar32       first32(void) = 0;
411 
412     /**
413      * Sets the iterator to refer to the first code point in its
414      * iteration range, returns that code point, and moves the position
415      * to the second code point. This is an alternative to setToStart()
416      * for forward iteration with next32PostInc().
417      * @return the first code point in its iteration range.
418      * @stable ICU 2.0
419      */
420     virtual UChar32       first32PostInc(void);
421 
422     /**
423      * Sets the iterator to refer to the first code unit or code point in its
424      * iteration range. This can be used to begin a forward
425      * iteration with nextPostInc() or next32PostInc().
426      * @return the start position of the iteration range
427      * @stable ICU 2.0
428      */
429     inline int32_t    setToStart();
430 
431     /**
432      * Sets the iterator to refer to the last code unit in its
433      * iteration range, and returns that code unit.
434      * This can be used to begin an iteration with previous().
435      * @return the last code unit.
436      * @stable ICU 2.0
437      */
438     virtual char16_t         last(void) = 0;
439 
440     /**
441      * Sets the iterator to refer to the last code point in its
442      * iteration range, and returns that code unit.
443      * This can be used to begin an iteration with previous32().
444      * @return the last code point.
445      * @stable ICU 2.0
446      */
447     virtual UChar32       last32(void) = 0;
448 
449     /**
450      * Sets the iterator to the end of its iteration range, just behind
451      * the last code unit or code point. This can be used to begin a backward
452      * iteration with previous() or previous32().
453      * @return the end position of the iteration range
454      * @stable ICU 2.0
455      */
456     inline int32_t    setToEnd();
457 
458     /**
459      * Sets the iterator to refer to the "position"-th code unit
460      * in the text-storage object the iterator refers to, and
461      * returns that code unit.
462      * @param position the "position"-th code unit in the text-storage object
463      * @return the "position"-th code unit.
464      * @stable ICU 2.0
465      */
466     virtual char16_t         setIndex(int32_t position) = 0;
467 
468     /**
469      * Sets the iterator to refer to the beginning of the code point
470      * that contains the "position"-th code unit
471      * in the text-storage object the iterator refers to, and
472      * returns that code point.
473      * The current position is adjusted to the beginning of the code point
474      * (its first code unit).
475      * @param position the "position"-th code unit in the text-storage object
476      * @return the "position"-th code point.
477      * @stable ICU 2.0
478      */
479     virtual UChar32       setIndex32(int32_t position) = 0;
480 
481     /**
482      * Returns the code unit the iterator currently refers to.
483      * @return the current code unit.
484      * @stable ICU 2.0
485      */
486     virtual char16_t         current(void) const = 0;
487 
488     /**
489      * Returns the code point the iterator currently refers to.
490      * @return the current code point.
491      * @stable ICU 2.0
492      */
493     virtual UChar32       current32(void) const = 0;
494 
495     /**
496      * Advances to the next code unit in the iteration range
497      * (toward endIndex()), and returns that code unit.  If there are
498      * no more code units to return, returns DONE.
499      * @return the next code unit.
500      * @stable ICU 2.0
501      */
502     virtual char16_t         next(void) = 0;
503 
504     /**
505      * Advances to the next code point in the iteration range
506      * (toward endIndex()), and returns that code point.  If there are
507      * no more code points to return, returns DONE.
508      * Note that iteration with "pre-increment" semantics is less
509      * efficient than iteration with "post-increment" semantics
510      * that is provided by next32PostInc().
511      * @return the next code point.
512      * @stable ICU 2.0
513      */
514     virtual UChar32       next32(void) = 0;
515 
516     /**
517      * Advances to the previous code unit in the iteration range
518      * (toward startIndex()), and returns that code unit.  If there are
519      * no more code units to return, returns DONE.
520      * @return the previous code unit.
521      * @stable ICU 2.0
522      */
523     virtual char16_t         previous(void) = 0;
524 
525     /**
526      * Advances to the previous code point in the iteration range
527      * (toward startIndex()), and returns that code point.  If there are
528      * no more code points to return, returns DONE.
529      * @return the previous code point.
530      * @stable ICU 2.0
531      */
532     virtual UChar32       previous32(void) = 0;
533 
534     /**
535      * Returns FALSE if there are no more code units or code points
536      * before the current position in the iteration range.
537      * This is used with previous() or previous32() in backward
538      * iteration.
539      * @return FALSE if there are no more code units or code points
540      * before the current position in the iteration range, return TRUE otherwise.
541      * @stable ICU 2.0
542      */
543     virtual UBool        hasPrevious() = 0;
544 
545     /**
546      * Returns the numeric index in the underlying text-storage
547      * object of the character returned by first().  Since it's
548      * possible to create an iterator that iterates across only
549      * part of a text-storage object, this number isn't
550      * necessarily 0.
551      * @returns the numeric index in the underlying text-storage
552      * object of the character returned by first().
553      * @stable ICU 2.0
554      */
555     inline int32_t       startIndex(void) const;
556 
557     /**
558      * Returns the numeric index in the underlying text-storage
559      * object of the position immediately BEYOND the character
560      * returned by last().
561      * @return the numeric index in the underlying text-storage
562      * object of the position immediately BEYOND the character
563      * returned by last().
564      * @stable ICU 2.0
565      */
566     inline int32_t       endIndex(void) const;
567 
568     /**
569      * Returns the numeric index in the underlying text-storage
570      * object of the character the iterator currently refers to
571      * (i.e., the character returned by current()).
572      * @return the numberic index in the text-storage object of
573      * the character the iterator currently refers to
574      * @stable ICU 2.0
575      */
576     inline int32_t       getIndex(void) const;
577 
578     /**
579      * Returns the length of the entire text in the underlying
580      * text-storage object.
581      * @return the length of the entire text in the text-storage object
582      * @stable ICU 2.0
583      */
584     inline int32_t           getLength() const;
585 
586     /**
587      * Moves the current position relative to the start or end of the
588      * iteration range, or relative to the current position itself.
589      * The movement is expressed in numbers of code units forward
590      * or backward by specifying a positive or negative delta.
591      * @param delta the position relative to origin. A positive delta means forward;
592      * a negative delta means backward.
593      * @param origin Origin enumeration {kStart, kCurrent, kEnd}
594      * @return the new position
595      * @stable ICU 2.0
596      */
597     virtual int32_t      move(int32_t delta, EOrigin origin) = 0;
598 
599     /**
600      * Moves the current position relative to the start or end of the
601      * iteration range, or relative to the current position itself.
602      * The movement is expressed in numbers of code points forward
603      * or backward by specifying a positive or negative delta.
604      * @param delta the position relative to origin. A positive delta means forward;
605      * a negative delta means backward.
606      * @param origin Origin enumeration {kStart, kCurrent, kEnd}
607      * @return the new position
608      * @stable ICU 2.0
609      */
610 #ifdef move32
611      // One of the system headers right now is sometimes defining a conflicting macro we don't use
612 #undef move32
613 #endif
614     virtual int32_t      move32(int32_t delta, EOrigin origin) = 0;
615 
616     /**
617      * Copies the text under iteration into the UnicodeString
618      * referred to by "result".
619      * @param result Receives a copy of the text under iteration.
620      * @stable ICU 2.0
621      */
622     virtual void            getText(UnicodeString&  result) = 0;
623 
624 protected:
625     /**
626      * Empty constructor.
627      * @stable ICU 2.0
628      */
629     CharacterIterator();
630 
631     /**
632      * Constructor, just setting the length field in this base class.
633      * @stable ICU 2.0
634      */
635     CharacterIterator(int32_t length);
636 
637     /**
638      * Constructor, just setting the length and position fields in this base class.
639      * @stable ICU 2.0
640      */
641     CharacterIterator(int32_t length, int32_t position);
642 
643     /**
644      * Constructor, just setting the length, start, end, and position fields in this base class.
645      * @stable ICU 2.0
646      */
647     CharacterIterator(int32_t length, int32_t textBegin, int32_t textEnd, int32_t position);
648 
649     /**
650      * Copy constructor.
651      *
652      * @param that The CharacterIterator to be copied
653      * @stable ICU 2.0
654      */
655     CharacterIterator(const CharacterIterator &that);
656 
657     /**
658      * Assignment operator.  Sets this CharacterIterator to have the same behavior,
659      * as the one passed in.
660      * @param that The CharacterIterator passed in.
661      * @return the newly set CharacterIterator.
662      * @stable ICU 2.0
663      */
664     CharacterIterator &operator=(const CharacterIterator &that);
665 
666     /**
667      * Base class text length field.
668      * Necessary this for correct getText() and hashCode().
669      * @stable ICU 2.0
670      */
671     int32_t textLength;
672 
673     /**
674      * Base class field for the current position.
675      * @stable ICU 2.0
676      */
677     int32_t  pos;
678 
679     /**
680      * Base class field for the start of the iteration range.
681      * @stable ICU 2.0
682      */
683     int32_t  begin;
684 
685     /**
686      * Base class field for the end of the iteration range.
687      * @stable ICU 2.0
688      */
689     int32_t  end;
690 };
691 
692 inline UBool
693 ForwardCharacterIterator::operator!=(const ForwardCharacterIterator& that) const {
694     return !operator==(that);
695 }
696 
697 inline int32_t
setToStart()698 CharacterIterator::setToStart() {
699     return move(0, kStart);
700 }
701 
702 inline int32_t
setToEnd()703 CharacterIterator::setToEnd() {
704     return move(0, kEnd);
705 }
706 
707 inline int32_t
startIndex(void)708 CharacterIterator::startIndex(void) const {
709     return begin;
710 }
711 
712 inline int32_t
endIndex(void)713 CharacterIterator::endIndex(void) const {
714     return end;
715 }
716 
717 inline int32_t
getIndex(void)718 CharacterIterator::getIndex(void) const {
719     return pos;
720 }
721 
722 inline int32_t
getLength(void)723 CharacterIterator::getLength(void) const {
724     return textLength;
725 }
726 
727 U_NAMESPACE_END
728 #endif
729