// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (c) 2003-2011, International Business Machines
* Corporation and others.  All Rights Reserved.
**********************************************************************
* Author: Alan Liu
* Created: September 24 2003
* Since: ICU 2.8
**********************************************************************
*/
13 #ifndef _RULEITER_H_
14 #define _RULEITER_H_
15 
16 #include "unicode/uobject.h"
17 
18 U_NAMESPACE_BEGIN
19 
// Forward declarations only: this header uses these types solely through
// references and pointers, so their full definitions are not needed here.
class UnicodeString;
class ParsePosition;
class SymbolTable;
23 
24 /**
25  * An iterator that returns 32-bit code points.  This class is deliberately
26  * <em>not</em> related to any of the ICU character iterator classes
27  * in order to minimize complexity.
28  * @author Alan Liu
29  * @since ICU 2.8
30  */
31 class RuleCharacterIterator : public UMemory {
32 
33     // TODO: Ideas for later.  (Do not implement if not needed, lest the
34     // code coverage numbers go down due to unused methods.)
35     // 1. Add a copy constructor, operator==() method.
36     // 2. Rather than return DONE, throw an exception if the end
37     // is reached -- this is an alternate usage model, probably not useful.
38 
39 private:
40     /**
41      * Text being iterated.
42      */
43     const UnicodeString& text;
44 
45     /**
46      * Position of iterator.
47      */
48     ParsePosition& pos;
49 
50     /**
51      * Symbol table used to parse and dereference variables.  May be 0.
52      */
53     const SymbolTable* sym;
54 
55     /**
56      * Current variable expansion, or 0 if none.
57      */
58     const UnicodeString* buf;
59 
60     /**
61      * Position within buf.  Meaningless if buf == 0.
62      */
63     int32_t bufPos;
64 
65 public:
66     /**
67      * Value returned when there are no more characters to iterate.
68      */
69     enum { DONE = -1 };
70 
71     /**
72      * Bitmask option to enable parsing of variable names.  If (options &
73      * PARSE_VARIABLES) != 0, then an embedded variable will be expanded to
74      * its value.  Variables are parsed using the SymbolTable API.
75      */
76     enum { PARSE_VARIABLES = 1 };
77 
78     /**
79      * Bitmask option to enable parsing of escape sequences.  If (options &
80      * PARSE_ESCAPES) != 0, then an embedded escape sequence will be expanded
81      * to its value.  Escapes are parsed using Utility.unescapeAt().
82      */
83     enum { PARSE_ESCAPES   = 2 };
84 
85     /**
86      * Bitmask option to enable skipping of whitespace.  If (options &
87      * SKIP_WHITESPACE) != 0, then Pattern_White_Space characters will be silently
88      * skipped, as if they were not present in the input.
89      */
90     enum { SKIP_WHITESPACE = 4 };
91 
92     /**
93      * Constructs an iterator over the given text, starting at the given
94      * position.
95      * @param text the text to be iterated
96      * @param sym the symbol table, or null if there is none.  If sym is null,
97      * then variables will not be deferenced, even if the PARSE_VARIABLES
98      * option is set.
99      * @param pos upon input, the index of the next character to return.  If a
100      * variable has been dereferenced, then pos will <em>not</em> increment as
101      * characters of the variable value are iterated.
102      */
103     RuleCharacterIterator(const UnicodeString& text, const SymbolTable* sym,
104                           ParsePosition& pos);
105 
106     /**
107      * Returns true if this iterator has no more characters to return.
108      */
109     UBool atEnd() const;
110 
111     /**
112      * Returns the next character using the given options, or DONE if there
113      * are no more characters, and advance the position to the next
114      * character.
115      * @param options one or more of the following options, bitwise-OR-ed
116      * together: PARSE_VARIABLES, PARSE_ESCAPES, SKIP_WHITESPACE.
117      * @param isEscaped output parameter set to TRUE if the character
118      * was escaped
119      * @param ec input-output error code.  An error will only be set by
120      * this routing if options includes PARSE_VARIABLES and an unknown
121      * variable name is seen, or if options includes PARSE_ESCAPES and
122      * an invalid escape sequence is seen.
123      * @return the current 32-bit code point, or DONE
124      */
125     UChar32 next(int32_t options, UBool& isEscaped, UErrorCode& ec);
126 
127     /**
128      * Returns true if this iterator is currently within a variable expansion.
129      */
130     inline UBool inVariable() const;
131 
132     /**
133      * An opaque object representing the position of a RuleCharacterIterator.
134      */
135     struct Pos : public UMemory {
136     private:
137         const UnicodeString* buf;
138         int32_t pos;
139         int32_t bufPos;
140         friend class RuleCharacterIterator;
141     };
142 
143     /**
144      * Sets an object which, when later passed to setPos(), will
145      * restore this iterator's position.  Usage idiom:
146      *
147      * RuleCharacterIterator iterator = ...;
148      * RuleCharacterIterator::Pos pos;
149      * iterator.getPos(pos);
150      * for (;;) {
151      *   iterator.getPos(pos);
152      *   int c = iterator.next(...);
153      *   ...
154      * }
155      * iterator.setPos(pos);
156      *
157      * @param p a position object to be set to this iterator's
158      * current position.
159      */
160     void getPos(Pos& p) const;
161 
162     /**
163      * Restores this iterator to the position it had when getPos()
164      * set the given object.
165      * @param p a position object previously set by getPos()
166      */
167     void setPos(const Pos& p);
168 
169     /**
170      * Skips ahead past any ignored characters, as indicated by the given
171      * options.  This is useful in conjunction with the lookahead() method.
172      *
173      * Currently, this only has an effect for SKIP_WHITESPACE.
174      * @param options one or more of the following options, bitwise-OR-ed
175      * together: PARSE_VARIABLES, PARSE_ESCAPES, SKIP_WHITESPACE.
176      */
177     void skipIgnored(int32_t options);
178 
179     /**
180      * Returns a string containing the remainder of the characters to be
181      * returned by this iterator, without any option processing.  If the
182      * iterator is currently within a variable expansion, this will only
183      * extend to the end of the variable expansion.  This method is provided
184      * so that iterators may interoperate with string-based APIs.  The typical
185      * sequence of calls is to call skipIgnored(), then call lookahead(), then
186      * parse the string returned by lookahead(), then call jumpahead() to
187      * resynchronize the iterator.
188      * @param result a string to receive the characters to be returned
189      * by future calls to next()
190      * @param maxLookAhead The maximum to copy into the result.
191      * @return a reference to result
192      */
193     UnicodeString& lookahead(UnicodeString& result, int32_t maxLookAhead = -1) const;
194 
195     /**
196      * Advances the position by the given number of 16-bit code units.
197      * This is useful in conjunction with the lookahead() method.
198      * @param count the number of 16-bit code units to jump over
199      */
200     void jumpahead(int32_t count);
201 
202     /**
203      * Returns a string representation of this object, consisting of the
204      * characters being iterated, with a '|' marking the current position.
205      * Position within an expanded variable is <em>not</em> indicated.
206      * @param result output parameter to receive a string
207      * representation of this object
208      */
209 //    UnicodeString& toString(UnicodeString& result) const;
210 
211 private:
212     /**
213      * Returns the current 32-bit code point without parsing escapes, parsing
214      * variables, or skipping whitespace.
215      * @return the current 32-bit code point
216      */
217     UChar32 _current() const;
218 
219     /**
220      * Advances the position by the given amount.
221      * @param count the number of 16-bit code units to advance past
222      */
223     void _advance(int32_t count);
224 };
225 
inVariable()226 inline UBool RuleCharacterIterator::inVariable() const {
227     return buf != 0;
228 }
229 
230 U_NAMESPACE_END
231 
232 #endif // _RULEITER_H_
233 //eof
234