1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (C) 1999-2011, International Business Machines Corporation
6 * and others. All Rights Reserved.
7 **********************************************************************
8 *   Date        Name        Description
9 *   11/17/99    aliu        Creation.
10 **********************************************************************
11 */
12 #ifndef RBT_PARS_H
13 #define RBT_PARS_H
14 
15 #include "unicode/utypes.h"
16 
17 #if !UCONFIG_NO_TRANSLITERATION
18 #ifdef __cplusplus
19 
20 #include "unicode/uobject.h"
21 #include "unicode/parseerr.h"
22 #include "unicode/unorm.h"
23 #include "rbt.h"
24 #include "hash.h"
25 #include "uvector.h"
26 
27 U_NAMESPACE_BEGIN
28 
29 class TransliterationRuleData;
30 class UnicodeFunctor;
31 class ParseData;
32 class RuleHalf;
33 class ParsePosition;
34 class StringMatcher;
35 
36 class TransliteratorParser : public UMemory {
37 // Parser for transliterator rule strings; after parse() returns, results are read from the public data members.
38  public:
39 
40     /**
41      * PUBLIC data member.  A Vector of TransliterationRuleData objects, one
42      * for each discrete group of rules in the rule set
43      */
44     UVector dataVector;
45 
46     /**
47      * PUBLIC data member.
48      * A Vector of UnicodeStrings containing all of the ID blocks in the rule set
49      */
50     UVector idBlockVector;
51 
52     /**
53      * PUBLIC data member containing the parsed compound filter, if any.
54      */
55     UnicodeSet* compoundFilter;
56 
57  private:
58 
59     /**
60      * The current data object for which we are parsing rules
61      */
62     TransliterationRuleData* curData;
63 
64     UTransDirection direction; // NOTE(review): presumably the direction passed to parse() -- confirm in the implementation
65 
66     /**
67      * Parse error information.
68      */
69     UParseError parseError;
70 
71     /**
72      * Temporary symbol table used during parsing.
73      */
74     ParseData* parseData;
75 
76     /**
77      * Temporary vector of matcher variables.  When parsing is complete, this
78      * is copied into the array data.variables.  As with data.variables,
79      * element 0 corresponds to character data.variablesBase.
80      */
81     UVector variablesVector;
82 
83     /**
84      * Temporary table of variable names.  When parsing is complete, this is
85      * copied into data.variableNames.
86      */
87     Hashtable variableNames;
88 
89     /**
90      * String of stand-ins for segments.  Used during the parsing of a single
91      * rule.  segmentStandins.charAt(0) is the stand-in for "$1" and corresponds
92      * to StringMatcher object segmentObjects.elementAt(0), etc.
93      */
94     UnicodeString segmentStandins;
95 
96     /**
97      * Vector of StringMatcher objects for segments.  Used during the
98      * parsing of a single rule.
99      * segmentStandins.charAt(0) is the stand-in for "$1" and corresponds
100      * to StringMatcher object segmentObjects.elementAt(0), etc.
101      */
102     UVector segmentObjects;
103 
104     /**
105      * The next available stand-in for variables.  This starts at some point in
106      * the private use area (discovered dynamically) and increments up toward
107      * <code>variableLimit</code>.  At any point during parsing, available
108      * variables are <code>variableNext..variableLimit-1</code>.
109      */
110     UChar variableNext;
111 
112     /**
113      * The limit (exclusive) of the stand-ins available for variables.  This is
114      * discovered dynamically.  At any point during parsing, available variables
115      * are <code>variableNext..variableLimit-1</code>.
116      */
117     UChar variableLimit;
118 
119     /**
120      * When we encounter an undefined variable, we do not immediately signal
121      * an error, in case we are defining this variable, e.g., "$a = [a-z];".
122      * Instead, we save the name of the undefined variable, and substitute
123      * in the placeholder char variableLimit - 1, and decrement
124      * variableLimit.
125      */
126     UnicodeString undefinedVariableName;
127 
128     /**
129      * The stand-in character for the 'dot' set, represented by '.' in
130      * patterns.  This is allocated the first time it is needed, and
131      * reused thereafter.
132      */
133     UChar dotStandIn;
134 
135 public:
136 
137     /**
138      * Constructor.
139      */
140     TransliteratorParser(UErrorCode &statusReturn);
141 
142     /**
143      * Destructor.
144      */
145     ~TransliteratorParser();
146 
147     /**
148      * Parse the given string as a sequence of rules, in the given direction,
149      * and cause this object to implement those rules.  Any previous rules
150      * are discarded.  Typically this method is called exactly once after
151      * construction.
152      *
153      * After this call returns, query the public data members for results.
154      * The caller owns the 'data' and 'compoundFilter' data members after
155      * this call returns.  NOTE(review): no member named 'data' is declared
156      * here; presumably this means the contents of dataVector -- confirm.
157      * @param rules      rules, separated by ';'
158      * @param direction  either FORWARD or REVERSE.
159      * @param pe         Struct to receive information on position
160      *                   of error if an error is encountered
161      * @param ec         Output param set to success/failure code.
162      */
163     void parse(const UnicodeString& rules,
164                UTransDirection direction,
165                UParseError& pe,
166                UErrorCode& ec);
167 
168     /**
169      * Return the compound filter parsed by parse().  Caller owns result.
170      * @return the compound filter parsed by parse().
171      */
172     UnicodeSet* orphanCompoundFilter();
173 
174 private:
175 
176     /**
177      * Parse the given rule string in the given direction (no return value; takes a UErrorCode& status).
178      * @param rules      the rules to parse (input only; passed by const reference).
179      * @param direction  either FORWARD or REVERSE.
180      */
181     void parseRules(const UnicodeString& rules,
182                     UTransDirection direction,
183                     UErrorCode& status);
184 
185     /**
186      * MAIN PARSER.  Parse the next rule in the given rule string, starting
187      * at pos.  Return the index after the last character parsed.  Do not
188      * parse characters at or after limit.
189      *
190      * Important:  The character at pos must be a non-whitespace character
191      * that is not the comment character.
192      *
193      * This method handles quoting, escaping, and whitespace removal.  It
194      * parses the end-of-rule character.  It recognizes context and cursor
195      * indicators.  Once it does a lexical breakdown of the rule at pos, it
196      * creates a rule object and adds it to our rule list.
197      * @param rule       the rule string to parse (input only).
198      * @param pos        the starting position.
199      * @param limit      index past the last character of the rule.
200      * @return           the index after the last character parsed.
201      */
202     int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);
203 
204     /**
205      * Set the variable range to [start, end] (inclusive).
206      * @param start    the start value of the range.
207      * @param end      the end value of the range.
208      */
209     void setVariableRange(int32_t start, int32_t end, UErrorCode& status);
210 
211     /**
212      * Check that the given character is NOT within the variable range.
213      * If it is, return false.  This is necessary to ensure that the
214      * variable range does not overlap characters used in a rule.
215      * @param ch     the given character.
216      * @return       True, if the given character is NOT within the variable range.
217      */
218     UBool checkVariableRange(UChar32 ch) const;
219 
220     /**
221      * Set the maximum backup to 'backup', in response to a pragma
222      * statement.
223      * @param backup    the new value to be set.
224      */
225     void pragmaMaximumBackup(int32_t backup);
226 
227     /**
228      * Begin normalizing all rules using the given mode, in response
229      * to a pragma statement.
230      * @param mode    the given mode.
231      */
232     void pragmaNormalizeRules(UNormalizationMode mode);
233 
234     /**
235      * Return true if the given rule looks like a pragma.
236      * @param rule  the rule string to examine.
237      * @param pos   offset to the first non-whitespace character of the rule.
238      * @param limit index past the last character of the rule.
239      * @return true if the given rule looks like a pragma.
240      */
241     static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit);
242 
243     /**
244      * Parse a pragma.  This method assumes resemblesPragma() has
245      * already returned true.
246      * @param rule   the rule string containing the pragma.
247      * @param pos    offset to the first non-whitespace character of the rule.
248      * @param limit  index past the last character of the rule.
249      * @return the position index after the final ';' of the pragma,
250      * or -1 on failure.
251      */
252     int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);
253 
254     /**
255      * Called by main parser upon syntax error.  Search the rule string
256      * for the probable end of the rule.  Of course, if the error is that
257      * the end of rule marker is missing, then the rule end will not be found.
258      * In any case the rule start will be correctly reported.
259      * @param parseErrorCode error code.
260      * @param msg error description (the second parameter, left unnamed in this declaration).
261      * @param start position of first character of current rule.
262      * @return start position of first character of current rule.
263      */
264     int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start,
265                         UErrorCode& status);
266 
267     /**
268      * Parse a UnicodeSet out, store it, and return the stand-in character
269      * used to represent it.
270      *
271      * @param rule    the rule for UnicodeSet.
272      * @param pos     the position in pattern at which to start parsing.
273      * @return        the stand-in character used to represent it.
274      */
275     UChar parseSet(const UnicodeString& rule,
276                    ParsePosition& pos,
277                    UErrorCode& status);
278 
279     /**
280      * Generate and return a stand-in for a new UnicodeFunctor.  Store
281      * the matcher (adopt it).
282      * @param adopted the UnicodeFunctor to be adopted.
283      * @return        a stand-in for a new UnicodeFunctor.
284      */
285     UChar generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status);
286 
287     /**
288      * Return the stand-in for segment seg (1-based).
289      * @param seg    the given segment.
290      * @return       the stand-in character for the given segment.
291      */
292     UChar getSegmentStandin(int32_t seg, UErrorCode& status);
293 
294     /**
295      * Set the object for segment seg (1-based).
296      * @param seg      the given segment.
297      * @param adopted  the StringMatcher to be adopted.
298      */
299     void setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status);
300 
301     /**
302      * Return the stand-in for the dot set.  It is allocated the first
303      * time and reused thereafter.
304      * @return    the stand-in for the dot set.
305      */
306     UChar getDotStandIn(UErrorCode& status);
307 
308     /**
309      * Append the value of the given variable name to the given
310      * UnicodeString.
311      * @param name    the name of the variable whose value is appended.
312      * @param buf     the given UnicodeString to append to.
313      */
314     void appendVariableDef(const UnicodeString& name,
315                            UnicodeString& buf,
316                            UErrorCode& status);
317 
318     /**
319      * Glue method to get around access restrictions in C++.
320      */
321     /*static Transliterator* createBasicInstance(const UnicodeString& id,
322                                                const UnicodeString* canonID);*/
323 
324     friend class RuleHalf;
325 
326     // Disallowed methods; no impl.
327     /**
328      * Copy constructor (declared private and, per the note above, not implemented)
329      */
330     TransliteratorParser(const TransliteratorParser&);
331 
332     /**
333      * Assignment operator (declared private and, per the note above, not implemented)
334      */
335     TransliteratorParser& operator=(const TransliteratorParser&);
336 };
337 
338 U_NAMESPACE_END
339 
340 #endif /* #ifdef __cplusplus */
341 
342 /**
343  * Strip/convert the following from the transliterator rules:
344  * comments
345  * newlines
346  * white space at the beginning and end of a line
347  * unescape \u notation
348  *
349  * The target buffer must be the same size as the source.  NOTE(review): the int32_t result is presumably the output length -- confirm.
350  * @internal
351  */
352 U_CAPI int32_t
353 utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status);
354 
355 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
356 
357 #endif
358