1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2013-2014, International Business Machines
6 * Corporation and others.  All Rights Reserved.
7 *******************************************************************************
8 * collationruleparser.h
9 *
10 * created on: 2013apr10
11 * created by: Markus W. Scherer
12 */
13 
14 #ifndef __COLLATIONRULEPARSER_H__
15 #define __COLLATIONRULEPARSER_H__
16 
17 #include "unicode/utypes.h"
18 
19 #if !UCONFIG_NO_COLLATION
20 
21 #include "unicode/ucol.h"
22 #include "unicode/uniset.h"
23 #include "unicode/unistr.h"
24 
25 struct UParseError;
26 
27 U_NAMESPACE_BEGIN
28 
29 struct CollationData;
30 struct CollationTailoring;
31 
32 class Locale;
33 class Normalizer2;
34 
35 struct CollationSettings;
36 
/**
 * Parser for collation rule strings.
 *
 * Usage as far as this header shows: construct with the base CollationData,
 * register a Sink via setSink() (required before parsing) and optionally an
 * Importer via setImporter() (needed for [import locale] syntax), then call the
 * public parse(). getErrorReason() returns the stored errorReason pointer.
 *
 * Sink and Importer pointers are aliased: the parser neither clones nor owns them.
 *
 * NOTE(review): every line of this listing starts with a number, and the inline
 * definitions of setSink(), setImporter() and getErrorReason() are preceded by
 * extra signature text. This looks like source-browser residue rather than
 * intended code -- confirm against the upstream header.
 */
37 class U_I18N_API CollationRuleParser : public UMemory {
38 public:
39     /** Special reset positions. */
    // The enumerators take the implicit values 0..13 in declaration order.
    // These values are added to POS_BASE to form the second contraction
    // character (see POS_LEAD), so the order must not be changed.
40     enum Position {
41         FIRST_TERTIARY_IGNORABLE,
42         LAST_TERTIARY_IGNORABLE,
43         FIRST_SECONDARY_IGNORABLE,
44         LAST_SECONDARY_IGNORABLE,
45         FIRST_PRIMARY_IGNORABLE,
46         LAST_PRIMARY_IGNORABLE,
47         FIRST_VARIABLE,
48         LAST_VARIABLE,
49         FIRST_REGULAR,
50         LAST_REGULAR,
51         FIRST_IMPLICIT,
52         LAST_IMPLICIT,
53         FIRST_TRAILING,
54         LAST_TRAILING
55     };
56 
57     /**
58      * First character of contractions that encode special reset positions.
59      * U+FFFE cannot be tailored via rule syntax.
60      *
61      * The second contraction character is POS_BASE + Position.
62      */
63     static const UChar POS_LEAD = 0xfffe;
64     /**
65      * Base for the second character of contractions that encode special reset positions.
66      * Braille characters U+28xx are printable and normalization-inert.
67      * @see POS_LEAD
68      */
69     static const UChar POS_BASE = 0x2800;
70 
    /**
     * Interface that receives resets, relations and set-based directives.
     * The parser holds a pointer to one Sink (see setSink()); presumably it
     * calls these methods while parsing -- confirm in the implementation file.
     *
     * addReset() and addRelation() are pure virtual and must be implemented.
     * suppressContractions() and optimize() are virtual but not pure, so
     * overriding them is optional; their default behavior is defined outside
     * this header.
     *
     * Every method takes errorReason and errorCode by reference, which lets an
     * implementation hand back a reason string together with an error code.
     */
71     class U_I18N_API Sink : public UObject {
72     public:
73         virtual ~Sink();
74         /**
75          * Adds a reset.
76          * strength=UCOL_IDENTICAL for &str.
77          * strength=UCOL_PRIMARY/UCOL_SECONDARY/UCOL_TERTIARY for &[before n]str where n=1/2/3.
78          */
79         virtual void addReset(int32_t strength, const UnicodeString &str,
80                               const char *&errorReason, UErrorCode &errorCode) = 0;
81         /**
82          * Adds a relation with strength and prefix | str / extension.
83          */
84         virtual void addRelation(int32_t strength, const UnicodeString &prefix,
85                                  const UnicodeString &str, const UnicodeString &extension,
86                                  const char *&errorReason, UErrorCode &errorCode) = 0;
87 
        /**
         * Receives a set of characters for which contractions are to be suppressed
         * (reading of the method name; presumably tied to a rule-string directive
         * of the same name -- TODO confirm).
         * Optional to override: not pure virtual.
         */
88         virtual void suppressContractions(const UnicodeSet &set, const char *&errorReason,
89                                           UErrorCode &errorCode);
90 
        /**
         * Receives a set of characters to optimize for (reading of the method
         * name; the exact effect is not visible in this header -- TODO confirm).
         * Optional to override: not pure virtual.
         */
91         virtual void optimize(const UnicodeSet &set, const char *&errorReason,
92                               UErrorCode &errorCode);
93     };
94 
    /**
     * Interface for fetching rules from elsewhere. Per the constructor comment
     * below, [import locale] syntax is only supported when an Importer is set.
     */
95     class U_I18N_API Importer : public UObject {
96     public:
97         virtual ~Importer();
        /**
         * Pure virtual; must be implemented by every Importer.
         * localeID and collationType select what to import. rules is the only
         * mutable string argument, so it presumably receives the imported rule
         * string -- confirm against an implementation.
         * errorReason and errorCode are passed by reference for error reporting.
         */
98         virtual void getRules(
99                 const char *localeID, const char *collationType,
100                 UnicodeString &rules,
101                 const char *&errorReason, UErrorCode &errorCode) = 0;
102     };
103 
104     /**
105      * Constructor.
106      * The Sink must be set before parsing.
107      * The Importer can be set, otherwise [import locale] syntax is not supported.
108      */
109     CollationRuleParser(const CollationData *base, UErrorCode &errorCode);
110     ~CollationRuleParser();
111 
112     /**
113      * Sets the pointer to a Sink object.
114      * The pointer is aliased: Pointer copy without cloning or taking ownership.
115      */
setSink(Sink * sinkAlias)116     void setSink(Sink *sinkAlias) {
117         sink = sinkAlias;
118     }
119 
120     /**
121      * Sets the pointer to an Importer object.
122      * The pointer is aliased: Pointer copy without cloning or taking ownership.
123      */
setImporter(Importer * importerAlias)124     void setImporter(Importer *importerAlias) {
125         importer = importerAlias;
126     }
127 
    /**
     * Public entry point for parsing a rule string.
     * @param ruleString the rules to parse
     * @param outSettings settings object passed by mutable reference; presumably
     *        updated from settings found in the rules -- TODO confirm
     * @param outParseError pointer for parse error details; whether a null
     *        pointer is accepted is not visible in this header
     * @param errorCode error code passed by reference
     */
128     void parse(const UnicodeString &ruleString,
129                CollationSettings &outSettings,
130                UParseError *outParseError,
131                UErrorCode &errorCode);
132 
    /** Returns the errorReason member as is; no copy is made. */
getErrorReason()133     const char *getErrorReason() const { return errorReason; }
134 
135     /**
136      * Gets a script or reorder code from its string representation.
137      * @return the script/reorder code, or
138      * -1 if not recognized
139      */
140     static int32_t getReorderCode(const char *word);
141 
142 private:
    // The three constants below look like a bit layout for packing a strength
    // (low 4 bits), a "starred" flag (bit 4) and an offset (from bit 8 upward)
    // into a single int32_t -- TODO confirm where they are used.
143     /** UCOL_PRIMARY=0 .. UCOL_IDENTICAL=15 */
144     static const int32_t STRENGTH_MASK = 0xf;
145     static const int32_t STARRED_FLAG = 0x10;
146     static const int32_t OFFSET_SHIFT = 8;
147 
    // Private overload of parse() without the settings and parse-error outputs.
148     void parse(const UnicodeString &ruleString, UErrorCode &errorCode);
    // Parsing helpers. Those that take an index i and return int32_t
    // presumably return the rule index after the parsed element, as documented
    // for parseSpecialPosition() below -- TODO confirm for each helper.
149     void parseRuleChain(UErrorCode &errorCode);
150     int32_t parseResetAndPosition(UErrorCode &errorCode);
151     int32_t parseRelationOperator(UErrorCode &errorCode);
152     void parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode);
153     void parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode);
154     int32_t parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode);
155     int32_t parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode);
156 
157     /**
158      * Sets str to a contraction of U+FFFE and (U+2800 + Position).
159      * @return rule index after the special reset position
160      */
161     int32_t parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode);
162     void parseSetting(UErrorCode &errorCode);
163     void parseReordering(const UnicodeString &raw, UErrorCode &errorCode);
    // Static: maps a string to a UColAttributeValue without touching parser
    // state. The accepted strings are not visible in this header.
164     static UColAttributeValue getOnOffValue(const UnicodeString &s);
165 
166     int32_t parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode);
    // Const scanning helpers: they take an index and return an index without
    // modifying the parser object.
167     int32_t readWords(int32_t i, UnicodeString &raw) const;
168     int32_t skipComment(int32_t i) const;
169 
    // Error reporting helpers. setParseError() takes the reason string and the
    // error code; it presumably stores the reason in errorReason -- TODO confirm.
170     void setParseError(const char *reason, UErrorCode &errorCode);
171     void setErrorContext();
172 
173     /**
174      * ASCII [:P:] and [:S:]:
175      * [\u0021-\u002F \u003A-\u0040 \u005B-\u0060 \u007B-\u007E]
176      */
177     static UBool isSyntaxChar(UChar32 c);
178     int32_t skipWhiteSpace(int32_t i) const;
179 
    // Reference members: they must be bound by the constructor and cannot be
    // reseated afterwards. Going by the names, these are the NFD and NFC
    // normalizer instances -- TODO confirm in the constructor.
180     const Normalizer2 &nfd, &nfc;
181 
    // Pointer to the rule string; presumably the one currently being parsed.
182     const UnicodeString *rules;
    // Const pointer to const data: it can only be set at construction,
    // presumably from the constructor's base argument.
183     const CollationData *const baseData;
    // Presumably set from the outSettings / outParseError arguments of the
    // public parse() -- TODO confirm.
184     CollationSettings *settings;
185     UParseError *parseError;
    // Returned unchanged by getErrorReason().
186     const char *errorReason;
187 
    // Aliased, non-owned pointers assigned by setSink() / setImporter().
188     Sink *sink;
189     Importer *importer;
190 
    // Presumably the current position within *rules -- TODO confirm.
191     int32_t ruleIndex;
192 };
193 
194 U_NAMESPACE_END
195 
196 #endif  // !UCONFIG_NO_COLLATION
197 #endif  // __COLLATIONRULEPARSER_H__
198