1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (c) 2001-2012, International Business Machines Corporation
6 *   and others.  All Rights Reserved.
7 **********************************************************************
8 *   Date        Name        Description
9 *   07/23/01    aliu        Creation.
10 **********************************************************************
11 */
12 
13 #include "unicode/utypes.h"
14 
15 #if !UCONFIG_NO_TRANSLITERATION
16 
17 #include "strmatch.h"
18 #include "rbt_data.h"
19 #include "util.h"
20 #include "unicode/uniset.h"
21 #include "unicode/utf16.h"
22 
23 U_NAMESPACE_BEGIN
24 
// ICU run-time type identification boilerplate for StringMatcher.
// The macro is declared in unicode/uobject.h; per that header it supplies the
// class-ID accessors for the named class.  It must appear exactly once, at
// namespace scope, with no other tokens on the line.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringMatcher)
26 
27 StringMatcher::StringMatcher(const UnicodeString& theString,
28                              int32_t start,
29                              int32_t limit,
30                              int32_t segmentNum,
31                              const TransliterationRuleData& theData) :
32     data(&theData),
33     segmentNumber(segmentNum),
34     matchStart(-1),
35     matchLimit(-1)
36 {
37     theString.extractBetween(start, limit, pattern);
38 }
39 
/**
 * Copy constructor.
 *
 * Copies the pattern text and segment number, shares the same rule-data
 * pointer as the source object, and carries over whatever match range the
 * source object has currently recorded.
 */
StringMatcher::StringMatcher(const StringMatcher& o)
    : UnicodeFunctor(o)
    , UnicodeMatcher(o)
    , UnicodeReplacer(o)
    , pattern(o.pattern)
    , data(o.data)
    , segmentNumber(o.segmentNumber)
    , matchStart(o.matchStart)
    , matchLimit(o.matchLimit)
{
}
51 
52 /**
53  * Destructor
54  */
~StringMatcher()55 StringMatcher::~StringMatcher() {
56 }
57 
58 /**
59  * Implement UnicodeFunctor
60  */
clone() const61 StringMatcher* StringMatcher::clone() const {
62     return new StringMatcher(*this);
63 }
64 
65 /**
66  * UnicodeFunctor API.  Cast 'this' to a UnicodeMatcher* pointer
67  * and return the pointer.
68  */
toMatcher() const69 UnicodeMatcher* StringMatcher::toMatcher() const {
70   StringMatcher  *nonconst_this = const_cast<StringMatcher *>(this);
71   UnicodeMatcher *nonconst_base = static_cast<UnicodeMatcher *>(nonconst_this);
72 
73   return nonconst_base;
74 }
75 
76 /**
77  * UnicodeFunctor API.  Cast 'this' to a UnicodeReplacer* pointer
78  * and return the pointer.
79  */
toReplacer() const80 UnicodeReplacer* StringMatcher::toReplacer() const {
81   StringMatcher  *nonconst_this = const_cast<StringMatcher *>(this);
82   UnicodeReplacer *nonconst_base = static_cast<UnicodeReplacer *>(nonconst_this);
83 
84   return nonconst_base;
85 }
86 
87 /**
88  * Implement UnicodeMatcher
89  */
matches(const Replaceable & text,int32_t & offset,int32_t limit,UBool incremental)90 UMatchDegree StringMatcher::matches(const Replaceable& text,
91                                     int32_t& offset,
92                                     int32_t limit,
93                                     UBool incremental) {
94     int32_t i;
95     int32_t cursor = offset;
96     if (limit < cursor) {
97         // Match in the reverse direction
98         for (i=pattern.length()-1; i>=0; --i) {
99             UChar keyChar = pattern.charAt(i);
100             UnicodeMatcher* subm = data->lookupMatcher(keyChar);
101             if (subm == 0) {
102                 if (cursor > limit &&
103                     keyChar == text.charAt(cursor)) {
104                     --cursor;
105                 } else {
106                     return U_MISMATCH;
107                 }
108             } else {
109                 UMatchDegree m =
110                     subm->matches(text, cursor, limit, incremental);
111                 if (m != U_MATCH) {
112                     return m;
113                 }
114             }
115         }
116         // Record the match position, but adjust for a normal
117         // forward start, limit, and only if a prior match does not
118         // exist -- we want the rightmost match.
119         if (matchStart < 0) {
120             matchStart = cursor+1;
121             matchLimit = offset+1;
122         }
123     } else {
124         for (i=0; i<pattern.length(); ++i) {
125             if (incremental && cursor == limit) {
126                 // We've reached the context limit without a mismatch and
127                 // without completing our match.
128                 return U_PARTIAL_MATCH;
129             }
130             UChar keyChar = pattern.charAt(i);
131             UnicodeMatcher* subm = data->lookupMatcher(keyChar);
132             if (subm == 0) {
133                 // Don't need the cursor < limit check if
134                 // incremental is TRUE (because it's done above); do need
135                 // it otherwise.
136                 if (cursor < limit &&
137                     keyChar == text.charAt(cursor)) {
138                     ++cursor;
139                 } else {
140                     return U_MISMATCH;
141                 }
142             } else {
143                 UMatchDegree m =
144                     subm->matches(text, cursor, limit, incremental);
145                 if (m != U_MATCH) {
146                     return m;
147                 }
148             }
149         }
150         // Record the match position
151         matchStart = offset;
152         matchLimit = cursor;
153     }
154 
155     offset = cursor;
156     return U_MATCH;
157 }
158 
159 /**
160  * Implement UnicodeMatcher
161  */
toPattern(UnicodeString & result,UBool escapeUnprintable) const162 UnicodeString& StringMatcher::toPattern(UnicodeString& result,
163                                         UBool escapeUnprintable) const
164 {
165     result.truncate(0);
166     UnicodeString str, quoteBuf;
167     if (segmentNumber > 0) {
168         result.append((UChar)40); /*(*/
169     }
170     for (int32_t i=0; i<pattern.length(); ++i) {
171         UChar keyChar = pattern.charAt(i);
172         const UnicodeMatcher* m = data->lookupMatcher(keyChar);
173         if (m == 0) {
174             ICU_Utility::appendToRule(result, keyChar, FALSE, escapeUnprintable, quoteBuf);
175         } else {
176             ICU_Utility::appendToRule(result, m->toPattern(str, escapeUnprintable),
177                          TRUE, escapeUnprintable, quoteBuf);
178         }
179     }
180     if (segmentNumber > 0) {
181         result.append((UChar)41); /*)*/
182     }
183     // Flush quoteBuf out to result
184     ICU_Utility::appendToRule(result, -1,
185                               TRUE, escapeUnprintable, quoteBuf);
186     return result;
187 }
188 
189 /**
190  * Implement UnicodeMatcher
191  */
matchesIndexValue(uint8_t v) const192 UBool StringMatcher::matchesIndexValue(uint8_t v) const {
193     if (pattern.length() == 0) {
194         return TRUE;
195     }
196     UChar32 c = pattern.char32At(0);
197     const UnicodeMatcher *m = data->lookupMatcher(c);
198     return (m == 0) ? ((c & 0xFF) == v) : m->matchesIndexValue(v);
199 }
200 
201 /**
202  * Implement UnicodeMatcher
203  */
addMatchSetTo(UnicodeSet & toUnionTo) const204 void StringMatcher::addMatchSetTo(UnicodeSet& toUnionTo) const {
205     UChar32 ch;
206     for (int32_t i=0; i<pattern.length(); i+=U16_LENGTH(ch)) {
207         ch = pattern.char32At(i);
208         const UnicodeMatcher* matcher = data->lookupMatcher(ch);
209         if (matcher == NULL) {
210             toUnionTo.add(ch);
211         } else {
212             matcher->addMatchSetTo(toUnionTo);
213         }
214     }
215 }
216 
217 /**
218  * UnicodeReplacer API
219  */
replace(Replaceable & text,int32_t start,int32_t limit,int32_t &)220 int32_t StringMatcher::replace(Replaceable& text,
221                                int32_t start,
222                                int32_t limit,
223                                int32_t& /*cursor*/) {
224 
225     int32_t outLen = 0;
226 
227     // Copy segment with out-of-band data
228     int32_t dest = limit;
229     // If there was no match, that means that a quantifier
230     // matched zero-length.  E.g., x (a)* y matched "xy".
231     if (matchStart >= 0) {
232         if (matchStart != matchLimit) {
233             text.copy(matchStart, matchLimit, dest);
234             outLen = matchLimit - matchStart;
235         }
236     }
237 
238     text.handleReplaceBetween(start, limit, UnicodeString()); // delete original text
239 
240     return outLen;
241 }
242 
243 /**
244  * UnicodeReplacer API
245  */
toReplacerPattern(UnicodeString & rule,UBool) const246 UnicodeString& StringMatcher::toReplacerPattern(UnicodeString& rule,
247                                                 UBool /*escapeUnprintable*/) const {
248     // assert(segmentNumber > 0);
249     rule.truncate(0);
250     rule.append((UChar)0x0024 /*$*/);
251     ICU_Utility::appendNumber(rule, segmentNumber, 10, 1);
252     return rule;
253 }
254 
255 /**
256  * Remove any match info.  This must be called before performing a
257  * set of matches with this segment.
258  */
resetMatch()259  void StringMatcher::resetMatch() {
260     matchStart = matchLimit = -1;
261 }
262 
263 /**
264  * Union the set of all characters that may output by this object
265  * into the given set.
266  * @param toUnionTo the set into which to union the output characters
267  */
addReplacementSetTo(UnicodeSet &) const268 void StringMatcher::addReplacementSetTo(UnicodeSet& /*toUnionTo*/) const {
269     // The output of this replacer varies; it is the source text between
270     // matchStart and matchLimit.  Since this varies depending on the
271     // input text, we can't compute it here.  We can either do nothing
272     // or we can add ALL characters to the set.  It's probably more useful
273     // to do nothing.
274 }
275 
276 /**
277  * Implement UnicodeFunctor
278  */
setData(const TransliterationRuleData * d)279 void StringMatcher::setData(const TransliterationRuleData* d) {
280     data = d;
281     int32_t i = 0;
282     while (i<pattern.length()) {
283         UChar32 c = pattern.char32At(i);
284         UnicodeFunctor* f = data->lookup(c);
285         if (f != NULL) {
286             f->setData(data);
287         }
288         i += U16_LENGTH(c);
289     }
290 }
291 
292 U_NAMESPACE_END
293 
294 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
295 
296 //eof
297