1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (C) 1999-2015, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 *   Date        Name        Description
9 *   11/17/99    aliu        Creation.
10 **********************************************************************
11 */
12 
13 #include "unicode/utypes.h"
14 
15 #if !UCONFIG_NO_TRANSLITERATION
16 
17 #include "unicode/rep.h"
18 #include "unicode/uniset.h"
19 #include "rbt_pars.h"
20 #include "rbt_data.h"
21 #include "rbt_rule.h"
22 #include "rbt.h"
23 #include "mutex.h"
24 #include "umutex.h"
25 
26 U_NAMESPACE_BEGIN
27 
28 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedTransliterator)
29 
30 static Replaceable *gLockedText = NULL;
31 
_construct(const UnicodeString & rules,UTransDirection direction,UParseError & parseError,UErrorCode & status)32 void RuleBasedTransliterator::_construct(const UnicodeString& rules,
33                                          UTransDirection direction,
34                                          UParseError& parseError,
35                                          UErrorCode& status) {
36     fData = 0;
37     isDataOwned = TRUE;
38     if (U_FAILURE(status)) {
39         return;
40     }
41 
42     TransliteratorParser parser(status);
43     parser.parse(rules, direction, parseError, status);
44     if (U_FAILURE(status)) {
45         return;
46     }
47 
48     if (parser.idBlockVector.size() != 0 ||
49         parser.compoundFilter != NULL ||
50         parser.dataVector.size() == 0) {
51         status = U_INVALID_RBT_SYNTAX; // ::ID blocks disallowed in RBT
52         return;
53     }
54 
55     fData = (TransliterationRuleData*)parser.dataVector.orphanElementAt(0);
56     setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
57 }
58 
59 /**
60  * Constructs a new transliterator from the given rules.
61  * @param id            the id for the transliterator.
62  * @param rules         rules, separated by ';'
63  * @param direction     either FORWARD or REVERSE.
64  * @param adoptedFilter the filter for this transliterator.
65  * @param parseError    Struct to recieve information on position
66  *                      of error if an error is encountered
67  * @param status        Output param set to success/failure code.
68  * @exception IllegalArgumentException if rules are malformed
69  * or direction is invalid.
70  */
RuleBasedTransliterator(const UnicodeString & id,const UnicodeString & rules,UTransDirection direction,UnicodeFilter * adoptedFilter,UParseError & parseError,UErrorCode & status)71 RuleBasedTransliterator::RuleBasedTransliterator(
72                             const UnicodeString& id,
73                             const UnicodeString& rules,
74                             UTransDirection direction,
75                             UnicodeFilter* adoptedFilter,
76                             UParseError& parseError,
77                             UErrorCode& status) :
78     Transliterator(id, adoptedFilter) {
79     _construct(rules, direction,parseError,status);
80 }
81 
82 /**
83  * Constructs a new transliterator from the given rules.
84  * @param id            the id for the transliterator.
85  * @param rules         rules, separated by ';'
86  * @param direction     either FORWARD or REVERSE.
87  * @param adoptedFilter the filter for this transliterator.
88  * @param status        Output param set to success/failure code.
89  * @exception IllegalArgumentException if rules are malformed
90  * or direction is invalid.
91  */
92 /*RuleBasedTransliterator::RuleBasedTransliterator(
93                             const UnicodeString& id,
94                             const UnicodeString& rules,
95                             UTransDirection direction,
96                             UnicodeFilter* adoptedFilter,
97                             UErrorCode& status) :
98     Transliterator(id, adoptedFilter) {
99     UParseError parseError;
100     _construct(rules, direction,parseError, status);
101 }*/
102 
103 /**
 * Convenience constructor with no filter.
105  */
106 /*RuleBasedTransliterator::RuleBasedTransliterator(
107                             const UnicodeString& id,
108                             const UnicodeString& rules,
109                             UTransDirection direction,
110                             UErrorCode& status) :
111     Transliterator(id, 0) {
112     UParseError parseError;
113     _construct(rules, direction,parseError, status);
114 }*/
115 
116 /**
 * Convenience constructor with no filter and FORWARD direction.
118  */
119 /*RuleBasedTransliterator::RuleBasedTransliterator(
120                             const UnicodeString& id,
121                             const UnicodeString& rules,
122                             UErrorCode& status) :
123     Transliterator(id, 0) {
124     UParseError parseError;
125     _construct(rules, UTRANS_FORWARD, parseError, status);
126 }*/
127 
128 /**
 * Convenience constructor with FORWARD direction.
130  */
131 /*RuleBasedTransliterator::RuleBasedTransliterator(
132                             const UnicodeString& id,
133                             const UnicodeString& rules,
134                             UnicodeFilter* adoptedFilter,
135                             UErrorCode& status) :
136     Transliterator(id, adoptedFilter) {
137     UParseError parseError;
138     _construct(rules, UTRANS_FORWARD,parseError, status);
139 }*/
140 
RuleBasedTransliterator(const UnicodeString & id,const TransliterationRuleData * theData,UnicodeFilter * adoptedFilter)141 RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id,
142                                  const TransliterationRuleData* theData,
143                                  UnicodeFilter* adoptedFilter) :
144     Transliterator(id, adoptedFilter),
145     fData((TransliterationRuleData*)theData), // cast away const
146     isDataOwned(FALSE) {
147     setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
148 }
149 
150 /**
151  * Internal constructor.
152  */
RuleBasedTransliterator(const UnicodeString & id,TransliterationRuleData * theData,UBool isDataAdopted)153 RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id,
154                                                  TransliterationRuleData* theData,
155                                                  UBool isDataAdopted) :
156     Transliterator(id, 0),
157     fData(theData),
158     isDataOwned(isDataAdopted) {
159     setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
160 }
161 
162 /**
163  * Copy constructor.
164  */
RuleBasedTransliterator(const RuleBasedTransliterator & other)165 RuleBasedTransliterator::RuleBasedTransliterator(
166         const RuleBasedTransliterator& other) :
167     Transliterator(other), fData(other.fData),
168     isDataOwned(other.isDataOwned) {
169 
170     // The data object may or may not be owned.  If it is not owned we
171     // share it; it is invariant.  If it is owned, it's still
172     // invariant, but we need to copy it to prevent double-deletion.
173     // If this becomes a performance issue (if people do a lot of RBT
174     // copying -- unlikely) we can reference count the data object.
175 
176     // Only do a deep copy if this is owned data, that is, data that
177     // will be later deleted.  System transliterators contain
178     // non-owned data.
179     if (isDataOwned) {
180         fData = new TransliterationRuleData(*other.fData);
181     }
182 }
183 
184 /**
185  * Destructor.
186  */
~RuleBasedTransliterator()187 RuleBasedTransliterator::~RuleBasedTransliterator() {
188     // Delete the data object only if we own it.
189     if (isDataOwned) {
190         delete fData;
191     }
192 }
193 
194 RuleBasedTransliterator*
clone() const195 RuleBasedTransliterator::clone() const {
196     return new RuleBasedTransliterator(*this);
197 }
198 
199 /**
200  * Implements {@link Transliterator#handleTransliterate}.
201  */
202 void
handleTransliterate(Replaceable & text,UTransPosition & index,UBool isIncremental) const203 RuleBasedTransliterator::handleTransliterate(Replaceable& text, UTransPosition& index,
204                                              UBool isIncremental) const {
205     /* We keep contextStart and contextLimit fixed the entire time,
206      * relative to the text -- contextLimit may move numerically if
207      * text is inserted or removed.  The start offset moves toward
208      * limit, with replacements happening under it.
209      *
210      * Example: rules 1. ab>x|y
211      *                2. yc>z
212      *
213      * |eabcd   begin - no match, advance start
214      * e|abcd   match rule 1 - change text & adjust start
215      * ex|ycd   match rule 2 - change text & adjust start
216      * exz|d    no match, advance start
217      * exzd|    done
218      */
219 
220     /* A rule like
221      *   a>b|a
222      * creates an infinite loop. To prevent that, we put an arbitrary
223      * limit on the number of iterations that we take, one that is
224      * high enough that any reasonable rules are ok, but low enough to
225      * prevent a server from hanging.  The limit is 16 times the
226      * number of characters n, unless n is so large that 16n exceeds a
227      * uint32_t.
228      */
229     uint32_t loopCount = 0;
230     uint32_t loopLimit = index.limit - index.start;
231     if (loopLimit >= 0x10000000) {
232         loopLimit = 0xFFFFFFFF;
233     } else {
234         loopLimit <<= 4;
235     }
236 
237     // Transliterator locking.  Rule-based Transliterators are not thread safe; concurrent
238     //   operations must be prevented.
239     // A Complication: compound transliterators can result in recursive entries to this
240     //   function, sometimes with different "This" objects, always with the same text.
241     //   Double-locking must be prevented in these cases.
242     //
243 
244     UBool    lockedMutexAtThisLevel = FALSE;
245 
246     // Test whether this request is operating on the same text string as
247     //   some other transliteration that is still in progress and holding the
248     //   transliteration mutex.  If so, do not lock the transliteration
249     //    mutex again.
250     //
251     //  gLockedText variable is protected by the global ICU mutex.
252     //  Shared RBT data protected by transliteratorDataMutex.
253     //
254     // TODO(andy): Need a better scheme for handling this.
255 
256     static UMutex transliteratorDataMutex;
257     UBool needToLock;
258     {
259         Mutex m;
260         needToLock = (&text != gLockedText);
261     }
262     if (needToLock) {
263         umtx_lock(&transliteratorDataMutex);  // Contention, longish waits possible here.
264         Mutex m;
265         gLockedText = &text;
266         lockedMutexAtThisLevel = TRUE;
267     }
268 
269     // Check to make sure we don't dereference a null pointer.
270     if (fData != NULL) {
271 	    while (index.start < index.limit &&
272 	           loopCount <= loopLimit &&
273 	           fData->ruleSet.transliterate(text, index, isIncremental)) {
274 	        ++loopCount;
275 	    }
276     }
277     if (lockedMutexAtThisLevel) {
278         {
279             Mutex m;
280             gLockedText = NULL;
281         }
282         umtx_unlock(&transliteratorDataMutex);
283     }
284 }
285 
toRules(UnicodeString & rulesSource,UBool escapeUnprintable) const286 UnicodeString& RuleBasedTransliterator::toRules(UnicodeString& rulesSource,
287                                                 UBool escapeUnprintable) const {
288     return fData->ruleSet.toRules(rulesSource, escapeUnprintable);
289 }
290 
291 /**
292  * Implement Transliterator framework
293  */
handleGetSourceSet(UnicodeSet & result) const294 void RuleBasedTransliterator::handleGetSourceSet(UnicodeSet& result) const {
295     fData->ruleSet.getSourceTargetSet(result, FALSE);
296 }
297 
298 /**
299  * Override Transliterator framework
300  */
getTargetSet(UnicodeSet & result) const301 UnicodeSet& RuleBasedTransliterator::getTargetSet(UnicodeSet& result) const {
302     return fData->ruleSet.getSourceTargetSet(result, TRUE);
303 }
304 
305 U_NAMESPACE_END
306 
307 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
308