1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (c) 2002-2012, International Business Machines Corporation
6 *   and others.  All Rights Reserved.
7 **********************************************************************
8 *   Date        Name        Description
9 *   01/21/2002  aliu        Creation.
10 **********************************************************************
11 */
12 
13 #include "unicode/utypes.h"
14 
15 #if !UCONFIG_NO_TRANSLITERATION
16 
17 #include "unicode/uniset.h"
18 #include "unicode/utf16.h"
19 #include "strrepl.h"
20 #include "rbt_data.h"
21 #include "util.h"
22 
23 U_NAMESPACE_BEGIN
24 
~UnicodeReplacer()25 UnicodeReplacer::~UnicodeReplacer() {}
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringReplacer)26 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringReplacer)
27 
28 /**
29  * Construct a StringReplacer that sets the emits the given output
30  * text and sets the cursor to the given position.
31  * @param theOutput text that will replace input text when the
32  * replace() method is called.  May contain stand-in characters
33  * that represent nested replacers.
34  * @param theCursorPos cursor position that will be returned by
35  * the replace() method
36  * @param theData transliterator context object that translates
37  * stand-in characters to UnicodeReplacer objects
38  */
39 StringReplacer::StringReplacer(const UnicodeString& theOutput,
40                                int32_t theCursorPos,
41                                const TransliterationRuleData* theData) {
42     output = theOutput;
43     cursorPos = theCursorPos;
44     hasCursor = TRUE;
45     data = theData;
46     isComplex = TRUE;
47 }
48 
49 /**
50  * Construct a StringReplacer that sets the emits the given output
51  * text and does not modify the cursor.
52  * @param theOutput text that will replace input text when the
53  * replace() method is called.  May contain stand-in characters
54  * that represent nested replacers.
55  * @param theData transliterator context object that translates
56  * stand-in characters to UnicodeReplacer objects
57  */
StringReplacer(const UnicodeString & theOutput,const TransliterationRuleData * theData)58 StringReplacer::StringReplacer(const UnicodeString& theOutput,
59                                const TransliterationRuleData* theData) {
60     output = theOutput;
61     cursorPos = 0;
62     hasCursor = FALSE;
63     data = theData;
64     isComplex = TRUE;
65 }
66 
67 /**
68  * Copy constructor.
69  */
StringReplacer(const StringReplacer & other)70 StringReplacer::StringReplacer(const StringReplacer& other) :
71     UnicodeFunctor(other),
72     UnicodeReplacer(other)
73 {
74     output = other.output;
75     cursorPos = other.cursorPos;
76     hasCursor = other.hasCursor;
77     data = other.data;
78     isComplex = other.isComplex;
79 }
80 
81 /**
82  * Destructor
83  */
~StringReplacer()84 StringReplacer::~StringReplacer() {
85 }
86 
87 /**
88  * Implement UnicodeFunctor
89  */
clone() const90 StringReplacer* StringReplacer::clone() const {
91     return new StringReplacer(*this);
92 }
93 
94 /**
95  * Implement UnicodeFunctor
96  */
toReplacer() const97 UnicodeReplacer* StringReplacer::toReplacer() const {
98   return const_cast<StringReplacer *>(this);
99 }
100 
101 /**
102  * UnicodeReplacer API
103  */
replace(Replaceable & text,int32_t start,int32_t limit,int32_t & cursor)104 int32_t StringReplacer::replace(Replaceable& text,
105                                 int32_t start,
106                                 int32_t limit,
107                                 int32_t& cursor) {
108     int32_t outLen;
109     int32_t newStart = 0;
110 
111     // NOTE: It should be possible to _always_ run the complex
112     // processing code; just slower.  If not, then there is a bug
113     // in the complex processing code.
114 
115     // Simple (no nested replacers) Processing Code :
116     if (!isComplex) {
117         text.handleReplaceBetween(start, limit, output);
118         outLen = output.length();
119 
120         // Setup default cursor position (for cursorPos within output)
121         newStart = cursorPos;
122     }
123 
124     // Complex (nested replacers) Processing Code :
125     else {
126         /* When there are segments to be copied, use the Replaceable.copy()
127          * API in order to retain out-of-band data.  Copy everything to the
128          * end of the string, then copy them back over the key.  This preserves
129          * the integrity of indices into the key and surrounding context while
130          * generating the output text.
131          */
132         UnicodeString buf;
133         int32_t oOutput; // offset into 'output'
134         isComplex = FALSE;
135 
136         // The temporary buffer starts at tempStart, and extends
137         // to destLimit.  The start of the buffer has a single
138         // character from before the key.  This provides style
139         // data when addition characters are filled into the
140         // temporary buffer.  If there is nothing to the left, use
141         // the non-character U+FFFF, which Replaceable subclasses
142         // should treat specially as a "no-style character."
143         // destStart points to the point after the style context
144         // character, so it is tempStart+1 or tempStart+2.
145         int32_t tempStart = text.length(); // start of temp buffer
146         int32_t destStart = tempStart; // copy new text to here
147         if (start > 0) {
148             int32_t len = U16_LENGTH(text.char32At(start-1));
149             text.copy(start-len, start, tempStart);
150             destStart += len;
151         } else {
152             UnicodeString str((UChar) 0xFFFF);
153             text.handleReplaceBetween(tempStart, tempStart, str);
154             destStart++;
155         }
156         int32_t destLimit = destStart;
157 
158         for (oOutput=0; oOutput<output.length(); ) {
159             if (oOutput == cursorPos) {
160                 // Record the position of the cursor
161                 newStart = destLimit - destStart; // relative to start
162             }
163             UChar32 c = output.char32At(oOutput);
164             UnicodeReplacer* r = data->lookupReplacer(c);
165             if (r == NULL) {
166                 // Accumulate straight (non-segment) text.
167                 buf.append(c);
168             } else {
169                 isComplex = TRUE;
170 
171                 // Insert any accumulated straight text.
172                 if (buf.length() > 0) {
173                     text.handleReplaceBetween(destLimit, destLimit, buf);
174                     destLimit += buf.length();
175                     buf.truncate(0);
176                 }
177 
178                 // Delegate output generation to replacer object
179                 int32_t len = r->replace(text, destLimit, destLimit, cursor);
180                 destLimit += len;
181             }
182             oOutput += U16_LENGTH(c);
183         }
184         // Insert any accumulated straight text.
185         if (buf.length() > 0) {
186             text.handleReplaceBetween(destLimit, destLimit, buf);
187             destLimit += buf.length();
188         }
189         if (oOutput == cursorPos) {
190             // Record the position of the cursor
191             newStart = destLimit - destStart; // relative to start
192         }
193 
194         outLen = destLimit - destStart;
195 
196         // Copy new text to start, and delete it
197         text.copy(destStart, destLimit, start);
198         text.handleReplaceBetween(tempStart + outLen, destLimit + outLen, UnicodeString());
199 
200         // Delete the old text (the key)
201         text.handleReplaceBetween(start + outLen, limit + outLen, UnicodeString());
202     }
203 
204     if (hasCursor) {
205         // Adjust the cursor for positions outside the key.  These
206         // refer to code points rather than code units.  If cursorPos
207         // is within the output string, then use newStart, which has
208         // already been set above.
209         if (cursorPos < 0) {
210             newStart = start;
211             int32_t n = cursorPos;
212             // Outside the output string, cursorPos counts code points
213             while (n < 0 && newStart > 0) {
214                 newStart -= U16_LENGTH(text.char32At(newStart-1));
215                 ++n;
216             }
217             newStart += n;
218         } else if (cursorPos > output.length()) {
219             newStart = start + outLen;
220             int32_t n = cursorPos - output.length();
221             // Outside the output string, cursorPos counts code points
222             while (n > 0 && newStart < text.length()) {
223                 newStart += U16_LENGTH(text.char32At(newStart));
224                 --n;
225             }
226             newStart += n;
227         } else {
228             // Cursor is within output string.  It has been set up above
229             // to be relative to start.
230             newStart += start;
231         }
232 
233         cursor = newStart;
234     }
235 
236     return outLen;
237 }
238 
239 /**
240  * UnicodeReplacer API
241  */
toReplacerPattern(UnicodeString & rule,UBool escapeUnprintable) const242 UnicodeString& StringReplacer::toReplacerPattern(UnicodeString& rule,
243                                                  UBool escapeUnprintable) const {
244     rule.truncate(0);
245     UnicodeString quoteBuf;
246 
247     int32_t cursor = cursorPos;
248 
249     // Handle a cursor preceding the output
250     if (hasCursor && cursor < 0) {
251         while (cursor++ < 0) {
252             ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
253         }
254         // Fall through and append '|' below
255     }
256 
257     for (int32_t i=0; i<output.length(); ++i) {
258         if (hasCursor && i == cursor) {
259             ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
260         }
261         UChar c = output.charAt(i); // Ok to use 16-bits here
262 
263         UnicodeReplacer* r = data->lookupReplacer(c);
264         if (r == NULL) {
265             ICU_Utility::appendToRule(rule, c, FALSE, escapeUnprintable, quoteBuf);
266         } else {
267             UnicodeString buf;
268             r->toReplacerPattern(buf, escapeUnprintable);
269             buf.insert(0, (UChar)0x20);
270             buf.append((UChar)0x20);
271             ICU_Utility::appendToRule(rule, buf,
272                                       TRUE, escapeUnprintable, quoteBuf);
273         }
274     }
275 
276     // Handle a cursor after the output.  Use > rather than >= because
277     // if cursor == output.length() it is at the end of the output,
278     // which is the default position, so we need not emit it.
279     if (hasCursor && cursor > output.length()) {
280         cursor -= output.length();
281         while (cursor-- > 0) {
282             ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
283         }
284         ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
285     }
286     // Flush quoteBuf out to result
287     ICU_Utility::appendToRule(rule, -1,
288                               TRUE, escapeUnprintable, quoteBuf);
289 
290     return rule;
291 }
292 
293 /**
294  * Implement UnicodeReplacer
295  */
addReplacementSetTo(UnicodeSet & toUnionTo) const296 void StringReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const {
297     UChar32 ch;
298     for (int32_t i=0; i<output.length(); i+=U16_LENGTH(ch)) {
299     ch = output.char32At(i);
300     UnicodeReplacer* r = data->lookupReplacer(ch);
301     if (r == NULL) {
302         toUnionTo.add(ch);
303     } else {
304         r->addReplacementSetTo(toUnionTo);
305     }
306     }
307 }
308 
309 /**
310  * UnicodeFunctor API
311  */
setData(const TransliterationRuleData * d)312 void StringReplacer::setData(const TransliterationRuleData* d) {
313     data = d;
314     int32_t i = 0;
315     while (i<output.length()) {
316         UChar32 c = output.char32At(i);
317         UnicodeFunctor* f = data->lookup(c);
318         if (f != NULL) {
319             f->setData(data);
320         }
321         i += U16_LENGTH(c);
322     }
323 }
324 
325 U_NAMESPACE_END
326 
327 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
328 
329 //eof
330