1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  **********************************************************************
5  *   Copyright (c) 2001-2011, International Business Machines
6  *   Corporation and others.  All Rights Reserved.
7  **********************************************************************
8  *   Date        Name        Description
9  *   11/19/2001  aliu        Creation.
10  **********************************************************************
11  */
12 
13 #include "unicode/utypes.h"
14 
15 #if !UCONFIG_NO_TRANSLITERATION
16 
17 #include "unicode/uchar.h"
18 #include "unicode/utf16.h"
19 #include "unesctrn.h"
20 #include "util.h"
21 
22 #include "cmemory.h"
23 
24 U_NAMESPACE_BEGIN
25 
26 /**
27  * Special character marking the end of the spec[] array.
28  */
29 static const UChar END = 0xFFFF;
30 
31 // Unicode: "U+10FFFF" hex, min=4, max=6
32 static const UChar SPEC_Unicode[] = {
33     2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,
34     END
35 };
36 
37 // Java: "\\uFFFF" hex, min=4, max=4
38 static const UChar SPEC_Java[] = {
39     2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
40     END
41 };
42 
43 // C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8
44 static const UChar SPEC_C[] = {
45     2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
46     2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,
47     END
48 };
49 
50 // XML: "" hex, min=1, max=6
51 static const UChar SPEC_XML[] = {
52     3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,
53     END
54 };
55 
56 // XML10: "" dec, min=1, max=7 (not really "Hex-Any")
57 static const UChar SPEC_XML10[] = {
58     2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,
59     END
60 };
61 
62 // Perl: "\\x{263A}" hex, min=1, max=6
63 static const UChar SPEC_Perl[] = {
64     3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/,
65     END
66 };
67 
68 // All: Java, C, Perl, XML, XML10, Unicode
69 static const UChar SPEC_Any[] = {
70     2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,                      // Unicode
71     2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,                     // Java
72     2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,                      // C (surrogates)
73     3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,   // XML
74     2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,             // XML10
75     3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/, // Perl
76     END
77 };
78 
// Expands to the ICU class-ID (RTTI) boilerplate for UnescapeTransliterator.
// NOTE(review): the macro is defined outside this file (presumably in
// unicode/uobject.h) -- confirm exactly which members it generates.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnescapeTransliterator)
80 
81 static UChar* copySpec(const UChar* spec) {
82     int32_t len = 0;
83     while (spec[len] != END) {
84         ++len;
85     }
86     ++len;
87     UChar *result = (UChar *)uprv_malloc(len*sizeof(UChar));
88     // Check for memory allocation error.
89     if (result != NULL) {
90     	uprv_memcpy(result, spec, (size_t)len*sizeof(result[0]));
91     }
92     return result;
93 }
94 
95 /**
96  * Factory methods.  Ignore the context.
97  */
_createUnicode(const UnicodeString & ID,Transliterator::Token)98 static Transliterator* _createUnicode(const UnicodeString& ID, Transliterator::Token /*context*/) {
99     return new UnescapeTransliterator(ID, SPEC_Unicode);
100 }
_createJava(const UnicodeString & ID,Transliterator::Token)101 static Transliterator* _createJava(const UnicodeString& ID, Transliterator::Token /*context*/) {
102     return new UnescapeTransliterator(ID, SPEC_Java);
103 }
_createC(const UnicodeString & ID,Transliterator::Token)104 static Transliterator* _createC(const UnicodeString& ID, Transliterator::Token /*context*/) {
105     return new UnescapeTransliterator(ID, SPEC_C);
106 }
_createXML(const UnicodeString & ID,Transliterator::Token)107 static Transliterator* _createXML(const UnicodeString& ID, Transliterator::Token /*context*/) {
108     return new UnescapeTransliterator(ID, SPEC_XML);
109 }
_createXML10(const UnicodeString & ID,Transliterator::Token)110 static Transliterator* _createXML10(const UnicodeString& ID, Transliterator::Token /*context*/) {
111     return new UnescapeTransliterator(ID, SPEC_XML10);
112 }
_createPerl(const UnicodeString & ID,Transliterator::Token)113 static Transliterator* _createPerl(const UnicodeString& ID, Transliterator::Token /*context*/) {
114     return new UnescapeTransliterator(ID, SPEC_Perl);
115 }
_createAny(const UnicodeString & ID,Transliterator::Token)116 static Transliterator* _createAny(const UnicodeString& ID, Transliterator::Token /*context*/) {
117     return new UnescapeTransliterator(ID, SPEC_Any);
118 }
119 
120 /**
121  * Registers standard variants with the system.  Called by
122  * Transliterator during initialization.
123  */
registerIDs()124 void UnescapeTransliterator::registerIDs() {
125     Token t = integerToken(0);
126 
127     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Unicode"), _createUnicode, t);
128 
129     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Java"), _createJava, t);
130 
131     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/C"), _createC, t);
132 
133     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML"), _createXML, t);
134 
135     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML10"), _createXML10, t);
136 
137     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Perl"), _createPerl, t);
138 
139     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any"), _createAny, t);
140 }
141 
142 /**
143  * Constructor.  Takes the encoded spec array.
144  */
UnescapeTransliterator(const UnicodeString & newID,const UChar * newSpec)145 UnescapeTransliterator::UnescapeTransliterator(const UnicodeString& newID,
146                                                const UChar *newSpec) :
147     Transliterator(newID, NULL)
148 {
149     this->spec = copySpec(newSpec);
150 }
151 
152 /**
153  * Copy constructor.
154  */
UnescapeTransliterator(const UnescapeTransliterator & o)155 UnescapeTransliterator::UnescapeTransliterator(const UnescapeTransliterator& o) :
156     Transliterator(o) {
157     this->spec = copySpec(o.spec);
158 }
159 
~UnescapeTransliterator()160 UnescapeTransliterator::~UnescapeTransliterator() {
161     uprv_free(spec);
162 }
163 
164 /**
165  * Transliterator API.
166  */
clone() const167 UnescapeTransliterator* UnescapeTransliterator::clone() const {
168     return new UnescapeTransliterator(*this);
169 }
170 
171 /**
172  * Implements {@link Transliterator#handleTransliterate}.
173  */
handleTransliterate(Replaceable & text,UTransPosition & pos,UBool isIncremental) const174 void UnescapeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
175                                                  UBool isIncremental) const {
176     int32_t start = pos.start;
177     int32_t limit = pos.limit;
178     int32_t i, j, ipat;
179 
180     while (start < limit) {
181         // Loop over the forms in spec[].  Exit this loop when we
182         // match one of the specs.  Exit the outer loop if a
183         // partial match is detected and isIncremental is true.
184         for (j=0, ipat=0; spec[ipat] != END; ++j) {
185 
186             // Read the header
187             int32_t prefixLen = spec[ipat++];
188             int32_t suffixLen = spec[ipat++];
189             int8_t  radix     = (int8_t) spec[ipat++];
190             int32_t minDigits = spec[ipat++];
191             int32_t maxDigits = spec[ipat++];
192 
193             // s is a copy of start that is advanced over the
194             // characters as we parse them.
195             int32_t s = start;
196             UBool match = TRUE;
197 
198             for (i=0; i<prefixLen; ++i) {
199                 if (s >= limit) {
200                     if (i > 0) {
201                         // We've already matched a character.  This is
202                         // a partial match, so we return if in
203                         // incremental mode.  In non-incremental mode,
204                         // go to the next spec.
205                         if (isIncremental) {
206                             goto exit;
207                         }
208                         match = FALSE;
209                         break;
210                     }
211                 }
212                 UChar c = text.charAt(s++);
213                 if (c != spec[ipat + i]) {
214                     match = FALSE;
215                     break;
216                 }
217             }
218 
219             if (match) {
220                 UChar32 u = 0;
221                 int32_t digitCount = 0;
222                 for (;;) {
223                     if (s >= limit) {
224                         // Check for partial match in incremental mode.
225                         if (s > start && isIncremental) {
226                             goto exit;
227                         }
228                         break;
229                     }
230                     UChar32 ch = text.char32At(s);
231                     int32_t digit = u_digit(ch, radix);
232                     if (digit < 0) {
233                         break;
234                     }
235                     s += U16_LENGTH(ch);
236                     u = (u * radix) + digit;
237                     if (++digitCount == maxDigits) {
238                         break;
239                     }
240                 }
241 
242                 match = (digitCount >= minDigits);
243 
244                 if (match) {
245                     for (i=0; i<suffixLen; ++i) {
246                         if (s >= limit) {
247                             // Check for partial match in incremental mode.
248                             if (s > start && isIncremental) {
249                                 goto exit;
250                             }
251                             match = FALSE;
252                             break;
253                         }
254                         UChar c = text.charAt(s++);
255                         if (c != spec[ipat + prefixLen + i]) {
256                             match = FALSE;
257                             break;
258                         }
259                     }
260 
261                     if (match) {
262                         // At this point, we have a match
263                         UnicodeString str(u);
264                         text.handleReplaceBetween(start, s, str);
265                         limit -= s - start - str.length();
266                         // The following break statement leaves the
267                         // loop that is traversing the forms in
268                         // spec[].  We then parse the next input
269                         // character.
270                         break;
271                     }
272                 }
273             }
274 
275             ipat += prefixLen + suffixLen;
276         }
277 
278         if (start < limit) {
279             start += U16_LENGTH(text.char32At(start));
280         }
281     }
282 
283   exit:
284     pos.contextLimit += limit - pos.limit;
285     pos.limit = limit;
286     pos.start = start;
287 }
288 
289 U_NAMESPACE_END
290 
291 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
292 
293 //eof
294