1 // utf8.cpp: utilities for converting to and from UTF-8
2 //
3 //   Copyright (C) 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
4 //
5 // This program is free software; you can redistribute it and/or modify
6 // it under the terms of the GNU General Public License as published by
7 // the Free Software Foundation; either version 3 of the License, or
8 // (at your option) any later version.
9 //
10 // This program is distributed in the hope that it will be useful,
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 // GNU General Public License for more details.
14 //
15 // You should have received a copy of the GNU General Public License
16 // along with this program; if not, write to the Free Software
17 // Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
18 //
19 // Based on the public domain work of Thatcher Ulrich <tu@tulrich.com> 2004
20 //
21 // Much useful info at "UTF-8 and Unicode FAQ" http://www.cl.cam.ac.uk/~mgk25/unicode.html
22 
23 #include "utf8.h"
24 
25 #include <limits>
26 #include <cstdint>
27 #include <string>
28 #include <vector>
29 #include <cstdlib>
30 
31 namespace gnash {
32 namespace utf8 {
33 
namespace {
    // Sentinel returned by the decoder for a malformed sequence: the
    // largest value a std::uint32_t can hold.
    constexpr std::uint32_t invalid =
        std::numeric_limits<std::uint32_t>::max();
}
37 
38 std::wstring
decodeCanonicalString(const std::string & str,int version)39 decodeCanonicalString(const std::string& str, int version)
40 {
41 
42     std::wstring wstr;
43 
44     std::string::const_iterator it = str.begin(), e = str.end();
45 
46     if (version > 5) {
47         while (std::uint32_t code = decodeNextUnicodeCharacter(it, e)) {
48             if (code == invalid) {
49                 continue;
50             }
51             wstr.push_back(static_cast<wchar_t>(code));
52         }
53     }
54     else {
55         while (it != str.end()) {
56             // This mangles UTF-8 (UCS4) strings, but is what is
57             // wanted for SWF5.
58             wstr.push_back(static_cast<unsigned char>(*it++));
59         }
60     }
61 
62     return wstr;
63 
64 }
65 
66 std::string
encodeCanonicalString(const std::wstring & wstr,int version)67 encodeCanonicalString(const std::wstring& wstr, int version)
68 {
69 
70     std::string str;
71 
72     std::wstring::const_iterator it = wstr.begin();
73     while ( it != wstr.end())
74     {
75         if (version > 5) str.append(encodeUnicodeCharacter(*it++));
76         else str.append(encodeLatin1Character(*it++));
77     }
78 
79     return str;
80 
81 }
82 
// Produce a one-byte string holding the low 8 bits of ucsCharacter.
// Any higher bits are discarded.
std::string
encodeLatin1Character(std::uint32_t ucsCharacter)
{
    const unsigned char lowByte = static_cast<unsigned char>(ucsCharacter);
    return std::string(1, static_cast<char>(lowByte));
}
90 
91 
92 std::uint32_t
decodeNextUnicodeCharacter(std::string::const_iterator & it,const std::string::const_iterator & e)93 decodeNextUnicodeCharacter(std::string::const_iterator& it,
94                              const std::string::const_iterator& e)
95 {
96     std::uint32_t uc;
97 
98     // Security considerations:
99     //
100     // If we hit a zero byte, we want to return 0 without stepping
101     // the buffer pointer past the 0.
102     //
103     // If we hit an "overlong sequence"; i.e. a character encoded
104     // in a longer multibyte string than is necessary, then we
105     // need to discard the character.  This is so attackers can't
106     // disguise dangerous characters or character sequences --
107     // there is only one valid encoding for each character.
108     //
109     // If we decode characters { 0xD800 .. 0xDFFF } or { 0xFFFE,
110     // 0xFFFF } then we ignore them; they are not valid in UTF-8.
111 
112 #define FIRST_BYTE(mask, shift)        \
113     /* Post-increment iterator */ \
114     uc = (*it++ & (mask)) << (shift);
115 
116 #define NEXT_BYTE(shift)                        \
117                     \
118     if (it == e || *it == 0) return 0; /* end of buffer, do not advance */    \
119     if ((*it & 0xC0) != 0x80) return invalid; /* standard check */    \
120     /* Post-increment iterator: */        \
121     uc |= (*it++ & 0x3F) << shift;
122 
123     if (it == e || *it == 0) return 0;    // End of buffer.  Do not advance.
124 
125     // Conventional 7-bit ASCII; return and increment iterator:
126     if ((*it & 0x80) == 0) return static_cast<std::uint32_t>(*it++);
127 
128     // Multi-byte sequences
129     if ((*it & 0xE0) == 0xC0) {
130         // Two-byte sequence.
131         FIRST_BYTE(0x1F, 6);
132         NEXT_BYTE(0);
133         if (uc < 0x80) return invalid;    // overlong
134         return uc;
135     }
136     else if ((*it & 0xF0) == 0xE0) {
137         // Three-byte sequence.
138         FIRST_BYTE(0x0F, 12);
139         NEXT_BYTE(6);
140         NEXT_BYTE(0);
141         if (uc < 0x800) {
142             return invalid;
143         }
144         return uc;
145     }
146     else if ((*it & 0xF8) == 0xF0) {
147         // Four-byte sequence.
148         FIRST_BYTE(0x07, 18);
149         NEXT_BYTE(12);
150         NEXT_BYTE(6);
151         NEXT_BYTE(0);
152         if (uc < 0x010000) return invalid;    // overlong
153         return uc;
154     }
155     else {
156         // Invalid.
157         it++;
158         return invalid;
159     }
160 }
161 
162 // TODO: buffer as std::string; index (iterator);
163 
// Encode one code point as a UTF-8 byte sequence.
//
// Values up to 0x7F give one byte, up to 0x7FF two, up to 0xFFFF three
// and up to 0x1FFFFF four. Anything larger yields an empty string.
std::string
encodeUnicodeCharacter(std::uint32_t ucs_character)
{
    // Plain single-byte ASCII.
    if (ucs_character <= 0x7F) {
        return std::string(1, static_cast<char>(ucs_character));
    }

    int continuation = 0;        // number of 10xxxxxx bytes to emit
    std::uint32_t prefix = 0;    // marker bits of the lead byte

    if (ucs_character <= 0x7FF) {
        continuation = 1;
        prefix = 0xC0;
    }
    else if (ucs_character <= 0xFFFF) {
        continuation = 2;
        prefix = 0xE0;
    }
    else if (ucs_character <= 0x1FFFFF) {
        continuation = 3;
        prefix = 0xF0;
    }
    else {
        // Invalid char; don't encode anything.
        return std::string();
    }

    std::string encoded;

    // Lead byte: marker bits plus the topmost payload bits.
    encoded.push_back(
        static_cast<char>(prefix | (ucs_character >> (6 * continuation))));

    // Continuation bytes, most significant six bits first.
    for (int shift = 6 * (continuation - 1); shift >= 0; shift -= 6) {
        encoded.push_back(
            static_cast<char>(0x80 | ((ucs_character >> shift) & 0x3F)));
    }

    return encoded;
}
200 
201 
// Numeric encoding identifiers. Nothing in this file refers to them;
// stripBOM() reports its result through the TextEncoding enumerators
// instead.
#define ENC_DEFAULT 0
#define ENC_UTF8 1
#define ENC_UTF16BE 2
#define ENC_UTF16LE 3
206 
207 const char*
stripBOM(const char * in,size_t & size,TextEncoding & encoding)208 stripBOM(const char* in, size_t& size, TextEncoding& encoding)
209 {
210     encoding = encUNSPECIFIED;
211     if ( size > 2 )
212     {
213         // need *ptr to be unsigned or cast all 0xNN
214         const unsigned char* ptr = reinterpret_cast<const unsigned char*>(in);
215 
216         if (*ptr == 0xFF && *(ptr+1) == 0xFE) {
217             // Text is UTF-16 LE
218             encoding = encUTF16LE;
219             in+=2;
220             size-=2;
221         }
222         else if ( *ptr == 0xFE && *(ptr+1) == 0xFF )
223         {
224             // Text is UTF-16 BE
225             encoding = encUTF16BE;
226             in+=2;
227             size-=2;
228         }
229         else if (size > 3 && *ptr == 0xEF && *(ptr+1) == 0xBB &&
230                 *(ptr+2) == 0xBF )
231         {
232             // Text is UTF-8
233             encoding = encUTF8;
234             in+=3;
235             size-=3;
236         }
237         else if ( size > 4 && *ptr == 0x00 && *(ptr+1) == 0x00 &&
238                 *(ptr+2) == 0xFE && *(ptr+3) == 0xFF )
239         {
240             // Text is UTF-32 BE
241             encoding = encUTF32BE;
242             in+=4;
243             size-=4;
244         }
245         else if ( size > 4 && *ptr == 0xFF && *(ptr+1) == 0xFE &&
246                 *(ptr+2) == 0x00 && *(ptr+3) == 0x00 )
247         {
248             // Text is UTF-32 LE
249             encoding = encUTF32LE;
250             in+=4;
251             size-=4;
252         }
253 
254         // TODO: check other kinds of boms !
255         // See http://en.wikipedia.org/wiki/Byte-order_mark#Representations_of_byte_order_marks_by_encoding
256     }
257 
258     return in;
259 }
260 
261 const char*
textEncodingName(TextEncoding enc)262 textEncodingName(TextEncoding enc)
263 {
264     switch (enc)
265     {
266         case encUNSPECIFIED: return "Unspecified";
267         case encUTF8: return "UTF8";
268         case encUTF16BE: return "UTF16BE";
269         case encUTF16LE: return "UTF16LE";
270         case encUTF32BE: return "UTF32BE";
271         case encUTF32LE: return "UTF32LE";
272         case encSCSU: return "SCSU";
273         case encUTF7: return "UTF7";
274         case encUTFEBCDIC: return "UTFEBCDIC";
275         case encBOCU1: return "BOCU1";
276         default: return "INVALID";
277     }
278 }
279 
280 EncodingGuess
guessEncoding(const std::string & str,int & length,std::vector<int> & offsets)281 guessEncoding(const std::string &str, int &length, std::vector<int>& offsets)
282 {
283     int width = 0; // The remaining width, not the total.
284     bool is_sought = true;
285 
286     std::string::const_iterator it = str.begin();
287     const std::string::const_iterator e = str.end();
288 
289     length = 0;
290 
291     // First, assume it's UTF8 and try to be wrong.
292     while (it != e && is_sought) {
293         ++length;
294 
295         offsets.push_back(it - str.begin()); // current position
296 
297         // Advances the iterator to point to the next
298         std::uint32_t c = utf8::decodeNextUnicodeCharacter(it, e);
299 
300         if (c == utf8::invalid) {
301             is_sought = false;
302             break;
303         }
304     }
305 
306     offsets.push_back(it - str.begin()); // current position
307 
308     if (it == e && is_sought) {
309         // No characters left, so it's almost certainly UTF8.
310         return ENCGUESS_UNICODE;
311     }
312 
313     it = str.begin();
314     int index = 0;
315     is_sought = true;
316     width = 0;
317     length = 0;
318     bool was_odd = true;
319     bool was_even = true;
320     // Now, assume it's SHIFT_JIS and try to be wrong.
321     while (it != e && is_sought) {
322         int c = static_cast<int> (*it);
323 
324         if (width) {
325             --width;
326             if ((c < 0x40) || ((c < 0x9F) && was_even) ||
327                 ((c > 0x9E) && was_odd) || (c == 0x7F)) {
328                 is_sought = false;
329             }
330             continue;
331         }
332 
333         ++length;
334         offsets.push_back(index); // [length - 1] = index;
335 
336         if ((c == 0x80) || (c == 0xA0) || (c >= 0xF0)) {
337             is_sought = false;
338             break;
339         }
340 
341         if (((c >= 0x81) && (c <= 0x9F)) || ((c >= 0xE0) && (c <= 0xEF))) {
342             width = 1;
343             was_odd = c & 0x01;
344             was_even = !was_odd;
345         }
346 
347         ++it;
348         ++index;
349     }
350     offsets.push_back(index); // [length - 1] = index;
351 
352     if (!width && is_sought) {
353         // No width left, so it's probably SHIFT_JIS.
354         return ENCGUESS_JIS;
355     }
356 
357     // It's something else.
358 #ifdef ANDROID
359     length = str.size();
360 #else
361     length = std::mbstowcs(nullptr, str.c_str(), 0);
362 #endif
363     if (length == -1)
364     {
365         length = str.length();
366     }
367     return ENCGUESS_OTHER;
368 }
369 
370 
371 } // namespace utf8
372 } // namespace gnash
373 
374 // Local Variables:
375 // mode: C++
376 // c-basic-offset: 8
377 // tab-width: 8
378 // indent-tabs-mode: t
379 // End:
380