1 // utf8.h: utilities for converting to and from UTF-8
2 //
3 //   Copyright (C) 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
4 //
5 // This program is free software; you can redistribute it and/or modify
6 // it under the terms of the GNU General Public License as published by
7 // the Free Software Foundation; either version 3 of the License, or
8 // (at your option) any later version.
9 //
10 // This program is distributed in the hope that it will be useful,
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 // GNU General Public License for more details.
14 //
15 // You should have received a copy of the GNU General Public License
16 // along with this program; if not, write to the Free Software
17 // Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
18 //
19 // Based on the public domain work of Thatcher Ulrich <tu@tulrich.com> 2004
20 
21 #ifndef UTF8_H
22 #define UTF8_H
23 
24 #include <string>
25 #include <cstdint> // for C99 int types
26 #include <vector>
27 
28 #include "dsodefs.h" // For DSOEXPORT
29 
30 // Android has no wide-character support at all, so std::wstring is declared by hand here.
31 #ifdef __ANDROID__
32 namespace std {
33 typedef basic_string
34    <wchar_t
35    ,std::char_traits<wchar_t>
36    ,std::allocator<wchar_t> >
37 wstring;
38 } // namespace std
39 #endif // __ANDROID__
40 
41 namespace gnash {
42 
43 /// Utilities to convert between std::string and std::wstring.
44 //
45 /// Strings in Gnash are generally stored as std::strings.
46 /// We have to deal, however, with characters larger than standard
47 /// ASCII (128), which can be encoded in two different ways.
48 ///
49 /// SWF6 and later use UTF-8, encoded as multibyte characters and
50 /// allowing many thousands of unique codes. Multibyte characters are
51 /// difficult to handle, as their length - used for many string
52 /// operations - is not certain without parsing the string.
53 /// Converting the string to a wstring (generally a uint32_t - the
54 /// proprietary (Adobe) player seems only to handle characters up to 65535 -
55 /// two bytes is the minimum size of a wchar) facilitates string operations,
56 /// as the length of the string is equal to the number of valid characters.
57 ///
58 /// SWF5 and earlier, however, used the ISO-8859 specification,
59 /// allowing the standard 128 ASCII characters plus 128 extra
60 /// characters that depend on the particular subset of ISO-8859.
61 /// Characters are 8 bits, not the ASCII standard 7. SWF5 cannot
62 /// handle multi-byte characters without special functions.
63 ///
64 /// It is important that SWF5 can distinguish between the two encodings,
65 /// so we cannot convert all strings to UTF-8.
66 //
67 /// Please note that, although this is called utf8, what the Adobe
68 /// player uses is only loosely related to real unicode, so the
69 /// encoding support here is correspondingly non-standard.
70 namespace utf8 {
71 
72     /// Decodes a canonical (possibly multibyte) std::string into a std::wstring.
73     //
74     /// @param str     the canonical string to convert.
75     /// @param version the SWF version, used to decide how to decode the string.
76     /// @return        a version-dependent wstring decoded from str.
77     //
78     /// For SWF5, UTF-8 (or any other) multibyte encoded characters are
79     /// converted char by char, so such sequences come out mangled.
80     DSOEXPORT std::wstring decodeCanonicalString(const std::string& str, int version);
81 
82     /// Encodes a std::wstring into a canonical std::string.
83     //
84     /// @param wstr    the wide string to convert.
85     /// @param version the SWF version, used to decide how to encode the string.
86     /// @return        a version-dependent encoded std::string.
87     ///
88     /// For SWF 5, each character is stored as one char of at least 8 bits
89     /// instead of being converted to a canonical UTF-8 byte sequence. Gnash
90     /// can then tell apart 8-bit characters, which it handles correctly, from
91     /// multi-byte characters, which string methods regard as several
92     /// separate characters.
93     DSOEXPORT std::string encodeCanonicalString(const std::wstring& wstr, int version);
94 
95     /// Return the next Unicode character in the UTF-8 encoded string.
96     //
97     /// @param it  current position; advanced past the character returned,
98     ///            unless that character is '\0' (then it does not advance).
99     /// @param e   presumably the end of the string being decoded - confirm.
100     /// @return    the decoded character; invalid UTF-8 sequences give U+FFFD.
101     DSOEXPORT std::uint32_t decodeNextUnicodeCharacter(std::string::const_iterator& it,
102                                                      const std::string::const_iterator& e);
103 
104     /// \brief Encodes the wide character ucs_character into a canonical
105     /// string and returns it; theoretically up to 6 chars in length.
106     DSOEXPORT std::string encodeUnicodeCharacter(std::uint32_t ucs_character);
107 
108     /// Encodes the wide character ucsCharacter into an at least 8-bit character.
109     //
110     /// The result is returned as a std::string. Allows storage of Latin1
111     /// (ISO-8859-1) characters. This is the format of SWF5 and below.
112     DSOEXPORT std::string encodeLatin1Character(std::uint32_t ucsCharacter);
113 
114     enum TextEncoding { // Encoding type set by stripBOM() and named by textEncodingName().
115         encUNSPECIFIED, // What stripBOM() reports when no BOM is found.
116         encUTF8,
117         encUTF16BE,
118         encUTF16LE,
119         encUTF32BE,
120         encUTF32LE,
121         encSCSU,
122         encUTF7,
123         encUTFEBCDIC,
124         encBOCU1
125     };
126 
127     /// Interpret (and skip) a Byte Order Mark in the input buffer
128     //
129     /// This function takes a pointer to a buffer and returns
130     /// the start of the actual data after a BOM, if one is present.
131     /// No conversion is performed and no bytes are copied: the BOM
132     /// is just skipped, and its interpretation is returned through
133     /// the encoding output parameter.
134     ///
135     /// See http://en.wikipedia.org/wiki/Byte-order_mark
136     ///
137     /// @param in
138     ///    The input buffer.
139     ///
140     /// @param size
141     ///    Size of the input buffer; it is decremented by the
142     ///    size of the BOM, if any.
143     ///
144     /// @param encoding
145     ///    Output parameter, will always be set.
146     ///    encUNSPECIFIED if no BOM is found.
147     ///
148     /// @returns
149     ///    A pointer either equal to 'in' or some bytes inside it.
150     ///
151     DSOEXPORT const char* stripBOM(const char* in, size_t& size,
152                                    TextEncoding& encoding);
153 
154     /// Return the name of the text encoding enc, as a C string.
155     DSOEXPORT const char* textEncodingName(TextEncoding enc);
156 
157     enum EncodingGuess { // Return type of guessEncoding().
158         ENCGUESS_UNICODE = 0, // presumably the "UTF8" guess - confirm in the implementation
159         ENCGUESS_JIS = 1, // presumably the "Shift-Jis" guess - confirm in the implementation
160         ENCGUESS_OTHER = 2 // any other encoding; guessEncoding() offsets are then not accurate
161     };
162 
163     /// Guess at the encoding of random text: Shift-Jis, UTF8, or other.
164     //
165     /// Puts the character count in length and the offsets to the characters
166     /// in offsets; offsets are not accurate if ENCGUESS_OTHER is returned.
167     /// NOTE(review): "character" read "DisplayObject" before (confirm); old text
168     /// also wanted offsets non-NULL, >= s.length() long - it is a vector& now.
169     /// TODO: It's doubtful if this even works, and it may not be useful at
170     /// all.
171     DSOEXPORT EncodingGuess guessEncoding(const std::string& s, int& length,
172             std::vector<int>& offsets);
173 
174 
175 } // namespace utf8
176 } // namespace gnash
177 
178 #endif // UTF8_H
179 
180 
181 // Local Variables:
182 // mode: C++
183 // c-basic-offset: 8
184 // tab-width: 8
185 // indent-tabs-mode: t
186 // End:
187