1 // utf8.h: utilities for converting to and from UTF-8 2 // 3 // Copyright (C) 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc. 4 // 5 // This program is free software; you can redistribute it and/or modify 6 // it under the terms of the GNU General Public License as published by 7 // the Free Software Foundation; either version 3 of the License, or 8 // (at your option) any later version. 9 // 10 // This program is distributed in the hope that it will be useful, 11 // but WITHOUT ANY WARRANTY; without even the implied warranty of 12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 // GNU General Public License for more details. 14 // 15 // You should have received a copy of the GNU General Public License 16 // along with this program; if not, write to the Free Software 17 // Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 18 // 19 // Based on the public domain work of Thatcher Ulrich <tu@tulrich.com> 2004 20 21 #ifndef UTF8_H 22 #define UTF8_H 23 24 #include <string> 25 #include <cstdint> // for C99 int types 26 #include <vector> 27 28 #include "dsodefs.h" // For DSOEXPORT 29 30 // Android doesn't have any support for wide characters at all. 31 #ifdef __ANDROID__ 32 namespace std { 33 typedef basic_string 34 <wchar_t 35 ,std::char_traits<wchar_t> 36 ,std::allocator<wchar_t> > 37 wstring; 38 } 39 #endif 40 41 namespace gnash { 42 43 /// Utilities to convert between std::string and std::wstring. 44 // 45 /// Strings in Gnash are generally stored as std::strings. 46 /// We have to deal, however, with characters larger than standard 47 /// ASCII (128), which can be encoded in two different ways. 48 /// 49 /// SWF6 and later use UTF-8, encoded as multibyte characters and 50 /// allowing many thousands of unique codes. Multibyte characters are 51 /// difficult to handle, as their length - used for many string 52 /// operations - is not certain without parsing the string. 53 /// Converting the string to a wstring (generally a uint32_t - the 54 /// pp seems only to handle characters up to 65535 - two bytes is 55 /// the minimum size of a wchar) facilitates string operations, as 56 /// the length of the string is equal to the number of valid characters. 57 /// 58 /// SWF5 and earlier, however, used the ISO-8859 specification, 59 /// allowing the standard 128 ASCII characters plus 128 extra 60 /// characters that depend on the particular subset of ISO-8859. 61 /// Characters are 8 bits, not the ASCII standard 7. SWF5 cannot 62 /// handle multi-byte characters without special functions. 63 /// 64 /// It is important that SWF5 can distinguish between the two encodings, 65 /// so we cannot convert all strings to UTF-8. 66 // 67 /// Please note that, although this is called utf8, what the Adobe 68 /// player uses is only loosely related to real unicode, so the 69 /// encoding support here is correspondingly non-standard. 70 namespace utf8 { 71 72 /// Converts a std::string with multibyte characters into a std::wstring. 73 // 74 /// @return a version-dependent wstring. 75 /// @param str the canonical string to convert. 76 /// @param version the SWF version, used to decide how to decode the string. 77 // 78 /// For SWF5, UTF-8 (or any other) multibyte encoded characters are 79 /// converted char by char, mangling the string. 80 DSOEXPORT std::wstring decodeCanonicalString(const std::string& str, int version); 81 82 /// Converts a std::wstring into canonical std::string. 83 // 84 /// @return a version-dependent encoded std::string. 85 /// @param wstr the wide string to convert. 86 /// @param version the SWF version, used to decide how to encode the string. 87 /// 88 /// For SWF 5, each character is stored as an 8-bit (at least) char, rather 89 /// than converting it to a canonical UTF-8 byte sequence. Gnash can then 90 /// distinguish between 8-bit characters, which it handles correctly, and 91 /// multi-byte characters, which are regarded as multiple characters for 92 /// string methods. 93 DSOEXPORT std::string encodeCanonicalString(const std::wstring& wstr, int version); 94 95 /// Return the next Unicode character in the UTF-8 encoded string. 96 // 97 /// Invalid UTF-8 sequences produce a U+FFFD character 98 /// as output. Advances string iterator past the character 99 /// returned, unless the returned character is '\0', in which 100 /// case the iterator does not advance. 101 DSOEXPORT std::uint32_t decodeNextUnicodeCharacter(std::string::const_iterator& it, 102 const std::string::const_iterator& e); 103 104 /// \brief Encodes the given wide character into a canonical 105 /// string, theoretically up to 6 chars in length. 106 DSOEXPORT std::string encodeUnicodeCharacter(std::uint32_t ucs_character); 107 108 /// Encodes the given wide character into an at least 8-bit character. 109 // 110 /// Allows storage of Latin1 (ISO-8859-1) characters. This 111 /// is the format of SWF5 and below. 112 DSOEXPORT std::string encodeLatin1Character(std::uint32_t ucsCharacter); 113 114 enum TextEncoding { 115 encUNSPECIFIED, 116 encUTF8, 117 encUTF16BE, 118 encUTF16LE, 119 encUTF32BE, 120 encUTF32LE, 121 encSCSU, 122 encUTF7, 123 encUTFEBCDIC, 124 encBOCU1 125 }; 126 127 /// Interpret (and skip) Byte Order Mark in input stream 128 // 129 /// This function takes a pointer to a buffer and returns 130 /// the start of actual data after an eventual BOM. 131 /// No conversion is performed, no bytes copy, just skipping of 132 /// the BOM snippet and interpretation of it returned to the 133 /// encoding input parameter. 134 /// 135 /// See http://en.wikipedia.org/wiki/Byte-order_mark 136 /// 137 /// @param in 138 /// The input buffer. 139 /// 140 /// @param size 141 /// Size of the input buffer, will be decremented by the 142 /// size of the BOM, if any. 143 /// 144 /// @param encoding 145 /// Output parameter, will always be set. 146 /// encUNSPECIFIED if no BOM is found. 147 /// 148 /// @returns 149 /// A pointer either equal to 'in' or some bytes inside it. 150 /// 151 DSOEXPORT const char* stripBOM(const char* in, size_t& size, 152 TextEncoding& encoding); 153 154 /// Return name of a text encoding 155 DSOEXPORT const char* textEncodingName(TextEncoding enc); 156 157 enum EncodingGuess { 158 ENCGUESS_UNICODE = 0, 159 ENCGUESS_JIS = 1, 160 ENCGUESS_OTHER = 2 161 }; 162 163 /// Common code for guessing at the encoding of random text, between 164 // Shift-Jis, UTF8, and other. Puts the DisplayObject count in length, 165 // and the offsets to the DisplayObjects in offsets, if offsets is not NULL. 166 // If not NULL, offsets should be at least s.length(). 167 // offsets are not accurate if the return value is GUESSENC_OTHER 168 // 169 /// TODO: It's doubtful if this even works, and it may not be useful at 170 /// all. 171 DSOEXPORT EncodingGuess guessEncoding(const std::string& s, int& length, 172 std::vector<int>& offsets); 173 174 175 } // namespace utf8 176 } // namespace gnash 177 178 #endif // UTF8_H 179 180 181 // Local Variables: 182 // mode: C++ 183 // c-basic-offset: 8 184 // tab-width: 8 185 // indent-tabs-mode: t 186 // End: 187