1 #include <utils/encoding.hpp> 2 3 #include <utils/scopeguard.hpp> 4 5 #include <stdexcept> 6 7 #include <cassert> 8 #include <string.h> 9 #include <iconv.h> 10 #include <cerrno> 11 12 #include <map> 13 #include <bitset> 14 15 /** 16 * The UTF-8-encoded character used as a place holder when a character conversion fails. 17 * This is U+FFFD � "replacement character" 18 */ 19 static const char* invalid_char = "\xef\xbf\xbd"; 20 static const size_t invalid_char_len = 3; 21 22 namespace utils 23 { 24 /** 25 * Based on http://en.wikipedia.org/wiki/UTF-8#Description 26 */ get_next_codepoint_size(const unsigned char c)27 std::size_t get_next_codepoint_size(const unsigned char c) 28 { 29 if ((c & 0b11111000) == 0b11110000) // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 30 return 4; 31 else if ((c & 0b11110000) == 0b11100000) // 3 bytes: 1110xxx 10xxxxxx 10xxxxxx 32 return 3; 33 else if ((c & 0b11100000) == 0b11000000) // 2 bytes: 110xxxxx 10xxxxxx 34 return 2; 35 return 1; // 1 byte: 0xxxxxxx 36 } 37 is_valid_utf8(const char * s)38 bool is_valid_utf8(const char* s) 39 { 40 if (!s) 41 return false; 42 43 const unsigned char* str = reinterpret_cast<const unsigned char*>(s); 44 45 while (*str) 46 { 47 const auto codepoint_size = get_next_codepoint_size(str[0]); 48 if (codepoint_size == 4) 49 { 50 if (!str[1] || !str[2] || !str[3] 51 || ((str[1] & 0b11000000u) != 0b10000000u) 52 || ((str[2] & 0b11000000u) != 0b10000000u) 53 || ((str[3] & 0b11000000u) != 0b10000000u)) 54 return false; 55 } 56 else if (codepoint_size == 3) 57 { 58 if (!str[1] || !str[2] 59 || ((str[1] & 0b11000000u) != 0b10000000u) 60 || ((str[2] & 0b11000000u) != 0b10000000u)) 61 return false; 62 } 63 else if (codepoint_size == 2) 64 { 65 if (!str[1] || 66 ((str[1] & 0b11000000) != 0b10000000)) 67 return false; 68 } 69 else if ((str[0] & 0b10000000) != 0) 70 return false; 71 str += codepoint_size; 72 } 73 return true; 74 } 75 remove_invalid_xml_chars(const std::string & original)76 std::string remove_invalid_xml_chars(const std::string& original) 77 { 78 // The given string MUST be a valid utf-8 string 79 std::vector<char> res(original.size(), '\0'); 80 81 // pointer where we write valid chars 82 char* r = res.data(); 83 84 const unsigned char* str = reinterpret_cast<const unsigned char*>(original.c_str()); 85 std::bitset<20> codepoint; 86 87 while (*str) 88 { 89 // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 90 if ((str[0] & 0b11111000) == 0b11110000) 91 { 92 codepoint = ((str[0] & 0b00000111u) << 18u); 93 codepoint |= ((str[1] & 0b00111111u) << 12u); 94 codepoint |= ((str[2] & 0b00111111u) << 6u ); 95 codepoint |= ((str[3] & 0b00111111u) << 0u ); 96 if (codepoint.to_ulong() <= 0x10FFFF) 97 { 98 ::memcpy(r, str, 4); 99 r += 4; 100 } 101 str += 4; 102 } 103 // 3 bytes: 1110xxx 10xxxxxx 10xxxxxx 104 else if ((str[0] & 0b11110000) == 0b11100000) 105 { 106 codepoint = ((str[0] & 0b00001111u) << 12u); 107 codepoint |= ((str[1] & 0b00111111u) << 6u); 108 codepoint |= ((str[2] & 0b00111111u) << 0u ); 109 if (codepoint.to_ulong() <= 0xD7FF || 110 (codepoint.to_ulong() >= 0xE000 && codepoint.to_ulong() <= 0xFFFD)) 111 { 112 ::memcpy(r, str, 3); 113 r += 3; 114 } 115 str += 3; 116 } 117 // 2 bytes: 110xxxxx 10xxxxxx 118 else if (((str[0]) & 0b11100000) == 0b11000000) 119 { 120 // All 2 bytes char are valid, don't even bother calculating 121 // the codepoint 122 ::memcpy(r, str, 2); 123 r += 2; 124 str += 2; 125 } 126 // 1 byte: 0xxxxxxx 127 else if ((str[0] & 0b10000000) == 0) 128 { 129 codepoint = ((str[0] & 0b01111111)); 130 if (codepoint.to_ulong() == 0x09 || 131 codepoint.to_ulong() == 0x0A || 132 codepoint.to_ulong() == 0x0D || 133 codepoint.to_ulong() >= 0x20) 134 { 135 ::memcpy(r, str, 1); 136 r += 1; 137 } 138 str += 1; 139 } 140 else 141 throw std::runtime_error("Invalid UTF-8 passed to remove_invalid_xml_chars"); 142 } 143 return {res.data(), static_cast<size_t>(r - res.data())}; 144 } 145 convert_to_utf8(const std::string & str,const char * charset)146 std::string convert_to_utf8(const std::string& str, const char* charset) 147 { 148 std::string res; 149 150 const iconv_t cd = iconv_open("UTF-8", charset); 151 if (cd == (iconv_t)-1) 152 throw std::runtime_error("Cannot convert into UTF-8"); 153 154 // Make sure cd is always closed when we leave this function 155 const auto sg = utils::make_scope_guard([&cd](){ iconv_close(cd); }); 156 157 size_t inbytesleft = str.size(); 158 159 // iconv will not attempt to modify this buffer, but some plateform 160 // require a char** anyway 161 #ifdef ICONV_SECOND_ARGUMENT_IS_CONST 162 const char* inbuf_ptr = str.c_str(); 163 #else 164 char* inbuf_ptr = const_cast<char*>(str.c_str()); 165 #endif 166 167 size_t outbytesleft = str.size() * 4; 168 char* outbuf = new char[outbytesleft]; 169 char* outbuf_ptr = outbuf; 170 171 // Make sure outbuf is always deleted when we leave this function 172 const auto sg2 = utils::make_scope_guard([outbuf](){ delete[] outbuf; }); 173 174 bool done = false; 175 while (done == false) 176 { 177 size_t error = iconv(cd, &inbuf_ptr, &inbytesleft, &outbuf_ptr, &outbytesleft); 178 if ((size_t)-1 == error) 179 { 180 switch (errno) 181 { 182 case EILSEQ: 183 // Invalid byte found. Insert a placeholder instead of the 184 // converted character, jump one byte and continue 185 memcpy(outbuf_ptr, invalid_char, invalid_char_len); 186 outbuf_ptr += invalid_char_len; 187 inbytesleft--; 188 inbuf_ptr++; 189 break; 190 case EINVAL: 191 // A multibyte sequence is not terminated, but we can't 192 // provide any more data, so we just add a placeholder to 193 // indicate that the character is not properly converted, 194 // and we stop the conversion 195 memcpy(outbuf_ptr, invalid_char, invalid_char_len); 196 outbuf_ptr += invalid_char_len; 197 outbuf_ptr++; 198 done = true; 199 break; 200 case E2BIG: // This should never happen 201 default: // This should happen even neverer 202 done = true; 203 break; 204 } 205 } 206 else 207 { 208 // The conversion finished without any error, stop converting 209 done = true; 210 } 211 } 212 // Terminate the converted buffer, and copy that buffer it into the 213 // string we return 214 *outbuf_ptr = '\0'; 215 res = outbuf; 216 return res; 217 } 218 219 } 220 221 namespace xep0106 222 { 223 static const std::map<const char, const std::string> encode_map = { 224 {' ', "\\20"}, 225 {'"', "\\22"}, 226 {'&', "\\26"}, 227 {'\'',"\\27"}, 228 {'/', "\\2f"}, 229 {':', "\\3a"}, 230 {'<', "\\3c"}, 231 {'>', "\\3e"}, 232 {'@', "\\40"}, 233 }; 234 decode(std::string & s)235 void decode(std::string& s) 236 { 237 std::string::size_type pos; 238 for (const auto& pair: encode_map) 239 while ((pos = s.find(pair.second)) != std::string::npos) 240 s.replace(pos, pair.second.size(), 241 1, pair.first); 242 } 243 encode(std::string & s)244 void encode(std::string& s) 245 { 246 std::string::size_type pos; 247 while ((pos = s.find_first_of(" \"&'/:<>@")) != std::string::npos) 248 { 249 auto it = encode_map.find(s[pos]); 250 assert(it != encode_map.end()); 251 s.replace(pos, 1, it->second); 252 } 253 } 254 } 255