1 #include <utils/encoding.hpp>
2 
3 #include <utils/scopeguard.hpp>
4 
5 #include <stdexcept>
6 
7 #include <cassert>
8 #include <string.h>
9 #include <iconv.h>
10 #include <cerrno>
11 
12 #include <map>
13 #include <bitset>
14 
/**
 * UTF-8 encoding of U+FFFD "replacement character". It is written to the
 * output as a placeholder whenever a character conversion fails.
 *
 * Declared as an array (not a pointer) so that the placeholder cannot be
 * re-pointed and its length can be derived from the literal itself.
 */
static const char invalid_char[] = "\xef\xbf\xbd";
// Number of bytes in the placeholder, not counting the terminating NUL.
static const size_t invalid_char_len = sizeof(invalid_char) - 1;
21 
22 namespace utils
23 {
24   /**
25    * Based on http://en.wikipedia.org/wiki/UTF-8#Description
26    */
get_next_codepoint_size(const unsigned char c)27   std::size_t get_next_codepoint_size(const unsigned char c)
28   {
29     if ((c & 0b11111000) == 0b11110000)          // 4 bytes:  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
30       return 4;
31     else if ((c & 0b11110000) == 0b11100000)     // 3 bytes:  1110xxx 10xxxxxx 10xxxxxx
32       return 3;
33     else if ((c & 0b11100000) == 0b11000000)     // 2 bytes:  110xxxxx 10xxxxxx
34       return 2;
35     return 1;                                    // 1 byte:  0xxxxxxx
36   }
37 
is_valid_utf8(const char * s)38   bool is_valid_utf8(const char* s)
39   {
40     if (!s)
41       return false;
42 
43     const unsigned char* str = reinterpret_cast<const unsigned char*>(s);
44 
45     while (*str)
46       {
47         const auto codepoint_size = get_next_codepoint_size(str[0]);
48         if (codepoint_size == 4)
49           {
50             if (!str[1] || !str[2] || !str[3]
51                 || ((str[1] & 0b11000000u) != 0b10000000u)
52                 || ((str[2] & 0b11000000u) != 0b10000000u)
53                 || ((str[3] & 0b11000000u) != 0b10000000u))
54               return false;
55           }
56         else if (codepoint_size == 3)
57           {
58             if (!str[1] || !str[2]
59                 || ((str[1] & 0b11000000u) != 0b10000000u)
60                 || ((str[2] & 0b11000000u) != 0b10000000u))
61               return false;
62           }
63         else if (codepoint_size == 2)
64           {
65             if (!str[1] ||
66                 ((str[1] & 0b11000000) != 0b10000000))
67               return false;
68           }
69         else if ((str[0] & 0b10000000) != 0)
70           return false;
71         str += codepoint_size;
72       }
73     return true;
74   }
75 
remove_invalid_xml_chars(const std::string & original)76   std::string remove_invalid_xml_chars(const std::string& original)
77   {
78     // The given string MUST be a valid utf-8 string
79     std::vector<char> res(original.size(), '\0');
80 
81     // pointer where we write valid chars
82     char* r = res.data();
83 
84     const unsigned char* str = reinterpret_cast<const unsigned char*>(original.c_str());
85     std::bitset<20> codepoint;
86 
87     while (*str)
88       {
89         // 4 bytes:  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
90         if ((str[0] & 0b11111000) == 0b11110000)
91           {
92             codepoint  = ((str[0] & 0b00000111u) << 18u);
93             codepoint |= ((str[1] & 0b00111111u) << 12u);
94             codepoint |= ((str[2] & 0b00111111u) << 6u );
95             codepoint |= ((str[3] & 0b00111111u) << 0u );
96             if (codepoint.to_ulong() <= 0x10FFFF)
97               {
98                 ::memcpy(r, str, 4);
99                 r += 4;
100               }
101             str += 4;
102           }
103         // 3 bytes:  1110xxx 10xxxxxx 10xxxxxx
104         else if ((str[0] & 0b11110000) == 0b11100000)
105           {
106             codepoint  = ((str[0] & 0b00001111u) << 12u);
107             codepoint |= ((str[1] & 0b00111111u) << 6u);
108             codepoint |= ((str[2] & 0b00111111u) << 0u );
109             if (codepoint.to_ulong() <= 0xD7FF ||
110                 (codepoint.to_ulong() >= 0xE000 && codepoint.to_ulong() <= 0xFFFD))
111               {
112                 ::memcpy(r, str, 3);
113                 r += 3;
114               }
115             str += 3;
116           }
117         // 2 bytes:  110xxxxx 10xxxxxx
118         else if (((str[0]) & 0b11100000) == 0b11000000)
119           {
120             // All 2 bytes char are valid, don't even bother calculating
121             // the codepoint
122             ::memcpy(r, str, 2);
123             r += 2;
124             str += 2;
125           }
126         // 1 byte:  0xxxxxxx
127         else if ((str[0] & 0b10000000) == 0)
128           {
129             codepoint = ((str[0] & 0b01111111));
130             if (codepoint.to_ulong() == 0x09 ||
131                 codepoint.to_ulong() == 0x0A ||
132                 codepoint.to_ulong() == 0x0D ||
133                 codepoint.to_ulong() >= 0x20)
134               {
135                 ::memcpy(r, str, 1);
136                 r += 1;
137               }
138             str += 1;
139           }
140         else
141           throw std::runtime_error("Invalid UTF-8 passed to remove_invalid_xml_chars");
142       }
143     return {res.data(), static_cast<size_t>(r - res.data())};
144   }
145 
convert_to_utf8(const std::string & str,const char * charset)146   std::string convert_to_utf8(const std::string& str, const char* charset)
147   {
148     std::string res;
149 
150     const iconv_t cd = iconv_open("UTF-8", charset);
151     if (cd == (iconv_t)-1)
152       throw std::runtime_error("Cannot convert into UTF-8");
153 
154     // Make sure cd is always closed when we leave this function
155     const auto sg = utils::make_scope_guard([&cd](){ iconv_close(cd); });
156 
157     size_t inbytesleft = str.size();
158 
159     // iconv will not attempt to modify this buffer, but some plateform
160     // require a char** anyway
161 #ifdef ICONV_SECOND_ARGUMENT_IS_CONST
162     const char* inbuf_ptr = str.c_str();
163 #else
164     char* inbuf_ptr = const_cast<char*>(str.c_str());
165 #endif
166 
167     size_t outbytesleft = str.size() * 4;
168     char* outbuf = new char[outbytesleft];
169     char* outbuf_ptr = outbuf;
170 
171     // Make sure outbuf is always deleted when we leave this function
172     const auto sg2 = utils::make_scope_guard([outbuf](){ delete[] outbuf; });
173 
174     bool done = false;
175     while (done == false)
176       {
177         size_t error = iconv(cd, &inbuf_ptr, &inbytesleft, &outbuf_ptr, &outbytesleft);
178         if ((size_t)-1 == error)
179           {
180             switch (errno)
181               {
182               case EILSEQ:
183                 // Invalid byte found. Insert a placeholder instead of the
184                 // converted character, jump one byte and continue
185                 memcpy(outbuf_ptr, invalid_char, invalid_char_len);
186                 outbuf_ptr += invalid_char_len;
187                 inbytesleft--;
188                 inbuf_ptr++;
189                 break;
190               case EINVAL:
191                 // A multibyte sequence is not terminated, but we can't
192                 // provide any more data, so we just add a placeholder to
193                 // indicate that the character is not properly converted,
194                 // and we stop the conversion
195                 memcpy(outbuf_ptr, invalid_char, invalid_char_len);
196                 outbuf_ptr += invalid_char_len;
197                 outbuf_ptr++;
198                 done = true;
199                 break;
200               case E2BIG:  // This should never happen
201               default:     // This should happen even neverer
202                 done = true;
203                 break;
204               }
205           }
206         else
207           {
208             // The conversion finished without any error, stop converting
209             done = true;
210           }
211       }
212     // Terminate the converted buffer, and copy that buffer it into the
213     // string we return
214     *outbuf_ptr = '\0';
215     res = outbuf;
216     return res;
217   }
218 
219 }
220 
/**
 * JID escaping helpers (XEP-0106).
 *
 * Only the nine characters listed in `encode_map` are handled; the
 * backslash itself (`\5c` in the XEP) is neither escaped nor unescaped.
 */
namespace xep0106
{
  // Character to escape -> its 3-character escape sequence. This table is
  // the single source of truth for both encode() and decode().
  static const std::map<char, std::string> encode_map = {
    {' ', "\\20"},
    {'"', "\\22"},
    {'&', "\\26"},
    {'\'',"\\27"},
    {'/', "\\2f"},
    {':', "\\3a"},
    {'<', "\\3c"},
    {'>', "\\3e"},
    {'@', "\\40"},
  };

  // Returns the entry of encode_map whose escape sequence starts at
  // s[pos], or encode_map.end() if there is none.
  static std::map<char, std::string>::const_iterator
  escape_at(const std::string& s, const std::string::size_type pos)
  {
    auto it = encode_map.begin();
    while (it != encode_map.end()
           && s.compare(pos, it->second.size(), it->second) != 0)
      ++it;
    return it;
  }

  /**
   * Replaces, in place, every escape sequence of encode_map found in `s`
   * by the character it stands for. Anything else (including unknown or
   * incomplete sequences) is left untouched.
   *
   * The string is scanned once, left to right.
   */
  void decode(std::string& s)
  {
    std::string decoded;
    decoded.reserve(s.size());

    std::string::size_type pos = 0;
    while (pos < s.size())
      {
        if (s[pos] == '\\')
          {
            const auto it = escape_at(s, pos);
            if (it != encode_map.end())
              {
                decoded += it->first;
                pos += it->second.size();
                continue;
              }
          }
        decoded += s[pos];
        ++pos;
      }
    s.swap(decoded);
  }

  /**
   * Replaces, in place, every character of `s` that is a key of
   * encode_map by its escape sequence. All other characters are copied
   * unchanged.
   *
   * The string is scanned once, left to right.
   */
  void encode(std::string& s)
  {
    std::string encoded;
    encoded.reserve(s.size());

    for (const char c: s)
      {
        const auto it = encode_map.find(c);
        if (it == encode_map.end())
          encoded += c;
        else
          encoded += it->second;
      }
    s.swap(encoded);
  }
}
255