1 // Copyright 2006 Nemanja Trifunovic
2 
3 /*
4 Permission is hereby granted, free of charge, to any person or organization
5 obtaining a copy of the software and accompanying documentation covered by
6 this license (the "Software") to use, reproduce, display, distribute,
7 execute, and transmit the Software, and to prepare derivative works of the
8 Software, and to permit third-parties to whom the Software is furnished to
9 do so, all subject to the following:
10 
11 The copyright notices in the Software and this entire statement, including
12 the above license grant, this restriction and the following disclaimer,
13 must be included in all copies of the Software, in whole or in part, and
14 all derivative works of the Software, unless such copies or derivative
15 works are solely in the form of machine-executable object code generated by
16 a source language processor.
17 
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
21 SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
22 FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
23 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE.
25 */
26 
27 
28 #ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
29 #define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
30 
31 #include <iterator>
32 
33 namespace utf8
34 {
35     typedef vmime_uint8   uint8_t;
36     typedef vmime_uint16  uint16_t;
37     typedef vmime_uint32  uint32_t;
38 
39 // Helper code - not intended to be directly called by the library users. May be changed at any time
40 namespace internal
41 {
// Unicode constants.
//
// UTF-16 surrogate code unit ranges:
//   leading (high) surrogates:  0xd800 - 0xdbff
//   trailing (low) surrogates:  0xdc00 - 0xdfff
const uint16_t LEAD_SURROGATE_MIN  = 0xd800u;
const uint16_t LEAD_SURROGATE_MAX  = 0xdbffu;
const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;

// LEAD_SURROGATE_MIN - (0x10000 >> 10), i.e. 0xd800 - 0x40
const uint16_t LEAD_OFFSET         = 0xd7c0u;

// 0x10000 - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN,
// evaluated in 32-bit unsigned (modulo 2^32) arithmetic
const uint32_t SURROGATE_OFFSET    = 0xfca02400u;

// Largest value that is still a valid Unicode code point
const uint32_t CODE_POINT_MAX      = 0x0010ffffu;
54 
/// Keeps only the low 8 bits of `oc` and returns them as an unsigned octet.
/// This makes the result independent of whether the input type is signed.
template<typename octet_type>
inline uint8_t mask8(octet_type oc)
{
    const uint8_t low_octet = static_cast<uint8_t>(oc & 0xff);
    return low_octet;
}
/// Keeps only the low 16 bits of `oc` and returns them as an unsigned
/// 16-bit value.
template<typename u16_type>
inline uint16_t mask16(u16_type oc)
{
    const uint16_t low_word = static_cast<uint16_t>(oc & 0xffff);
    return low_word;
}
65     template<typename octet_type>
is_trail(octet_type oc)66     inline bool is_trail(octet_type oc)
67     {
68         return ((utf8::internal::mask8(oc) >> 6) == 0x2);
69     }
70 
71     template <typename u16>
is_lead_surrogate(u16 cp)72     inline bool is_lead_surrogate(u16 cp)
73     {
74         return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX);
75     }
76 
77     template <typename u16>
is_trail_surrogate(u16 cp)78     inline bool is_trail_surrogate(u16 cp)
79     {
80         return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
81     }
82 
83     template <typename u16>
is_surrogate(u16 cp)84     inline bool is_surrogate(u16 cp)
85     {
86         return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
87     }
88 
89     template <typename u32>
is_code_point_valid(u32 cp)90     inline bool is_code_point_valid(u32 cp)
91     {
92         return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp));
93     }
94 
95     template <typename octet_iterator>
96     inline typename std::iterator_traits<octet_iterator>::difference_type
sequence_length(octet_iterator lead_it)97     sequence_length(octet_iterator lead_it)
98     {
99         uint8_t lead = utf8::internal::mask8(*lead_it);
100         if (lead < 0x80)
101             return 1;
102         else if ((lead >> 5) == 0x6)
103             return 2;
104         else if ((lead >> 4) == 0xe)
105             return 3;
106         else if ((lead >> 3) == 0x1e)
107             return 4;
108         else
109             return 0;
110     }
111 
/// Tells whether `cp` was encoded with a sequence length other than the one
/// its magnitude calls for.
///
/// Code points below 0x80 belong in 1 octet, below 0x800 in 2 octets and
/// below 0x10000 in 3 octets. Code points from 0x10000 upwards are never
/// reported as overlong, whatever `length` is.
template <typename octet_difference_type>
inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length)
{
    if (cp < 0x80)
        return length != 1;

    if (cp < 0x800)
        return length != 2;

    if (cp < 0x10000)
        return length != 3;

    return false;
}
130 
/// Result codes produced by the decoding and validation helpers below.
enum utf_error {
    UTF8_OK,              // the sequence was decoded and accepted
    NOT_ENOUGH_ROOM,      // the end of the range was reached too early
    INVALID_LEAD,         // the first octet cannot start a sequence
    INCOMPLETE_SEQUENCE,  // an expected trail octet is not a trail octet
    OVERLONG_SEQUENCE,    // the code point was encoded with too many octets
    INVALID_CODE_POINT    // the decoded value is not a valid code point
};
132 
133     /// Helper for get_sequence_x
134     template <typename octet_iterator>
increase_safely(octet_iterator & it,octet_iterator end)135     utf_error increase_safely(octet_iterator& it, octet_iterator end)
136     {
137         if (++it == end)
138             return NOT_ENOUGH_ROOM;
139 
140         if (!utf8::internal::is_trail(*it))
141             return INCOMPLETE_SEQUENCE;
142 
143         return UTF8_OK;
144     }
145 
// Steps IT forward with increase_safely() and, if that does not yield
// UTF8_OK, returns the error code from the *enclosing* function. The
// expansion is a braced block, so call sites need no trailing semicolon.
#define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) \
    { \
        utf_error ret = increase_safely(IT, END); \
        if (ret != UTF8_OK) \
            return ret; \
    }
147 
148     /// get_sequence_x functions decode utf-8 sequences of the length x
149     template <typename octet_iterator>
get_sequence_1(octet_iterator & it,octet_iterator end,uint32_t & code_point)150     utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t& code_point)
151     {
152         if (it == end)
153             return NOT_ENOUGH_ROOM;
154 
155         code_point = utf8::internal::mask8(*it);
156 
157         return UTF8_OK;
158     }
159 
160     template <typename octet_iterator>
get_sequence_2(octet_iterator & it,octet_iterator end,uint32_t & code_point)161     utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point)
162     {
163         if (it == end)
164             return NOT_ENOUGH_ROOM;
165 
166         code_point = utf8::internal::mask8(*it);
167 
168         UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
169 
170         code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f);
171 
172         return UTF8_OK;
173     }
174 
175     template <typename octet_iterator>
get_sequence_3(octet_iterator & it,octet_iterator end,uint32_t & code_point)176     utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t& code_point)
177     {
178         if (it == end)
179             return NOT_ENOUGH_ROOM;
180 
181         code_point = utf8::internal::mask8(*it);
182 
183         UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
184 
185         code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);
186 
187         UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
188 
189         code_point += (*it) & 0x3f;
190 
191         return UTF8_OK;
192     }
193 
194     template <typename octet_iterator>
get_sequence_4(octet_iterator & it,octet_iterator end,uint32_t & code_point)195     utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t& code_point)
196     {
197         if (it == end)
198            return NOT_ENOUGH_ROOM;
199 
200         code_point = utf8::internal::mask8(*it);
201 
202         UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
203 
204         code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);
205 
206         UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
207 
208         code_point += (utf8::internal::mask8(*it) << 6) & 0xfff;
209 
210         UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
211 
212         code_point += (*it) & 0x3f;
213 
214         return UTF8_OK;
215     }
216 
217     #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR
218 
219     template <typename octet_iterator>
validate_next(octet_iterator & it,octet_iterator end,uint32_t & code_point)220     utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point)
221     {
222         // Save the original value of it so we can go back in case of failure
223         // Of course, it does not make much sense with i.e. stream iterators
224         octet_iterator original_it = it;
225 
226         uint32_t cp = 0;
227         // Determine the sequence length based on the lead octet
228         typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
229         const octet_difference_type length = utf8::internal::sequence_length(it);
230 
231         // Get trail octets and calculate the code point
232         utf_error err = UTF8_OK;
233         switch (length) {
234             case 0:
235                 return INVALID_LEAD;
236             case 1:
237                 err = utf8::internal::get_sequence_1(it, end, cp);
238                 break;
239             case 2:
240                 err = utf8::internal::get_sequence_2(it, end, cp);
241             break;
242             case 3:
243                 err = utf8::internal::get_sequence_3(it, end, cp);
244             break;
245             case 4:
246                 err = utf8::internal::get_sequence_4(it, end, cp);
247             break;
248         }
249 
250         if (err == UTF8_OK) {
251             // Decoding succeeded. Now, security checks...
252             if (utf8::internal::is_code_point_valid(cp)) {
253                 if (!utf8::internal::is_overlong_sequence(cp, length)){
254                     // Passed! Return here.
255                     code_point = cp;
256                     ++it;
257                     return UTF8_OK;
258                 }
259                 else
260                     err = OVERLONG_SEQUENCE;
261             }
262             else
263                 err = INVALID_CODE_POINT;
264         }
265 
266         // Failure branch - restore the original value of the iterator
267         it = original_it;
268         return err;
269     }
270 
271     template <typename octet_iterator>
validate_next(octet_iterator & it,octet_iterator end)272     inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
273         uint32_t ignored;
274         return utf8::internal::validate_next(it, end, ignored);
275     }
276 
277 } // namespace internal
278 
279     /// The library API - functions intended to be called by the users
280 
// The three octets of the UTF-8 encoded byte order mark: EF BB BF
const uint8_t bom[] = {
    0xef,
    0xbb,
    0xbf
};
283 
284     template <typename octet_iterator>
find_invalid(octet_iterator start,octet_iterator end)285     octet_iterator find_invalid(octet_iterator start, octet_iterator end)
286     {
287         octet_iterator result = start;
288         while (result != end) {
289             utf8::internal::utf_error err_code = utf8::internal::validate_next(result, end);
290             if (err_code != internal::UTF8_OK)
291                 return result;
292         }
293         return result;
294     }
295 
296     template <typename octet_iterator>
is_valid(octet_iterator start,octet_iterator end)297     inline bool is_valid(octet_iterator start, octet_iterator end)
298     {
299         return (utf8::find_invalid(start, end) == end);
300     }
301 
302     template <typename octet_iterator>
starts_with_bom(octet_iterator it,octet_iterator end)303     inline bool starts_with_bom (octet_iterator it, octet_iterator end)
304     {
305         return (
306             ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) &&
307             ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) &&
308             ((it != end) && (utf8::internal::mask8(*it))   == bom[2])
309            );
310     }
311 
312     //Deprecated in release 2.3
313     template <typename octet_iterator>
is_bom(octet_iterator it)314     inline bool is_bom (octet_iterator it)
315     {
316         return (
317             (utf8::internal::mask8(*it++)) == bom[0] &&
318             (utf8::internal::mask8(*it++)) == bom[1] &&
319             (utf8::internal::mask8(*it))   == bom[2]
320            );
321     }
322 } // namespace utf8
323 
324 #endif // header guard
325 
326 
327