1 //
2 //  Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
3 //
4 //  Distributed under the Boost Software License, Version 1.0. (See
5 //  accompanying file LICENSE_1_0.txt or copy at
6 //  http://www.boost.org/LICENSE_1_0.txt)
7 //
8 #ifndef BOOST_LOCALE_UTF_HPP_INCLUDED
9 #define BOOST_LOCALE_UTF_HPP_INCLUDED
10 
11 #include <boost/cstdint.hpp>
12 
13 namespace boost {
14 namespace locale {
15 ///
16 /// \brief Namespace that holds basic operations on UTF encoded sequences
17 ///
/// None of the functions defined in this namespace require linking with the Boost.Locale library
19 ///
20 namespace utf {
/// \cond INTERNAL
// Branch prediction hints. Both macros evaluate to their argument; compilers
// that define __GNUC__ additionally receive the expected outcome through
// __builtin_expect, all other compilers get the plain expression.
#if !defined(__GNUC__)
#   define BOOST_LOCALE_LIKELY(x)   (x)
#   define BOOST_LOCALE_UNLIKELY(x) (x)
#else
#   define BOOST_LOCALE_LIKELY(x)   __builtin_expect((x),1)
#   define BOOST_LOCALE_UNLIKELY(x) __builtin_expect((x),0)
#endif
/// \endcond
30 
///
/// \brief Unsigned 32 bit integral type used to store a Unicode code point
///
typedef uint32_t code_point;

///
/// \brief Reserved value returned by the decoders when an illegal
/// (invalid) sequence or code point was found
///
static const code_point illegal = 0xFFFFFFFFu;

///
/// \brief Reserved value returned by the decoders when the input ended
/// before a complete code point could be read
///
static const code_point incomplete = 0xFFFFFFFEu;
45 
46     ///
47     /// \brief the function checks if \a v is a valid code point
48     ///
is_valid_codepoint(code_point v)49     inline bool is_valid_codepoint(code_point v)
50     {
51         if(v>0x10FFFF)
52             return false;
53         if(0xD800 <=v && v<= 0xDFFF) // surragates
54             return false;
55         return true;
56     }
57 
58     #ifdef BOOST_LOCALE_DOXYGEN
59     ///
60     /// \brief UTF Traits class - functions to convert UTF sequences to and from Unicode code points
61     ///
62     template<typename CharType,int size=sizeof(CharType)>
63     struct utf_traits {
64         ///
65         /// The type of the character
66         ///
67         typedef CharType char_type;
68         ///
69         /// Read one code point from the range [p,e) and return it.
70         ///
71         /// - If the sequence that was read is incomplete sequence returns \ref incomplete,
72         /// - If illegal sequence detected returns \ref illegal
73         ///
74         /// Requirements
75         ///
76         /// - Iterator is valid input iterator
77         ///
78         /// Postconditions
79         ///
80         /// - p points to the last consumed character
81         ///
82         template<typename Iterator>
83         static code_point decode(Iterator &p,Iterator e);
84 
85         ///
86         /// Maximal width of valid sequence in the code units:
87         ///
88         /// - UTF-8  - 4
89         /// - UTF-16 - 2
90         /// - UTF-32 - 1
91         ///
92         static const int max_width;
93         ///
94         /// The width of specific code point in the code units.
95         ///
96         /// Requirement: value is a valid Unicode code point
97         /// Returns value in range [1..max_width]
98         ///
99         static int width(code_point value);
100 
101         ///
102         /// Get the size of the trail part of variable length encoded sequence.
103         ///
104         /// Returns -1 if C is not valid lead character
105         ///
106         static int trail_length(char_type c);
107         ///
108         /// Returns true if c is trail code unit, always false for UTF-32
109         ///
110         static bool is_trail(char_type c);
111         ///
112         /// Returns true if c is lead code unit, always true of UTF-32
113         ///
114         static bool is_lead(char_type c);
115 
116         ///
117         /// Convert valid Unicode code point \a value to the UTF sequence.
118         ///
119         /// Requirements:
120         ///
121         /// - \a value is valid code point
122         /// - \a out is an output iterator should be able to accept at least width(value) units
123         ///
124         /// Returns the iterator past the last written code unit.
125         ///
126         template<typename Iterator>
127         static Iterator encode(code_point value,Iterator out);
128         ///
129         /// Decodes valid UTF sequence that is pointed by p into code point.
130         ///
131         /// If the sequence is invalid or points to end the behavior is undefined
132         ///
133         template<typename Iterator>
134         static code_point decode_valid(Iterator &p);
135     };
136 
137     #else
138 
// Primary template, declared only: the implementations are the partial
// specializations chosen by the size of the code unit type.
template<typename CharType, int size = sizeof(CharType)>
struct utf_traits;
141 
142     template<typename CharType>
143     struct utf_traits<CharType,1> {
144 
145         typedef CharType char_type;
146 
trail_lengthboost::locale::utf::utf_traits147         static int trail_length(char_type ci)
148         {
149             unsigned char c = ci;
150             if(c < 128)
151                 return 0;
152             if(BOOST_LOCALE_UNLIKELY(c < 194))
153                 return -1;
154             if(c < 224)
155                 return 1;
156             if(c < 240)
157                 return 2;
158             if(BOOST_LOCALE_LIKELY(c <=244))
159                 return 3;
160             return -1;
161         }
162 
163         static const int max_width = 4;
164 
widthboost::locale::utf::utf_traits165         static int width(code_point value)
166         {
167             if(value <=0x7F) {
168                 return 1;
169             }
170             else if(value <=0x7FF) {
171                 return 2;
172             }
173             else if(BOOST_LOCALE_LIKELY(value <=0xFFFF)) {
174                 return 3;
175             }
176             else {
177                 return 4;
178             }
179         }
180 
is_trailboost::locale::utf::utf_traits181         static bool is_trail(char_type ci)
182         {
183             unsigned char c=ci;
184             return (c & 0xC0)==0x80;
185         }
186 
is_leadboost::locale::utf::utf_traits187         static bool is_lead(char_type ci)
188         {
189             return !is_trail(ci);
190         }
191 
192         template<typename Iterator>
decodeboost::locale::utf::utf_traits193         static code_point decode(Iterator &p,Iterator e)
194         {
195             if(BOOST_LOCALE_UNLIKELY(p==e))
196                 return incomplete;
197 
198             unsigned char lead = *p++;
199 
200             // First byte is fully validated here
201             int trail_size = trail_length(lead);
202 
203             if(BOOST_LOCALE_UNLIKELY(trail_size < 0))
204                 return illegal;
205 
206             //
207             // Ok as only ASCII may be of size = 0
208             // also optimize for ASCII text
209             //
210             if(trail_size == 0)
211                 return lead;
212 
213             code_point c = lead & ((1<<(6-trail_size))-1);
214 
215             // Read the rest
216             unsigned char tmp;
217             switch(trail_size) {
218             case 3:
219                 if(BOOST_LOCALE_UNLIKELY(p==e))
220                     return incomplete;
221                 tmp = *p++;
222                 if (!is_trail(tmp))
223                     return illegal;
224                 c = (c << 6) | ( tmp & 0x3F);
225             case 2:
226                 if(BOOST_LOCALE_UNLIKELY(p==e))
227                     return incomplete;
228                 tmp = *p++;
229                 if (!is_trail(tmp))
230                     return illegal;
231                 c = (c << 6) | ( tmp & 0x3F);
232             case 1:
233                 if(BOOST_LOCALE_UNLIKELY(p==e))
234                     return incomplete;
235                 tmp = *p++;
236                 if (!is_trail(tmp))
237                     return illegal;
238                 c = (c << 6) | ( tmp & 0x3F);
239             }
240 
241             // Check code point validity: no surrogates and
242             // valid range
243             if(BOOST_LOCALE_UNLIKELY(!is_valid_codepoint(c)))
244                 return illegal;
245 
246             // make sure it is the most compact representation
247             if(BOOST_LOCALE_UNLIKELY(width(c)!=trail_size + 1))
248                 return illegal;
249 
250             return c;
251 
252         }
253 
254         template<typename Iterator>
decode_validboost::locale::utf::utf_traits255         static code_point decode_valid(Iterator &p)
256         {
257             unsigned char lead = *p++;
258             if(lead < 192)
259                 return lead;
260 
261             int trail_size;
262 
263             if(lead < 224)
264                 trail_size = 1;
265             else if(BOOST_LOCALE_LIKELY(lead < 240)) // non-BMP rare
266                 trail_size = 2;
267             else
268                 trail_size = 3;
269 
270             code_point c = lead & ((1<<(6-trail_size))-1);
271 
272             switch(trail_size) {
273             case 3:
274                 c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
275             case 2:
276                 c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
277             case 1:
278                 c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
279             }
280 
281             return c;
282         }
283 
284 
285 
286         template<typename Iterator>
encodeboost::locale::utf::utf_traits287         static Iterator encode(code_point value,Iterator out)
288         {
289             if(value <= 0x7F) {
290                 *out++ = static_cast<char_type>(value);
291             }
292             else if(value <= 0x7FF) {
293                 *out++ = static_cast<char_type>((value >> 6) | 0xC0);
294                 *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
295             }
296             else if(BOOST_LOCALE_LIKELY(value <= 0xFFFF)) {
297                 *out++ = static_cast<char_type>((value >> 12) | 0xE0);
298                 *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
299                 *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
300             }
301             else {
302                 *out++ = static_cast<char_type>((value >> 18) | 0xF0);
303                 *out++ = static_cast<char_type>(((value >> 12) & 0x3F) | 0x80);
304                 *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
305                 *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
306             }
307             return out;
308         }
309     }; // utf8
310 
311     template<typename CharType>
312     struct utf_traits<CharType,2> {
313         typedef CharType char_type;
314 
315         // See RFC 2781
is_first_surrogateboost::locale::utf::utf_traits316         static bool is_first_surrogate(uint16_t x)
317         {
318             return 0xD800 <=x && x<= 0xDBFF;
319         }
is_second_surrogateboost::locale::utf::utf_traits320         static bool is_second_surrogate(uint16_t x)
321         {
322             return 0xDC00 <=x && x<= 0xDFFF;
323         }
combine_surrogateboost::locale::utf::utf_traits324         static code_point combine_surrogate(uint16_t w1,uint16_t w2)
325         {
326             return ((code_point(w1 & 0x3FF) << 10) | (w2 & 0x3FF)) + 0x10000;
327         }
trail_lengthboost::locale::utf::utf_traits328         static int trail_length(char_type c)
329         {
330             if(is_first_surrogate(c))
331                 return 1;
332             if(is_second_surrogate(c))
333                 return -1;
334             return 0;
335         }
336         ///
337         /// Returns true if c is trail code unit, always false for UTF-32
338         ///
is_trailboost::locale::utf::utf_traits339         static bool is_trail(char_type c)
340         {
341             return is_second_surrogate(c);
342         }
343         ///
344         /// Returns true if c is lead code unit, always true of UTF-32
345         ///
is_leadboost::locale::utf::utf_traits346         static bool is_lead(char_type c)
347         {
348             return !is_second_surrogate(c);
349         }
350 
351         template<typename It>
decodeboost::locale::utf::utf_traits352         static code_point decode(It &current,It last)
353         {
354             if(BOOST_LOCALE_UNLIKELY(current == last))
355                 return incomplete;
356             uint16_t w1=*current++;
357             if(BOOST_LOCALE_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) {
358                 return w1;
359             }
360             if(w1 > 0xDBFF)
361                 return illegal;
362             if(current==last)
363                 return incomplete;
364             uint16_t w2=*current++;
365             if(w2 < 0xDC00 || 0xDFFF < w2)
366                 return illegal;
367             return combine_surrogate(w1,w2);
368         }
369         template<typename It>
decode_validboost::locale::utf::utf_traits370         static code_point decode_valid(It &current)
371         {
372             uint16_t w1=*current++;
373             if(BOOST_LOCALE_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) {
374                 return w1;
375             }
376             uint16_t w2=*current++;
377             return combine_surrogate(w1,w2);
378         }
379 
380         static const int max_width = 2;
widthboost::locale::utf::utf_traits381         static int width(code_point u)
382         {
383             return u>=0x10000 ? 2 : 1;
384         }
385         template<typename It>
encodeboost::locale::utf::utf_traits386         static It encode(code_point u,It out)
387         {
388             if(BOOST_LOCALE_LIKELY(u<=0xFFFF)) {
389                 *out++ = static_cast<char_type>(u);
390             }
391             else {
392                 u -= 0x10000;
393                 *out++ = static_cast<char_type>(0xD800 | (u>>10));
394                 *out++ = static_cast<char_type>(0xDC00 | (u & 0x3FF));
395             }
396             return out;
397         }
398     }; // utf16;
399 
400 
401     template<typename CharType>
402     struct utf_traits<CharType,4> {
403         typedef CharType char_type;
trail_lengthboost::locale::utf::utf_traits404         static int trail_length(char_type c)
405         {
406             if(is_valid_codepoint(c))
407                 return 0;
408             return -1;
409         }
is_trailboost::locale::utf::utf_traits410         static bool is_trail(char_type /*c*/)
411         {
412             return false;
413         }
is_leadboost::locale::utf::utf_traits414         static bool is_lead(char_type /*c*/)
415         {
416             return true;
417         }
418 
419         template<typename It>
decode_validboost::locale::utf::utf_traits420         static code_point decode_valid(It &current)
421         {
422             return *current++;
423         }
424 
425         template<typename It>
decodeboost::locale::utf::utf_traits426         static code_point decode(It &current,It last)
427         {
428             if(BOOST_LOCALE_UNLIKELY(current == last))
429                 return boost::locale::utf::incomplete;
430             code_point c=*current++;
431             if(BOOST_LOCALE_UNLIKELY(!is_valid_codepoint(c)))
432                 return boost::locale::utf::illegal;
433             return c;
434         }
435         static const int max_width = 1;
widthboost::locale::utf::utf_traits436         static int width(code_point /*u*/)
437         {
438             return 1;
439         }
440         template<typename It>
encodeboost::locale::utf::utf_traits441         static It encode(code_point u,It out)
442         {
443             *out++ = static_cast<char_type>(u);
444             return out;
445         }
446 
447     }; // utf32
448 
449     #endif
450 
451 
452 } // utf
453 } // locale
454 } // boost
455 
456 
457 #endif
458 
459 // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
460 
461