1 /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
2 // utf8_codecvt_facet.cpp
3 
4 // Copyright (c) 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu)
5 // Andrew Lumsdaine, Indiana University (lums@osl.iu.edu).
6 // Distributed under the Boost Software License, Version 1.0. (See accompany-
7 // ing file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
8 
9 // See http://www.boost.org/libs/iostreams for documentation.
10 
11 //#include <cstdlib> // for multi-byte conversion routines
12 
13 // Jonathan Turkanis:
14 //   - Replaced test for BOOST_NO_STD_WSTREAMBUF with test for
15 //     BOOST_IOSTREAMS_NO_WIDE_STREAMS;
16 //   - Derived from codecvt_helper instead of codecvt.
17 
18 #include <boost/config.hpp>
19 #include <boost/iostreams/detail/config/wide_streams.hpp>
20 #include <boost/numeric/conversion/cast.hpp>
21 #ifdef BOOST_IOSTREAMS_NO_LOCALES
22 # error "C++ locales not supported on this platform"
23 #else
24 
25 #include <cassert>
26 #include <cstddef>
27 
28 #include <boost/detail/workaround.hpp>
29 #include "./utf8_codecvt_facet.hpp"
30 
31 #if BOOST_WORKAROUND(__BORLANDC__, <= 0x600)
32 # pragma warn -sig // Conversion may lose significant digits
33 # pragma warn -rng // Constant is out of range in comparison
34 #endif
35 
36 /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
37 // implementation for wchar_t
38 
39 // Translate incoming UTF-8 into UCS-4
do_in(std::mbstate_t &,const char * from,const char * from_end,const char * & from_next,wchar_t * to,wchar_t * to_end,wchar_t * & to_next) const40 std::codecvt_base::result utf8_codecvt_facet_wchar_t::do_in(
41     std::mbstate_t&,
42     const char * from,
43     const char * from_end,
44     const char * & from_next,
45     wchar_t * to,
46     wchar_t * to_end,
47     wchar_t * & to_next
48 ) const {
49     // Basic algorithm:  The first octet determines how many
50     // octets total make up the UCS-4 character.  The remaining
51     // "continuing octets" all begin with "10". To convert, subtract
52     // the amount that specifies the number of octets from the first
53     // octet.  Subtract 0x80 (1000 0000) from each continuing octet,
54     // then mash the whole lot together.  Note that each continuing
55     // octet only uses 6 bits as unique values, so only shift by
56     // multiples of 6 to combine.
57     while (from != from_end && to != to_end) {
58 
59         // Error checking   on the first octet
60         if (invalid_leading_octet(*from)){
61             from_next = from;
62             to_next = to;
63             return std::codecvt_base::error;
64         }
65 
66         // The first octet is   adjusted by a value dependent upon
67         // the number   of "continuing octets" encoding the character
68         const   int cont_octet_count = get_cont_octet_count(*from);
69         const   wchar_t octet1_modifier_table[] =   {
70             0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
71         };
72 
73         // The unsigned char conversion is necessary in case char is
74         // signed   (I learned this the hard way)
75         wchar_t ucs_result =
76             (unsigned char)(*from++) - octet1_modifier_table[cont_octet_count];
77 
78         // Invariants   :
79         //   1) At the start of the loop,   'i' continuing characters have been
80         //    processed
81         //   2) *from   points to the next continuing character to be processed.
82         int i   = 0;
83         while(i != cont_octet_count && from != from_end) {
84 
85             // Error checking on continuing characters
86             if (invalid_continuing_octet(*from)) {
87                 from_next   = from;
88                 to_next =   to;
89                 return std::codecvt_base::error;
90             }
91 
92             ucs_result *= (1 << 6);
93 
94             // each continuing character has an extra (10xxxxxx)b attached to
95             // it that must be removed.
96             ucs_result += (unsigned char)(*from++) - 0x80;
97             ++i;
98         }
99 
100         // If   the buffer ends with an incomplete unicode character...
101         if (from == from_end && i   != cont_octet_count) {
102             // rewind "from" to before the current character translation
103             from_next = from - (i+1);
104             to_next = to;
105             return std::codecvt_base::partial;
106         }
107         *to++   = ucs_result;
108     }
109     from_next = from;
110     to_next = to;
111 
112     // Were we done converting or did we run out of destination space?
113     if(from == from_end) return std::codecvt_base::ok;
114     else return std::codecvt_base::partial;
115 }
116 
do_out(std::mbstate_t &,const wchar_t * from,const wchar_t * from_end,const wchar_t * & from_next,char * to,char * to_end,char * & to_next) const117 std::codecvt_base::result utf8_codecvt_facet_wchar_t::do_out(
118     std::mbstate_t &,
119     const wchar_t *   from,
120     const wchar_t * from_end,
121     const wchar_t * & from_next,
122     char * to,
123     char * to_end,
124     char * & to_next
125 ) const
126 {
127     // RG - consider merging this table with the other one
128     const wchar_t octet1_modifier_table[] = {
129         0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
130     };
131 
132     while (from != from_end && to != to_end) {
133 
134 #define BOOST_NULL // Prevent macro expansion
135         // Check for invalid UCS-4 character
136         if (*from  > std::numeric_limits<wchar_t>::max BOOST_NULL ()) {
137             from_next = from;
138             to_next = to;
139             return std::codecvt_base::error;
140         }
141 #undef BOOST_NULL
142 
143         int cont_octet_count = get_cont_octet_out_count(*from);
144 
145         // RG  - comment this formula better
146         int shift_exponent = (cont_octet_count) *   6;
147 
148         // Process the first character
149         *to++ = octet1_modifier_table[cont_octet_count] +
150             (unsigned char)(*from / (1 << shift_exponent));
151 
152         // Process the continuation characters
153         // Invariants: At   the start of the loop:
154         //   1) 'i' continuing octets   have been generated
155         //   2) '*to'   points to the next location to place an octet
156         //   3) shift_exponent is   6 more than needed for the next octet
157         int i   = 0;
158         while   (i != cont_octet_count && to != to_end) {
159             shift_exponent -= 6;
160             *to++ = 0x80 + ((*from / (1 << shift_exponent)) % (1 << 6));
161             ++i;
162         }
163         // If   we filled up the out buffer before encoding the character
164         if(to   == to_end && i != cont_octet_count) {
165             from_next = from;
166             to_next = to - (i+1);
167             return std::codecvt_base::partial;
168         }
169         ++from;
170     }
171     from_next = from;
172     to_next = to;
173     // Were we done or did we run out of destination space
174     if(from == from_end) return std::codecvt_base::ok;
175     else return std::codecvt_base::partial;
176 }
177 
178 // How many char objects can I process to get <= max_limit
179 // wchar_t objects?
do_length(BOOST_IOSTREAMS_CODECVT_CV_QUALIFIER std::mbstate_t &,const char * from,const char * from_end,std::size_t max_limit) const180 int utf8_codecvt_facet_wchar_t::do_length(
181     BOOST_IOSTREAMS_CODECVT_CV_QUALIFIER std::mbstate_t &,
182     const char * from,
183     const char * from_end,
184     std::size_t max_limit
185 ) const throw()
186 {
187     // RG - this code is confusing!  I need a better way to express it.
188     // and test cases.
189 
190     // Invariants:
191     // 1) last_octet_count has the size of the last measured character
192     // 2) char_count holds the number of characters shown to fit
193     // within the bounds so far (no greater than max_limit)
194     // 3) from_next points to the octet 'last_octet_count' before the
195     // last measured character.
196     int last_octet_count=0;
197     std::size_t char_count = 0;
198     const char* from_next = from;
199     // Use "<" because the buffer may represent incomplete characters
200     while (from_next+last_octet_count <= from_end && char_count <= max_limit) {
201         from_next += last_octet_count;
202         last_octet_count = (get_octet_count(*from_next));
203         ++char_count;
204     }
205     return boost::numeric_cast<int>(from_next - from_end);
206 }
207 
get_octet_count(unsigned char lead_octet)208 unsigned int utf8_codecvt_facet_wchar_t::get_octet_count(
209     unsigned char   lead_octet
210 ){
211     // if the 0-bit (MSB) is 0, then 1 character
212     if (lead_octet <= 0x7f) return 1;
213 
214     // Otherwise the count number of consecutive 1 bits starting at MSB
215     assert(0xc0 <= lead_octet && lead_octet <= 0xfd);
216 
217     if (0xc0 <= lead_octet && lead_octet <= 0xdf) return 2;
218     else if (0xe0 <= lead_octet && lead_octet <= 0xef) return 3;
219     else if (0xf0 <= lead_octet && lead_octet <= 0xf7) return 4;
220     else if (0xf8 <= lead_octet && lead_octet <= 0xfb) return 5;
221     else return 6;
222 }
223 
namespace {

// Number of continuing octets (total octets minus one) needed to encode
// 'word' in UTF-8.  The template argument is the size of wchar_t in bytes.
//
// Primary template: used for every size that has no specialisation of its
// own.  It never reports more than two continuing octets.
template<std::size_t s>
int get_cont_octet_out_count_impl(wchar_t word){
    if (word >= 0x800) {
        return 2;
    }
    return (word >= 0x80) ? 1 : 0;
}

// Specialisation for a 4-byte wchar_t, which can hold values that need up
// to five continuing octets.
//
// Note: on platforms where wchar_t is UCS-2 the comparisons against the
// larger constants below may produce compiler warnings.  Those warnings
// are superfluous, because this specialisation is never instantiated with
// such compilers.
template<>
int get_cont_octet_out_count_impl<4>(wchar_t word)
{
    return word < 0x80      ? 0
         : word < 0x800     ? 1
         : word < 0x10000   ? 2
         : word < 0x200000  ? 3
         : word < 0x4000000 ? 4
         :                    5;
}

} // namespace anonymous
261 
262 // How many "continuing octets" will be needed for this word
263 // ==   total octets - 1.
get_cont_octet_out_count(wchar_t word) const264 int utf8_codecvt_facet_wchar_t::get_cont_octet_out_count(
265     wchar_t word
266 ) const {
267     return get_cont_octet_out_count_impl<sizeof(wchar_t)>(word);
268 }
269 
270 #if 0 // not used?
271 /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
272 // implementation for char
273 
// Disabled (inside "#if 0"): char-to-char conversion that decodes one wide
// character at a time with base_class::do_in and then writes it to the
// output with std::wctomb.
//
// Returns codecvt_base::ok once the loop ends, or the first result from
// base_class::do_in that is not ok.
//
// NOTE(review): base_class is not declared in this file; presumably it is
// utf8_codecvt_facet_wchar_t - confirm before re-enabling.
// NOTE(review): the loop condition reads from_next before this function
// has assigned it, and every call decodes starting at 'from', which is
// never advanced.
// NOTE(review): to_end is never consulted before std::wctomb writes through
// to_next.
274 std::codecvt_base::result utf8_codecvt_facet_char::do_in(
275     std::mbstate_t & state,
276     const char * from,
277     const char * from_end,
278     const char * & from_next,
279     char * to,
280     char * to_end,
281     char * & to_next
282 ) const
283 {
284     while(from_next < from_end){
285         wchar_t w;
        // wnext serves both as the start of the one-element output range
        // and as the "next" out-parameter of the call below.
286         wchar_t *wnext = & w;
287         utf8_codecvt_facet_wchar_t::result ucs4_result;
288         ucs4_result = base_class::do_in(
289             state,
290             from, from_end, from_next,
291             wnext, wnext + 1, wnext
292         );
        // Any result other than ok ends the conversion immediately.
293         if(codecvt_base::ok != ucs4_result)
294             return ucs4_result;
295         // if the conversion succeeds.
        // std::wctomb reports failure with -1; that is only asserted here,
        // not handled.
296         int length = std::wctomb(to_next, w);
297         assert(-1 != length);
298         to_next += length;
299     }
300     return codecvt_base::ok;
301 }
302 
// Disabled (inside "#if 0"): char-to-char conversion that reads one
// multibyte character at a time with std::mbtowc and then encodes it with
// base_class::do_out.
//
// Returns codecvt_base::ok once the loop ends, or the first result from
// base_class::do_out that is not ok.
//
// NOTE(review): base_class is not declared in this file; presumably it is
// utf8_codecvt_facet_wchar_t - confirm before re-enabling.
// NOTE(review): the loop condition reads from_next before this function
// has assigned it; the 'from' and 'to' parameters are not used at all.
// NOTE(review): MB_LENGTH_MAX is not defined in this file; the standard
// <climits> macro is spelled MB_LEN_MAX - confirm which was intended.
303 std::codecvt_base::result utf8_codecvt_facet_char::do_out(
304     mbstate_t & state,
305     const char * from,
306     const char * from_end,
307     const char * & from_next,
308     char * to,
309     char * to_end,
310     char * & to_next
311 ) const
312 {
313     while(from_next < from_end){
314         wchar_t w;
        // The byte limit passed to std::mbtowc is a constant, not the
        // distance to from_end.
        // NOTE(review): std::mbtowc returns 0 for a null character, which
        // would leave from_next unchanged - verify the loop can still end.
315         int result = std::mbtowc(&w, from_next,  MB_LENGTH_MAX);
316         assert(-1 != result);
317         from_next += result;
318         utf8_codecvt_facet_wchar_t::result ucs4_result;
319 
        // Encode the single wide character; wptr is both the start of the
        // one-element input range and the "next" out-parameter.
320         const wchar_t *wptr = & w;
321         ucs4_result = base_class::do_out(
322             state,
323             wptr, wptr+1, wptr,
324             to_next, to_end, to_next
325         );
        // Any result other than ok ends the conversion immediately.
326         if(codecvt_base::ok != ucs4_result)
327             return ucs4_result;
328     }
329     return codecvt_base::ok;
330 }
331 
332 // How many bytes can I process to get <= max_limit
333 // char objects?
//
// Disabled (inside "#if 0").  Works on a copy of initial_state: decodes one
// wide character at a time with base_class::do_in, measures its multibyte
// size with wctomb, and stops as soon as that size no longer fits in what
// is left of max_limit.  Returns the number of input bytes consumed up to
// and including the last character that fitted.
//
// NOTE(review): base_class is not declared in this file; presumably it is
// utf8_codecvt_facet_wchar_t - confirm before re-enabling.
// NOTE(review): MB_LENGTH_MAX is not defined in this file; the standard
// <climits> macro is spelled MB_LEN_MAX - confirm which was intended.
334 int utf8_codecvt_facet_char::do_length(
335     // it seems that the standard doesn't use const so these libraries
336     // would be in error
337     BOOST_IOSTREAMS_CODECVT_CV_QUALIFIER
338     utf8_codecvt_facet_wchar_t::mbstate_t & initial_state,
339     const char * from_next,
340     const char * from_end,
341     std::size_t max_limit
342 ) const
343 {
344     int total_length = 0;
    // Remember the start of the input; from_next itself is advanced by the
    // calls below.
345     const char *from = from_next;
346     mbstate_t state = initial_state;
347     while(from_next < from_end){
348         wchar_t w;
349         wchar_t *wnext = & w;
350         utf8_codecvt_facet_wchar_t::result ucs4_result;
351         ucs4_result = base_class::do_in(
352             state,
353             from_next, from_end, from_next,
354             wnext, wnext + 1, wnext
355         );
356 
        // Any result other than ok ends the measurement.
357         if(codecvt_base::ok != ucs4_result)
358             break;
359 
360         char carray[MB_LENGTH_MAX];
        // wctomb returns int; a failure value of -1 stored in std::size_t
        // becomes a very large count, so the comparison below also ends
        // the loop in that case.
361         std::size_t count = wctomb(carray, w);
362         if(count > max_limit)
363             break;
364 
        // The character fits: charge it against the budget and record how
        // many input bytes have been consumed so far.
365         max_limit -= count;
366         total_length = from_next - from;
367     }
368     return total_length;
369 }
370 #endif
371 
372 #endif // BOOST_IOSTREAMS_NO_LOCALES
373