1 /*
2  * Copyright 2006-2008 The FLWOR Foundation.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "stdafx.h"
18 
19 // #define ZORBA_DEBUG_ICU_STREAMBUF
20 #ifdef ZORBA_DEBUG_ICU_STREAMBUF
21 # include <stdio.h>
22 #endif
23 
24 #include <algorithm>
25 #include <cassert>
26 #include <stdexcept>
27 
28 #include <zorba/config.h>
29 #include <zorba/diagnostic_list.h>
30 
31 #include "diagnostics/assert.h"
32 #include "diagnostics/diagnostic.h"
33 #include "diagnostics/zorba_exception.h"
34 #include "util/cxx_util.h"
35 #include "util/string_util.h"
36 #include "util/utf8_util.h"
37 
38 #include "icu_streambuf.h"
39 
40 using namespace std;
41 
42 namespace zorba {
43 
// Capacity, in bytes, of the stack buffer overflow() transcodes a single
// UTF-8 byte into before passing the result to the wrapped streambuf.
int const Small_External_Buf_Size = 6;

// Capacity, in bytes, of the stack buffers xsgetn() and xsputn() use to move
// external-encoding data in bulk.
int const Large_External_Buf_Size = 4 * 1024;
46 
47 ///////////////////////////////////////////////////////////////////////////////
48 
reset()49 inline void icu_streambuf::buf_type_base::reset() {
50   pivot_source_ = pivot_target_ = pivot_buf_;
51 }
52 
resetg()53 inline void icu_streambuf::resetg() {
54   setg(
55     g_.utf8_char_, g_.utf8_char_ + sizeof g_.utf8_char_,
56     g_.utf8_char_ + sizeof g_.utf8_char_
57   );
58 }
59 
icu_streambuf(char const * charset,streambuf * orig)60 icu_streambuf::icu_streambuf( char const *charset, streambuf *orig ) :
61   proxy_streambuf( orig ),
62   no_conv_( !is_necessary( charset ) ),
63   external_conv_( no_conv_ ? nullptr : create_conv( charset ) ),
64   utf8_conv_( no_conv_ ? nullptr : create_conv( "UTF-8" ) )
65 {
66   if ( !orig )
67     throw invalid_argument( "null streambuf" );
68   resetg();
69 }
70 
~icu_streambuf()71 icu_streambuf::~icu_streambuf() {
72   if ( external_conv_ )
73     ucnv_close( external_conv_ );
74   if ( utf8_conv_ )
75     ucnv_close( utf8_conv_ );
76 }
77 
clear()78 void icu_streambuf::clear() {
79   if ( !no_conv_ ) {
80     ucnv_reset( external_conv_ );
81     ucnv_reset( utf8_conv_ );
82     g_.reset();
83     p_.reset();
84     resetg();
85   }
86 }
87 
create_conv(char const * charset)88 UConverter* icu_streambuf::create_conv( char const *charset ) {
89   UErrorCode err = U_ZERO_ERROR;
90   UConverter *const conv = ucnv_open( charset, &err );
91   ucnv_setFromUCallBack(
92     conv, UCNV_FROM_U_CALLBACK_STOP, nullptr, nullptr, nullptr, &err
93   );
94   ucnv_setToUCallBack(
95     conv, UCNV_TO_U_CALLBACK_STOP, nullptr, nullptr, nullptr, &err
96   );
97   if ( !conv || U_FAILURE( err ) ) {
98     if ( conv )
99       ucnv_close( conv );
100     throw invalid_argument( u_errorName( err ) );
101   }
102   return conv;
103 }
104 
is_necessary(char const * cc_charset)105 bool icu_streambuf::is_necessary( char const *cc_charset ) {
106   if ( !*cc_charset )
107     throw invalid_argument( "empty charset" );
108   //
109   // Apparently, ucnv_compareNames() doesn't consider "US-ASCII" an alias for
110   // "ASCII", so check for "US-ASCII" ourselves.
111   //
112   zstring charset( cc_charset );
113   ascii::trim_whitespace( charset );
114   ascii::to_upper( charset );
115   if ( charset == "US-ASCII" )
116     cc_charset += 3; // skip "US-"
117 
118   return  ucnv_compareNames( cc_charset, "ASCII" )
119       &&  ucnv_compareNames( cc_charset, "UTF-8" );
120 }
121 
is_supported(char const * charset)122 bool icu_streambuf::is_supported( char const *charset ) {
123   try {
124     ucnv_close( create_conv( charset ) );
125     return true;
126   }
127   catch ( invalid_argument const& ) {
128     return false;
129   }
130 }
131 
seekoff(off_type o,ios_base::seekdir d,ios_base::openmode m)132 icu_streambuf::pos_type icu_streambuf::seekoff( off_type o, ios_base::seekdir d,
133                                                 ios_base::openmode m ) {
134   clear();
135   return original()->pubseekoff( o, d, m );
136 }
137 
seekpos(pos_type p,ios_base::openmode m)138 icu_streambuf::pos_type icu_streambuf::seekpos( pos_type p,
139                                                 ios_base::openmode m ) {
140   clear();
141   return original()->pubseekpos( p, m );
142 }
143 
setbuf(char_type * p,streamsize s)144 streambuf* icu_streambuf::setbuf( char_type *p, streamsize s ) {
145   original()->pubsetbuf( p, s );
146   return this;
147 }
148 
sync()149 int icu_streambuf::sync() {
150   return original()->pubsync();
151 }
152 
overflow(int_type c)153 icu_streambuf::int_type icu_streambuf::overflow( int_type c ) {
154 #if ZORBA_DEBUG_ICU_STREAMBUF
155   printf( "overflow()\n" );
156 #endif
157   if ( no_conv_ )
158     return original()->sputc( c );
159 
160   if ( traits_type::eq_int_type( c, traits_type::eof() ) )
161     return traits_type::eof();
162 
163   char_type const utf8_byte = traits_type::to_char_type( c );
164   char_type const *from = &utf8_byte;
165   char ebuf[ Small_External_Buf_Size ], *to = ebuf;
166 
167 #ifdef NDEBUG
168   to_external( &from, from + 1, &to, to + sizeof ebuf );
169 #else
170   bool const ok = to_external( &from, from + 1, &to, to + sizeof ebuf );
171   assert( ok );
172 #endif /* NDEBUG */
173   if ( streamsize const n = to - ebuf ) {
174     original()->sputn( ebuf, n );
175     p_.reset();
176   }
177 
178   return c;
179 }
180 
#if defined( __GNUC__ )
# if defined( GCC_PRAGMA_DIAGNOSTIC_PUSH )
#   pragma GCC diagnostic push
# endif /* GCC_PRAGMA_DIAGNOSTIC_PUSH */
//
// Silences GCC's array-bounds warning for the pivot-buffer limit pointers
// computed in to_external() and to_utf8() below.
//
# pragma GCC diagnostic ignored "-Warray-bounds"
#endif /* __GNUC__ */
190 
to_external(char_type const ** from,char_type const * from_end,char ** to,char const * to_end,bool flush)191 bool icu_streambuf::to_external( char_type const **from,
192                                  char_type const *from_end, char **to,
193                                  char const *to_end, bool flush ) {
194   UErrorCode err = U_ZERO_ERROR;
195   ucnv_convertEx(
196     external_conv_, utf8_conv_, to, to_end, from, from_end,
197     p_.pivot_buf_, &p_.pivot_source_, &p_.pivot_target_,
198     p_.pivot_buf_ + sizeof p_.pivot_buf_,
199     /*reset*/ false, flush, &err
200   );
201   if ( err == U_TRUNCATED_CHAR_FOUND || err == U_BUFFER_OVERFLOW_ERROR )
202     return false;
203   if ( U_FAILURE( err ) )
204     throw ZORBA_EXCEPTION(
205       zerr::ZOSE0006_TRANSCODING_ERROR, ERROR_PARAMS( u_errorName( err ) )
206     );
207   return true;
208 }
209 
to_utf8(char const ** from,char const * from_end,char_type ** to,char_type const * to_end,bool flush)210 bool icu_streambuf::to_utf8( char const **from, char const *from_end,
211                              char_type **to, char_type const *to_end,
212                              bool flush ) {
213   UErrorCode err = U_ZERO_ERROR;
214   ucnv_convertEx(
215     utf8_conv_, external_conv_, to, to_end, from, from_end,
216     g_.pivot_buf_, &g_.pivot_source_, &g_.pivot_target_,
217     g_.pivot_buf_ + sizeof g_.pivot_buf_,
218     /*reset*/ false, flush, &err
219   );
220   if ( err == U_TRUNCATED_CHAR_FOUND || err == U_BUFFER_OVERFLOW_ERROR )
221     return false;
222   if ( U_FAILURE( err ) )
223     throw ZORBA_EXCEPTION(
224       zerr::ZOSE0006_TRANSCODING_ERROR, ERROR_PARAMS( u_errorName( err ) )
225     );
226   return true;
227 }
228 
//
// Restores the array-bounds diagnostic.  Guarded by __GNUC__, like the
// matching pragmas before to_external(), so that other compilers never see a
// GCC-specific pragma.
//
#ifdef __GNUC__
# ifdef GCC_PRAGMA_DIAGNOSTIC_PUSH
#   pragma GCC diagnostic pop
# else
#   pragma GCC diagnostic warning "-Warray-bounds"
# endif /* GCC_PRAGMA_DIAGNOSTIC_PUSH */
#endif /* __GNUC__ */
234 
underflow()235 icu_streambuf::int_type icu_streambuf::underflow() {
236 #if ZORBA_DEBUG_ICU_STREAMBUF
237   printf( "underflow()\n" );
238 #endif
239   if ( no_conv_ )
240     return original()->sgetc();
241 
242   if ( gptr() >= egptr() ) {
243     utf8::storage_type *to = g_.utf8_char_;
244     utf8::storage_type const *const to_end = to + sizeof g_.utf8_char_;
245 
246     while ( true ) {
247       int_type const c = original()->sbumpc();
248       if ( traits_type::eq_int_type( c, traits_type::eof() ) )
249         return traits_type::eof();
250 
251       char const ebyte = traits_type::to_char_type( c );
252       char const *from = &ebyte;
253 
254       to_utf8( &from, from + 1, &to, to_end );
255       if ( to > g_.utf8_char_ ) {
256         setg( g_.utf8_char_, g_.utf8_char_, to );
257         g_.reset();
258         break;
259       }
260     }
261   }
262   return traits_type::to_int_type( *gptr() );
263 }
264 
xsgetn(char_type * to,streamsize size)265 streamsize icu_streambuf::xsgetn( char_type *to, streamsize size ) {
266 #if ZORBA_DEBUG_ICU_STREAMBUF
267   printf( "xsgetn()\n" );
268 #endif
269   if ( no_conv_ )
270     return original()->sgetn( to, size );
271 
272   streamsize return_size = 0;
273   char_type *const to_end = to + size;
274 
275   if ( streamsize const gsize = egptr() - gptr() ) {
276     // must first get any chars in g_.utf8_char_
277     streamsize const n = min( gsize, size );
278     traits_type::copy( to, gptr(), n );
279     gbump( n );
280     to += n;
281     size -= n, return_size += n;
282   }
283 
284   while ( size > 0 ) {
285     char ebuf[ Large_External_Buf_Size ];
286     streamsize const get = min( (streamsize)(sizeof ebuf), size );
287     if ( streamsize const got = original()->sgetn( ebuf, get ) ) {
288       char const *from = ebuf;
289       char_type const *const to_orig = to;
290       int_type const peek = original()->sgetc();
291       bool const flush = traits_type::eq_int_type( peek, traits_type::eof() );
292       to_utf8( &from, from + got, &to, to_end, flush );
293       streamsize const n = to - to_orig;
294       size -= n, return_size += n;
295       if ( flush )
296         break;
297     } else
298       break;
299   }
300   return return_size;
301 }
302 
xsputn(char_type const * from,streamsize size)303 streamsize icu_streambuf::xsputn( char_type const *from, streamsize size ) {
304 #if ZORBA_DEBUG_ICU_STREAMBUF
305   printf( "xsputn()\n" );
306 #endif
307   if ( no_conv_ )
308     return original()->sputn( from, size );
309 
310   streamsize return_size = 0;
311   char_type const *const from_end = from + size;
312   char ebuf[ Large_External_Buf_Size ], *to = ebuf;
313   char const *const to_end = to + sizeof ebuf;
314 
315   while ( size > 0 ) {
316     char_type const *const from_orig = from;
317     to_external( &from, from_end, &to, to_end );
318     streamsize n = to - ebuf;
319     if ( n && !original()->sputn( ebuf, n ) )
320       break;
321     to = ebuf;
322     n = from - from_orig;
323     size -= n, return_size += n;
324   }
325   return return_size;
326 }
327 
328 ///////////////////////////////////////////////////////////////////////////////
329 
330 } // namespace zorba
331 /* vim:set et sw=2 ts=2: */
332