1 /*
2 * Copyright 2006-2008 The FLWOR Foundation.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "stdafx.h"
18
19 // #define ZORBA_DEBUG_ICU_STREAMBUF
20 #ifdef ZORBA_DEBUG_ICU_STREAMBUF
21 # include <stdio.h>
22 #endif
23
24 #include <algorithm>
25 #include <cassert>
26 #include <stdexcept>
27
28 #include <zorba/config.h>
29 #include <zorba/diagnostic_list.h>
30
31 #include "diagnostics/assert.h"
32 #include "diagnostics/diagnostic.h"
33 #include "diagnostics/zorba_exception.h"
34 #include "util/cxx_util.h"
35 #include "util/string_util.h"
36 #include "util/utf8_util.h"
37
38 #include "icu_streambuf.h"
39
40 using namespace std;
41
42 namespace zorba {
43
// Size, in bytes, of the stack buffer used when transcoding a single byte
// (see overflow()).
const int Small_External_Buf_Size = 6;

// Size, in bytes, of the stack buffers used for bulk transcoding
// (see xsgetn() and xsputn()).
const int Large_External_Buf_Size = 4096;
46
47 ///////////////////////////////////////////////////////////////////////////////
48
reset()49 inline void icu_streambuf::buf_type_base::reset() {
50 pivot_source_ = pivot_target_ = pivot_buf_;
51 }
52
resetg()53 inline void icu_streambuf::resetg() {
54 setg(
55 g_.utf8_char_, g_.utf8_char_ + sizeof g_.utf8_char_,
56 g_.utf8_char_ + sizeof g_.utf8_char_
57 );
58 }
59
icu_streambuf(char const * charset,streambuf * orig)60 icu_streambuf::icu_streambuf( char const *charset, streambuf *orig ) :
61 proxy_streambuf( orig ),
62 no_conv_( !is_necessary( charset ) ),
63 external_conv_( no_conv_ ? nullptr : create_conv( charset ) ),
64 utf8_conv_( no_conv_ ? nullptr : create_conv( "UTF-8" ) )
65 {
66 if ( !orig )
67 throw invalid_argument( "null streambuf" );
68 resetg();
69 }
70
~icu_streambuf()71 icu_streambuf::~icu_streambuf() {
72 if ( external_conv_ )
73 ucnv_close( external_conv_ );
74 if ( utf8_conv_ )
75 ucnv_close( utf8_conv_ );
76 }
77
clear()78 void icu_streambuf::clear() {
79 if ( !no_conv_ ) {
80 ucnv_reset( external_conv_ );
81 ucnv_reset( utf8_conv_ );
82 g_.reset();
83 p_.reset();
84 resetg();
85 }
86 }
87
create_conv(char const * charset)88 UConverter* icu_streambuf::create_conv( char const *charset ) {
89 UErrorCode err = U_ZERO_ERROR;
90 UConverter *const conv = ucnv_open( charset, &err );
91 ucnv_setFromUCallBack(
92 conv, UCNV_FROM_U_CALLBACK_STOP, nullptr, nullptr, nullptr, &err
93 );
94 ucnv_setToUCallBack(
95 conv, UCNV_TO_U_CALLBACK_STOP, nullptr, nullptr, nullptr, &err
96 );
97 if ( !conv || U_FAILURE( err ) ) {
98 if ( conv )
99 ucnv_close( conv );
100 throw invalid_argument( u_errorName( err ) );
101 }
102 return conv;
103 }
104
is_necessary(char const * cc_charset)105 bool icu_streambuf::is_necessary( char const *cc_charset ) {
106 if ( !*cc_charset )
107 throw invalid_argument( "empty charset" );
108 //
109 // Apparently, ucnv_compareNames() doesn't consider "US-ASCII" an alias for
110 // "ASCII", so check for "US-ASCII" ourselves.
111 //
112 zstring charset( cc_charset );
113 ascii::trim_whitespace( charset );
114 ascii::to_upper( charset );
115 if ( charset == "US-ASCII" )
116 cc_charset += 3; // skip "US-"
117
118 return ucnv_compareNames( cc_charset, "ASCII" )
119 && ucnv_compareNames( cc_charset, "UTF-8" );
120 }
121
is_supported(char const * charset)122 bool icu_streambuf::is_supported( char const *charset ) {
123 try {
124 ucnv_close( create_conv( charset ) );
125 return true;
126 }
127 catch ( invalid_argument const& ) {
128 return false;
129 }
130 }
131
seekoff(off_type o,ios_base::seekdir d,ios_base::openmode m)132 icu_streambuf::pos_type icu_streambuf::seekoff( off_type o, ios_base::seekdir d,
133 ios_base::openmode m ) {
134 clear();
135 return original()->pubseekoff( o, d, m );
136 }
137
seekpos(pos_type p,ios_base::openmode m)138 icu_streambuf::pos_type icu_streambuf::seekpos( pos_type p,
139 ios_base::openmode m ) {
140 clear();
141 return original()->pubseekpos( p, m );
142 }
143
setbuf(char_type * p,streamsize s)144 streambuf* icu_streambuf::setbuf( char_type *p, streamsize s ) {
145 original()->pubsetbuf( p, s );
146 return this;
147 }
148
sync()149 int icu_streambuf::sync() {
150 return original()->pubsync();
151 }
152
overflow(int_type c)153 icu_streambuf::int_type icu_streambuf::overflow( int_type c ) {
154 #if ZORBA_DEBUG_ICU_STREAMBUF
155 printf( "overflow()\n" );
156 #endif
157 if ( no_conv_ )
158 return original()->sputc( c );
159
160 if ( traits_type::eq_int_type( c, traits_type::eof() ) )
161 return traits_type::eof();
162
163 char_type const utf8_byte = traits_type::to_char_type( c );
164 char_type const *from = &utf8_byte;
165 char ebuf[ Small_External_Buf_Size ], *to = ebuf;
166
167 #ifdef NDEBUG
168 to_external( &from, from + 1, &to, to + sizeof ebuf );
169 #else
170 bool const ok = to_external( &from, from + 1, &to, to + sizeof ebuf );
171 assert( ok );
172 #endif /* NDEBUG */
173 if ( streamsize const n = to - ebuf ) {
174 original()->sputn( ebuf, n );
175 p_.reset();
176 }
177
178 return c;
179 }
180
181 #ifdef __GNUC__
182 # ifdef GCC_PRAGMA_DIAGNOSTIC_PUSH
183 # pragma GCC diagnostic push
184 # endif /* GCC_PRAGMA_DIAGNOSTIC_PUSH */
185 //
186 // Disables warnings about p.pivot_buf_ + sizeof p.pivot_buf_.
187 //
188 # pragma GCC diagnostic ignored "-Warray-bounds"
189 #endif /* __GNUC__ */
190
to_external(char_type const ** from,char_type const * from_end,char ** to,char const * to_end,bool flush)191 bool icu_streambuf::to_external( char_type const **from,
192 char_type const *from_end, char **to,
193 char const *to_end, bool flush ) {
194 UErrorCode err = U_ZERO_ERROR;
195 ucnv_convertEx(
196 external_conv_, utf8_conv_, to, to_end, from, from_end,
197 p_.pivot_buf_, &p_.pivot_source_, &p_.pivot_target_,
198 p_.pivot_buf_ + sizeof p_.pivot_buf_,
199 /*reset*/ false, flush, &err
200 );
201 if ( err == U_TRUNCATED_CHAR_FOUND || err == U_BUFFER_OVERFLOW_ERROR )
202 return false;
203 if ( U_FAILURE( err ) )
204 throw ZORBA_EXCEPTION(
205 zerr::ZOSE0006_TRANSCODING_ERROR, ERROR_PARAMS( u_errorName( err ) )
206 );
207 return true;
208 }
209
to_utf8(char const ** from,char const * from_end,char_type ** to,char_type const * to_end,bool flush)210 bool icu_streambuf::to_utf8( char const **from, char const *from_end,
211 char_type **to, char_type const *to_end,
212 bool flush ) {
213 UErrorCode err = U_ZERO_ERROR;
214 ucnv_convertEx(
215 utf8_conv_, external_conv_, to, to_end, from, from_end,
216 g_.pivot_buf_, &g_.pivot_source_, &g_.pivot_target_,
217 g_.pivot_buf_ + sizeof g_.pivot_buf_,
218 /*reset*/ false, flush, &err
219 );
220 if ( err == U_TRUNCATED_CHAR_FOUND || err == U_BUFFER_OVERFLOW_ERROR )
221 return false;
222 if ( U_FAILURE( err ) )
223 throw ZORBA_EXCEPTION(
224 zerr::ZOSE0006_TRANSCODING_ERROR, ERROR_PARAMS( u_errorName( err ) )
225 );
226 return true;
227 }
228
229 #ifdef GCC_PRAGMA_DIAGNOSTIC_PUSH
230 # pragma GCC diagnostic pop
231 #else
232 # pragma GCC diagnostic warning "-Warray-bounds"
233 #endif /* GCC_PRAGMA_DIAGNOSTIC_PUSH */
234
underflow()235 icu_streambuf::int_type icu_streambuf::underflow() {
236 #if ZORBA_DEBUG_ICU_STREAMBUF
237 printf( "underflow()\n" );
238 #endif
239 if ( no_conv_ )
240 return original()->sgetc();
241
242 if ( gptr() >= egptr() ) {
243 utf8::storage_type *to = g_.utf8_char_;
244 utf8::storage_type const *const to_end = to + sizeof g_.utf8_char_;
245
246 while ( true ) {
247 int_type const c = original()->sbumpc();
248 if ( traits_type::eq_int_type( c, traits_type::eof() ) )
249 return traits_type::eof();
250
251 char const ebyte = traits_type::to_char_type( c );
252 char const *from = &ebyte;
253
254 to_utf8( &from, from + 1, &to, to_end );
255 if ( to > g_.utf8_char_ ) {
256 setg( g_.utf8_char_, g_.utf8_char_, to );
257 g_.reset();
258 break;
259 }
260 }
261 }
262 return traits_type::to_int_type( *gptr() );
263 }
264
xsgetn(char_type * to,streamsize size)265 streamsize icu_streambuf::xsgetn( char_type *to, streamsize size ) {
266 #if ZORBA_DEBUG_ICU_STREAMBUF
267 printf( "xsgetn()\n" );
268 #endif
269 if ( no_conv_ )
270 return original()->sgetn( to, size );
271
272 streamsize return_size = 0;
273 char_type *const to_end = to + size;
274
275 if ( streamsize const gsize = egptr() - gptr() ) {
276 // must first get any chars in g_.utf8_char_
277 streamsize const n = min( gsize, size );
278 traits_type::copy( to, gptr(), n );
279 gbump( n );
280 to += n;
281 size -= n, return_size += n;
282 }
283
284 while ( size > 0 ) {
285 char ebuf[ Large_External_Buf_Size ];
286 streamsize const get = min( (streamsize)(sizeof ebuf), size );
287 if ( streamsize const got = original()->sgetn( ebuf, get ) ) {
288 char const *from = ebuf;
289 char_type const *const to_orig = to;
290 int_type const peek = original()->sgetc();
291 bool const flush = traits_type::eq_int_type( peek, traits_type::eof() );
292 to_utf8( &from, from + got, &to, to_end, flush );
293 streamsize const n = to - to_orig;
294 size -= n, return_size += n;
295 if ( flush )
296 break;
297 } else
298 break;
299 }
300 return return_size;
301 }
302
xsputn(char_type const * from,streamsize size)303 streamsize icu_streambuf::xsputn( char_type const *from, streamsize size ) {
304 #if ZORBA_DEBUG_ICU_STREAMBUF
305 printf( "xsputn()\n" );
306 #endif
307 if ( no_conv_ )
308 return original()->sputn( from, size );
309
310 streamsize return_size = 0;
311 char_type const *const from_end = from + size;
312 char ebuf[ Large_External_Buf_Size ], *to = ebuf;
313 char const *const to_end = to + sizeof ebuf;
314
315 while ( size > 0 ) {
316 char_type const *const from_orig = from;
317 to_external( &from, from_end, &to, to_end );
318 streamsize n = to - ebuf;
319 if ( n && !original()->sputn( ebuf, n ) )
320 break;
321 to = ebuf;
322 n = from - from_orig;
323 size -= n, return_size += n;
324 }
325 return return_size;
326 }
327
328 ///////////////////////////////////////////////////////////////////////////////
329
330 } // namespace zorba
331 /* vim:set et sw=2 ts=2: */
332