1 /*
2 * Copyright (C) 2004-2009 Marc Boris Duerner
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public
6 * License as published by the Free Software Foundation; either
7 * version 2.1 of the License, or (at your option) any later version.
8 *
9 * As a special exception, you may use this file as part of a free
10 * software library without restriction. Specifically, if other files
11 * instantiate templates or use macros or inline functions from this
12 * file, or you compile this file and link it with other files to
13 * produce an executable, this file does not by itself cause the
14 * resulting executable to be covered by the GNU General Public
15 * License. This exception does not however invalidate any other
16 * reasons why the executable file might be covered by the GNU Library
17 * General Public License.
18 *
19 * This library is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 * Lesser General Public License for more details.
23 *
24 * You should have received a copy of the GNU Lesser General Public
25 * License along with this library; if not, write to the Free Software
26 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
27 */
28 #ifndef cxxtools_TextCodec_h
29 #define cxxtools_TextCodec_h
30
31 #include <cxxtools/api.h>
32 #include <cxxtools/char.h>
33 #include <cxxtools/conversionerror.h>
34 #include <string>
35
36 #ifdef CXXTOOLS_WITH_STD_LOCALE
37
38 namespace std {
39
40 template<>
41 class CXXTOOLS_API codecvt<cxxtools::Char, char, cxxtools::MBState> : public codecvt_base, public locale::facet
42 {
43 public:
44 static locale::id id;
__get_id(void)45 virtual locale::id& __get_id (void) const { return id; }
46
47 public:
48 explicit codecvt(size_t ref = 0);
49
50 virtual ~codecvt();
51
out(cxxtools::MBState & state,const cxxtools::Char * from,const cxxtools::Char * from_end,const cxxtools::Char * & from_next,char * to,char * to_end,char * & to_next)52 codecvt_base::result out(cxxtools::MBState& state,
53 const cxxtools::Char* from,
54 const cxxtools::Char* from_end,
55 const cxxtools::Char*& from_next,
56 char* to,
57 char* to_end,
58 char*& to_next) const
59 { return this->do_out(state, from, from_end, from_next, to, to_end, to_next); }
60
unshift(cxxtools::MBState & state,char * to,char * to_end,char * & to_next)61 codecvt_base::result unshift(cxxtools::MBState& state,
62 char* to,
63 char* to_end,
64 char*& to_next) const
65 { return this->do_unshift(state, to, to_end, to_next); }
66
in(cxxtools::MBState & state,const char * from,const char * from_end,const char * & from_next,cxxtools::Char * to,cxxtools::Char * to_end,cxxtools::Char * & to_next)67 codecvt_base::result in(cxxtools::MBState& state,
68 const char* from,
69 const char* from_end,
70 const char*& from_next,
71 cxxtools::Char* to,
72 cxxtools::Char* to_end,
73 cxxtools::Char*& to_next) const
74 { return this->do_in(state, from, from_end, from_next, to, to_end, to_next); }
75
encoding()76 int encoding() const
77 { return this->do_encoding(); }
78
always_noconv()79 bool always_noconv() const
80 { return this->do_always_noconv(); }
81
length(cxxtools::MBState & state,const char * from,const char * end,size_t max)82 int length(cxxtools::MBState& state, const char* from,
83 const char* end, size_t max) const
84 { return this->do_length(state, from, end, max); }
85
max_length()86 int max_length() const
87 { return this->do_max_length(); }
88
89 protected:
90 virtual codecvt_base::result do_out(cxxtools::MBState& state,
91 const cxxtools::Char* from,
92 const cxxtools::Char* from_end,
93 const cxxtools::Char*& from_next,
94 char* to,
95 char* to_end,
96 char*& to_next) const = 0;
97
98 virtual codecvt_base::result do_unshift(cxxtools::MBState& state,
99 char* to,
100 char* to_end,
101 char*& to_next) const = 0;
102
103 virtual codecvt_base::result do_in(cxxtools::MBState& state,
104 const char* from,
105 const char* from_end,
106 const char*& from_next,
107 cxxtools::Char* to,
108 cxxtools::Char* to_end,
109 cxxtools::Char*& to_next) const = 0;
110
111 virtual int do_encoding() const throw() = 0;
112
113 virtual bool do_always_noconv() const throw() = 0;
114
115 virtual int do_length(cxxtools::MBState&,
116 const char* from,
117 const char* end,
118 size_t max) const = 0;
119
120 virtual int do_max_length() const throw() = 0;
121 };
122
123
124 template<>
125 class CXXTOOLS_API codecvt<char, char, cxxtools::MBState> : public codecvt_base, public locale::facet
126 {
127 public:
128 static locale::id id;
__get_id(void)129 virtual locale::id& __get_id (void) const { return id; }
130
131 public:
132 explicit codecvt(size_t ref = 0);
133
134 virtual ~codecvt();
135
out(cxxtools::MBState & state,const char * from,const char * from_end,const char * & from_next,char * to,char * to_end,char * & to_next)136 codecvt_base::result out(cxxtools::MBState& state,
137 const char* from,
138 const char* from_end,
139 const char*& from_next,
140 char* to,
141 char* to_end,
142 char*& to_next) const
143 { return this->do_out(state, from, from_end, from_next, to, to_end, to_next); }
144
unshift(cxxtools::MBState & state,char * to,char * to_end,char * & to_next)145 codecvt_base::result unshift(cxxtools::MBState& state,
146 char* to,
147 char* to_end,
148 char*& to_next) const
149 { return this->do_unshift(state, to, to_end, to_next); }
150
in(cxxtools::MBState & state,const char * from,const char * from_end,const char * & from_next,char * to,char * to_end,char * & to_next)151 codecvt_base::result in(cxxtools::MBState& state,
152 const char* from,
153 const char* from_end,
154 const char*& from_next,
155 char* to, char* to_end,
156 char*& to_next) const
157 { return this->do_in(state, from, from_end, from_next, to, to_end, to_next); }
158
encoding()159 int encoding() const
160 { return this->do_encoding(); }
161
always_noconv()162 bool always_noconv() const
163 { return this->do_always_noconv(); }
164
length(cxxtools::MBState & state,const char * from,const char * end,size_t max)165 int length(cxxtools::MBState& state, const char* from,
166 const char* end, size_t max) const
167 { return this->do_length(state, from, end, max); }
168
max_length()169 int max_length() const
170 { return this->do_max_length(); }
171
172 protected:
173 virtual codecvt_base::result do_out(cxxtools::MBState& state,
174 const char* from,
175 const char* from_end,
176 const char*& from_next,
177 char* to,
178 char* to_end,
179 char*& to_next) const = 0;
180
181 virtual codecvt_base::result do_unshift(cxxtools::MBState& state,
182 char* to,
183 char* to_end,
184 char*& to_next) const = 0;
185
186 virtual codecvt_base::result do_in(cxxtools::MBState& state,
187 const char* from,
188 const char* from_end,
189 const char*& from_next,
190 char* to,
191 char* to_end,
192 char*& to_next) const = 0;
193
194 virtual int do_encoding() const throw() = 0;
195
196 virtual bool do_always_noconv() const throw() = 0;
197
198 virtual int do_length(cxxtools::MBState&,
199 const char* from,
200 const char* end,
201 size_t max) const = 0;
202
203 virtual int do_max_length() const throw() = 0;
204 };
205
206 }
207
208 #else // no CXXTOOLS_WITH_STD_LOCALE
209
210 namespace std {
211
/**
 * @brief Replacement for codecvt_base, compiled only in the branch where
 *        CXXTOOLS_WITH_STD_LOCALE is not defined.
 *
 * Supplies the conversion status constants and the result type which the
 * codecvt class template below builds on.
 */
class codecvt_base
{
    public:
        //! Status values are plain integers in this replacement.
        typedef int result;

        //! Conversion status constants, numbered 0 to 3 in declaration order.
        enum
        {
            ok,
            partial,
            error,
            noconv
        };

        //! Virtual so that codecs can be destroyed through a base pointer.
        virtual ~codecvt_base()
        {}
};
221
/**
 * @brief Replacement for the codecvt class template, compiled only in the
 *        branch where CXXTOOLS_WITH_STD_LOCALE is not defined.
 *
 * The public members forward to the protected do_*() virtuals of the same
 * name. All of these virtuals are pure, so a concrete codec has to derive
 * from this template and implement them.
 *
 * @param I internal character type
 * @param E external character type
 * @param S conversion state type
 */
template <typename I, typename E, typename S>
class codecvt : public std::codecvt_base
{
    public:
        typedef I InternT;
        typedef E ExternT;
        typedef S StateT;

    public:
        //! The argument is accepted for signature compatibility and ignored.
        explicit codecvt(size_t /*ref*/ = 0)
        {}

        virtual ~codecvt()
        {}

        //! Forwards all arguments unchanged to do_out() and returns its result.
        codecvt_base::result out(StateT& st,
                                 const InternT* src,
                                 const InternT* src_end,
                                 const InternT*& src_next,
                                 ExternT* dst,
                                 ExternT* dst_end,
                                 ExternT*& dst_next) const
        {
            return do_out(st, src, src_end, src_next, dst, dst_end, dst_next);
        }

        //! Forwards all arguments unchanged to do_unshift() and returns its result.
        codecvt_base::result unshift(StateT& st,
                                     ExternT* dst,
                                     ExternT* dst_end,
                                     ExternT*& dst_next) const
        {
            return do_unshift(st, dst, dst_end, dst_next);
        }

        //! Forwards all arguments unchanged to do_in() and returns its result.
        codecvt_base::result in(StateT& st,
                                const ExternT* src,
                                const ExternT* src_end,
                                const ExternT*& src_next,
                                InternT* dst,
                                InternT* dst_end,
                                InternT*& dst_next) const
        {
            return do_in(st, src, src_end, src_next, dst, dst_end, dst_next);
        }

        //! Returns the value of do_encoding().
        int encoding() const
        {
            return do_encoding();
        }

        //! Returns the value of do_always_noconv().
        bool always_noconv() const
        {
            return do_always_noconv();
        }

        //! Forwards all arguments unchanged to do_length() and returns its result.
        int length(StateT& st, const ExternT* src,
                   const ExternT* src_end, size_t limit) const
        {
            return do_length(st, src, src_end, limit);
        }

        //! Returns the value of do_max_length().
        int max_length() const
        {
            return do_max_length();
        }

    protected:
        // Customization points; all of them must be provided by a derived codec.

        //! Called by in().
        virtual result do_in(StateT& st,
                             const ExternT* src,
                             const ExternT* src_end,
                             const ExternT*& src_next,
                             InternT* dst,
                             InternT* dst_end,
                             InternT*& dst_next) const = 0;

        //! Called by out().
        virtual result do_out(StateT& st,
                              const InternT* src,
                              const InternT* src_end,
                              const InternT*& src_next,
                              ExternT* dst,
                              ExternT* dst_end,
                              ExternT*& dst_next) const = 0;

        //! Called by always_noconv().
        virtual bool do_always_noconv() const = 0;

        //! Called by length().
        virtual int do_length(StateT& st,
                              const ExternT* src,
                              const ExternT* src_end,
                              size_t limit) const = 0;

        //! Called by max_length().
        virtual int do_max_length() const = 0;

        //! Called by unshift().
        virtual std::codecvt_base::result do_unshift(StateT&,
                                                     ExternT*,
                                                     ExternT*,
                                                     ExternT*&) const = 0;

        //! Called by encoding().
        virtual int do_encoding() const = 0;
};
297
298 }
299
300 #endif // CXXTOOLS_WITH_STD_LOCALE
301
302 namespace cxxtools {
303
304 /**
305 * @brief Generic TextCodec class/facet which may be subclassed by specific Codec classes.
306 *
307 * This class contains default implementations for the methods do_unshift(), do_encoding()
308 * and do_always_noconv() so sub-classes do not have to implement this default behaviour.
309 *
310 * Codecs are used to convert one Text-encoding into another Text-encoding. The internal
311 * and external data type can be specified using the template parameter 'I' (internal) and
312 * 'E' (external).
313 *
314 * When used on a platform which supports locales and facets the conversion may use
315 * locale-specific conversion of the Text.
316 *
317 * This class derives from facet std::codecvt. Further documentation can be found there.
318 *
319 * @param I The character type associated with the internal code set.
320 * @param E The character type associated with the external code set.
321 *
322 * @see Utf8Codec
323 * @see Utf16Codec
324 * @see Utf32Codec
325 */
326 template <typename I, typename E>
327 class TextCodec : public std::codecvt<I, E, cxxtools::MBState>
328 {
329 public:
330 typedef I InternT;
331 typedef E ExternT;
332
333 public:
334 /**
335 * @brief Constructs a new TextCodec object.
336 *
337 * The internal and external type are specified by the template parameters of the class.
338 *
339 * @param ref This parameter is passed to std::codecvt. When ref == 0 the locale takes care
340 * of deleting the facet. If ref == 1 the locale does not destroy the facet.
341 */
342 explicit TextCodec(size_t ref = 0)
343 : std::codecvt<InternT, ExternT, MBState>(ref)
344 , _refs(ref)
345 {}
346
347 public:
348 //! Empty desctructor
~TextCodec()349 virtual ~TextCodec()
350 {}
351
refs()352 size_t refs() const
353 { return _refs; }
354
355 private:
356 size_t _refs;
357 };
358
359 /** @brief helper template function for decoding data using a codec.
360
361 This template function makes it easy to use a codec to convert a string with
362 one encoding to another.
363 */
364 template <typename CodecType>
decode(const typename CodecType::ExternT * data,unsigned size)365 std::basic_string<typename CodecType::InternT> decode(const typename CodecType::ExternT* data, unsigned size)
366 {
367 CodecType codec;
368
369 typename CodecType::InternT to[64];
370 MBState state;
371 std::basic_string<typename CodecType::InternT> ret;
372 const typename CodecType::ExternT* from = data;
373
374 typename CodecType::result r;
375 do
376 {
377 typename CodecType::InternT* to_next = to;
378
379 const typename CodecType::ExternT* from_next = from;
380 r = codec.in(state, from, from + size, from_next, to, to + sizeof(to)/sizeof(typename CodecType::InternT), to_next);
381
382 if (r == CodecType::error)
383 throw ConversionError("character conversion failed");
384
385 if (r == CodecType::partial && from_next == from)
386 throw ConversionError("character conversion failed - unexpected end of input sequence");
387
388 ret.append(to, to_next);
389
390 size -= (from_next - from);
391 from = from_next;
392
393 } while (r == CodecType::partial);
394
395 return ret;
396 }
397
/** @brief helper template function for decoding strings using a codec.

    Convenience overload which takes the external representation as a string
    and hands its character data and length to the pointer based decode
    function.

    @code
      std::string utf8data = ...;
      cxxtools::String unicodeString = cxxtools::decode<cxxtools::Utf8Codec>(utf8data);

      std::string base64data = ...;
      std::string data = cxxtools::decode<cxxtools::Base64Codec>(base64data);
    @endcode

    @param data the encoded input
    @return the decoded data as a string of CodecType::InternT
 */
template <typename CodecType>
std::basic_string<typename CodecType::InternT> decode(const std::basic_string<typename CodecType::ExternT>& data)
{
    const typename CodecType::ExternT* const first = data.data();
    const unsigned count = static_cast<unsigned>(data.size());

    return decode<CodecType>(first, count);
}
416
417
418 template <typename CodecType>
encode(const typename CodecType::InternT * data,unsigned size)419 std::basic_string<typename CodecType::ExternT> encode(const typename CodecType::InternT* data, unsigned size)
420 {
421 CodecType codec;
422 char to[64];
423 MBState state;
424
425 typename CodecType::result r;
426 const typename CodecType::InternT* from = data;
427 std::basic_string<typename CodecType::ExternT> ret;
428
429 do{
430 const typename CodecType::InternT* from_next;
431
432 typename CodecType::ExternT* to_next = to;
433 r = codec.out(state, from, from + size, from_next, to, to + sizeof(to), to_next);
434
435 if (r == CodecType::error)
436 throw ConversionError("character conversion failed");
437
438 ret.append(to, to_next);
439
440 size -= (from_next - from);
441 from = from_next;
442
443 } while (r == CodecType::partial);
444
445 typename CodecType::ExternT* to_next = to;
446 r = codec.unshift(state, to, to + sizeof(to), to_next);
447 if (r == CodecType::error)
448 throw ConversionError("character conversion failed");
449
450 ret.append(to, to_next);
451
452 return ret;
453 }
454
/** @brief helper template function for encoding strings using a codec.

    Convenience overload which takes the internal representation as a string
    and hands its character data and length to the pointer based encode
    function.

    @code
      cxxtools::String unicodeString = ...;
      std::string utf8data = cxxtools::encode<cxxtools::Utf8Codec>(unicodeString);

      std::string base64data = cxxtools::encode<cxxtools::Base64Codec>("some data");
    @endcode

    @param data the input to encode
    @return the encoded data as a string of CodecType::ExternT
 */
template <typename CodecType>
std::basic_string<typename CodecType::ExternT> encode(const std::basic_string<typename CodecType::InternT>& data)
{
    const typename CodecType::InternT* const first = data.data();
    const unsigned count = static_cast<unsigned>(data.size());

    return encode<CodecType>(first, count);
}
472
473 }
474
475 #endif
476