1 /*
2  * Copyright (C) 2004-2009 Marc Boris Duerner
3  *
4  * This library is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU Lesser General Public
6  * License as published by the Free Software Foundation; either
7  * version 2.1 of the License, or (at your option) any later version.
8  *
9  * As a special exception, you may use this file as part of a free
10  * software library without restriction. Specifically, if other files
11  * instantiate templates or use macros or inline functions from this
12  * file, or you compile this file and link it with other files to
13  * produce an executable, this file does not by itself cause the
14  * resulting executable to be covered by the GNU General Public
15  * License. This exception does not however invalidate any other
16  * reasons why the executable file might be covered by the GNU Library
17  * General Public License.
18  *
19  * This library is distributed in the hope that it will be useful,
20  * but WITHOUT ANY WARRANTY; without even the implied warranty of
21  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
22  * Lesser General Public License for more details.
23  *
24  * You should have received a copy of the GNU Lesser General Public
25  * License along with this library; if not, write to the Free Software
26  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
27  */
28 #ifndef cxxtools_TextCodec_h
29 #define cxxtools_TextCodec_h
30 
31 #include <cxxtools/api.h>
32 #include <cxxtools/char.h>
33 #include <cxxtools/conversionerror.h>
34 #include <string>
35 
36 #ifdef CXXTOOLS_WITH_STD_LOCALE
37 
38 namespace std {
39 
40 template<>
41 class CXXTOOLS_API codecvt<cxxtools::Char, char, cxxtools::MBState> : public codecvt_base, public locale::facet
42 {
43     public:
44         static locale::id id;
__get_id(void)45         virtual locale::id& __get_id (void) const { return id; }
46 
47     public:
48         explicit codecvt(size_t ref = 0);
49 
50         virtual ~codecvt();
51 
out(cxxtools::MBState & state,const cxxtools::Char * from,const cxxtools::Char * from_end,const cxxtools::Char * & from_next,char * to,char * to_end,char * & to_next)52         codecvt_base::result out(cxxtools::MBState& state,
53                                  const cxxtools::Char* from,
54                                  const cxxtools::Char* from_end,
55                                  const cxxtools::Char*& from_next,
56                                  char* to,
57                                  char* to_end,
58                                  char*& to_next) const
59         { return this->do_out(state, from, from_end, from_next, to, to_end, to_next); }
60 
unshift(cxxtools::MBState & state,char * to,char * to_end,char * & to_next)61         codecvt_base::result unshift(cxxtools::MBState& state,
62                                      char* to,
63                                      char* to_end,
64                                      char*& to_next) const
65         { return this->do_unshift(state, to, to_end, to_next); }
66 
in(cxxtools::MBState & state,const char * from,const char * from_end,const char * & from_next,cxxtools::Char * to,cxxtools::Char * to_end,cxxtools::Char * & to_next)67         codecvt_base::result in(cxxtools::MBState& state,
68                                 const char* from,
69                                 const char* from_end,
70                                 const char*& from_next,
71                                 cxxtools::Char* to,
72                                 cxxtools::Char* to_end,
73                                 cxxtools::Char*& to_next) const
74         { return this->do_in(state, from, from_end, from_next, to, to_end, to_next); }
75 
encoding()76         int encoding() const
77         { return this->do_encoding(); }
78 
always_noconv()79         bool always_noconv() const
80         { return this->do_always_noconv(); }
81 
length(cxxtools::MBState & state,const char * from,const char * end,size_t max)82         int length(cxxtools::MBState& state, const char* from,
83                    const char* end, size_t max) const
84         { return this->do_length(state, from, end, max); }
85 
max_length()86         int max_length() const
87         { return this->do_max_length(); }
88 
89     protected:
90         virtual codecvt_base::result do_out(cxxtools::MBState& state,
91                                             const cxxtools::Char* from,
92                                             const cxxtools::Char* from_end,
93                                             const cxxtools::Char*& from_next,
94                                             char* to,
95                                             char* to_end,
96                                             char*& to_next) const = 0;
97 
98         virtual codecvt_base::result do_unshift(cxxtools::MBState& state,
99                                                 char* to,
100                                                 char* to_end,
101                                                 char*& to_next) const = 0;
102 
103         virtual codecvt_base::result do_in(cxxtools::MBState& state,
104                                            const char* from,
105                                            const char* from_end,
106                                            const char*& from_next,
107                                            cxxtools::Char* to,
108                                            cxxtools::Char* to_end,
109                                            cxxtools::Char*& to_next) const = 0;
110 
111         virtual int do_encoding() const throw() = 0;
112 
113         virtual bool do_always_noconv() const throw() = 0;
114 
115         virtual int do_length(cxxtools::MBState&,
116                               const char* from,
117                               const char* end,
118                               size_t max) const = 0;
119 
120         virtual int do_max_length() const throw() = 0;
121 };
122 
123 
124 template<>
125 class CXXTOOLS_API codecvt<char, char, cxxtools::MBState> : public codecvt_base, public locale::facet
126 {
127     public:
128         static locale::id id;
__get_id(void)129         virtual locale::id& __get_id (void) const { return id; }
130 
131     public:
132         explicit codecvt(size_t ref = 0);
133 
134         virtual ~codecvt();
135 
out(cxxtools::MBState & state,const char * from,const char * from_end,const char * & from_next,char * to,char * to_end,char * & to_next)136         codecvt_base::result out(cxxtools::MBState& state,
137                                  const char* from,
138                                  const char* from_end,
139                                  const char*& from_next,
140                                  char* to,
141                                  char* to_end,
142                                  char*& to_next) const
143         { return this->do_out(state, from, from_end, from_next, to, to_end, to_next); }
144 
unshift(cxxtools::MBState & state,char * to,char * to_end,char * & to_next)145         codecvt_base::result unshift(cxxtools::MBState& state,
146                                      char* to,
147                                      char* to_end,
148                                      char*& to_next) const
149         { return this->do_unshift(state, to, to_end, to_next); }
150 
in(cxxtools::MBState & state,const char * from,const char * from_end,const char * & from_next,char * to,char * to_end,char * & to_next)151         codecvt_base::result in(cxxtools::MBState& state,
152                                 const char* from,
153                                 const char* from_end,
154                                 const char*& from_next,
155                                 char* to, char* to_end,
156                                 char*& to_next) const
157         { return this->do_in(state, from, from_end, from_next, to, to_end, to_next); }
158 
encoding()159         int encoding() const
160         { return this->do_encoding(); }
161 
always_noconv()162         bool always_noconv() const
163         { return this->do_always_noconv(); }
164 
length(cxxtools::MBState & state,const char * from,const char * end,size_t max)165         int length(cxxtools::MBState& state, const char* from,
166                    const char* end, size_t max) const
167         { return this->do_length(state, from, end, max); }
168 
max_length()169         int max_length() const
170         { return this->do_max_length(); }
171 
172     protected:
173         virtual codecvt_base::result do_out(cxxtools::MBState& state,
174                                             const char* from,
175                                             const char* from_end,
176                                             const char*& from_next,
177                                             char* to,
178                                             char* to_end,
179                                             char*& to_next) const = 0;
180 
181         virtual codecvt_base::result do_unshift(cxxtools::MBState& state,
182                                                 char* to,
183                                                 char* to_end,
184                                                 char*& to_next) const = 0;
185 
186         virtual codecvt_base::result do_in(cxxtools::MBState& state,
187                                            const char* from,
188                                            const char* from_end,
189                                            const char*& from_next,
190                                            char* to,
191                                            char* to_end,
192                                            char*& to_next) const = 0;
193 
194         virtual int do_encoding() const throw() = 0;
195 
196         virtual bool do_always_noconv() const throw() = 0;
197 
198         virtual int do_length(cxxtools::MBState&,
199                               const char* from,
200                               const char* end,
201                               size_t max) const = 0;
202 
203         virtual int do_max_length() const throw() = 0;
204 };
205 
206 }
207 
208 #else // no CXXTOOLS_WITH_STD_LOCALE
209 
210 namespace std {
211 
/**
 * @brief Replacement for codecvt_base, declared in the branch which is
 *        compiled when CXXTOOLS_WITH_STD_LOCALE is not defined.
 *
 * Supplies the conversion status constants and the integral result type
 * used by the codecvt template declared after it.
 */
class codecvt_base
{
    public:
        //! Conversion status; holds one of the enumerators below.
        typedef int result;

        //! Status values, numbered consecutively starting at 0.
        enum
        {
            ok,
            partial,
            error,
            noconv
        };

        //! Virtual, so derived codecs can be destroyed through this type.
        virtual ~codecvt_base()
        {
        }
};
221 
/**
 * @brief Replacement for the codecvt facet template, declared in the branch
 *        which is compiled when CXXTOOLS_WITH_STD_LOCALE is not defined.
 *
 * The public members are non-virtual entry points; each of them passes its
 * arguments unchanged to the protected pure virtual do_*() member of the same
 * name, which a concrete codec has to implement.
 *
 * @param I internal character type
 * @param E external character type
 * @param S conversion state type
 */
template <typename I, typename E, typename S>
class codecvt : public std::codecvt_base
{
    public:
        typedef I InternT;
        typedef E ExternT;
        typedef S StateT;

        //! The argument is accepted but not used by this implementation.
        explicit codecvt(size_t ref = 0)
        {
        }

        virtual ~codecvt()
        {
        }

        //! Internal to external direction; forwards to do_out().
        codecvt_base::result out(StateT& state,
                                 const InternT* from, const InternT* from_end,
                                 const InternT*& from_next,
                                 ExternT* to, ExternT* to_end, ExternT*& to_next) const
        {
            return do_out(state, from, from_end, from_next, to, to_end, to_next);
        }

        //! Forwards to do_unshift().
        codecvt_base::result unshift(StateT& state,
                                     ExternT* to, ExternT* to_end, ExternT*& to_next) const
        {
            return do_unshift(state, to, to_end, to_next);
        }

        //! External to internal direction; forwards to do_in().
        codecvt_base::result in(StateT& state,
                                const ExternT* from, const ExternT* from_end,
                                const ExternT*& from_next,
                                InternT* to, InternT* to_end, InternT*& to_next) const
        {
            return do_in(state, from, from_end, from_next, to, to_end, to_next);
        }

        //! Forwards to do_encoding().
        int encoding() const
        {
            return do_encoding();
        }

        //! Forwards to do_always_noconv().
        bool always_noconv() const
        {
            return do_always_noconv();
        }

        //! Forwards to do_length().
        int length(StateT& state, const ExternT* from,
                   const ExternT* end, size_t max) const
        {
            return do_length(state, from, end, max);
        }

        //! Forwards to do_max_length().
        int max_length() const
        {
            return do_max_length();
        }

    protected:
        // Customization points. The declaration order is kept as it was,
        // because it determines the order of the virtual functions.

        virtual result do_in(StateT& state,
                             const ExternT* from, const ExternT* from_end,
                             const ExternT*& from_next,
                             InternT* to, InternT* to_end, InternT*& to_next) const = 0;

        virtual result do_out(StateT& state,
                              const InternT* from, const InternT* from_end,
                              const InternT*& from_next,
                              ExternT* to, ExternT* to_end, ExternT*& to_next) const = 0;

        virtual bool do_always_noconv() const = 0;

        virtual int do_length(StateT& state, const ExternT* from,
                              const ExternT* end, size_t max) const = 0;

        virtual int do_max_length() const = 0;

        virtual std::codecvt_base::result do_unshift(StateT& state,
                                                     ExternT* to,
                                                     ExternT* to_end,
                                                     ExternT*& to_next) const = 0;

        virtual int do_encoding() const = 0;
};
297 
298 }
299 
300 #endif // CXXTOOLS_WITH_STD_LOCALE
301 
302 namespace cxxtools {
303 
304 /**
305  * @brief Generic TextCodec class/facet which may be subclassed by specific Codec classes.
306  *
307  * This class contains default implementations for the methods do_unshift(), do_encoding()
308  * and do_always_noconv() so sub-classes do not have to implement this default behaviour.
309  *
310  * Codecs are used to convert one Text-encoding into another Text-encoding. The internal
311  * and external data type can be specified using the template parameter 'I' (internal) and
312  * 'E' (external).
313  *
314  * When used on a platform which supports locales and facets the conversion may use
315  * locale-specific conversion of the Text.
316  *
317  * This class derives from facet std::codecvt. Further documentation can be found there.
318  *
319  * @param I The character type associated with the internal code set.
320  * @param E The character type associated with the external code set.
321  *
322  * @see Utf8Codec
323  * @see Utf16Codec
324  * @see Utf32Codec
325  */
326 template <typename I, typename E>
327 class TextCodec : public std::codecvt<I, E, cxxtools::MBState>
328 {
329     public:
330         typedef I InternT;
331         typedef E ExternT;
332 
333     public:
334         /**
335          * @brief Constructs a new TextCodec object.
336          *
337          * The internal and external type are specified by the template parameters of the class.
338          *
339          * @param ref This parameter is passed to std::codecvt. When ref == 0 the locale takes care
340          * of deleting the facet. If ref == 1 the locale does not destroy the facet.
341          */
342         explicit TextCodec(size_t ref = 0)
343         : std::codecvt<InternT, ExternT, MBState>(ref)
344         , _refs(ref)
345         {}
346 
347     public:
348         //! Empty desctructor
~TextCodec()349         virtual ~TextCodec()
350         {}
351 
refs()352         size_t refs() const
353         { return _refs; }
354 
355     private:
356         size_t _refs;
357 };
358 
359 /** @brief helper template function for decoding data using a codec.
360 
361     This template function makes it easy to use a codec to convert a string with
362     one encoding to another.
363  */
364 template <typename CodecType>
decode(const typename CodecType::ExternT * data,unsigned size)365 std::basic_string<typename CodecType::InternT> decode(const typename CodecType::ExternT* data, unsigned size)
366 {
367     CodecType codec;
368 
369     typename CodecType::InternT to[64];
370     MBState state;
371     std::basic_string<typename CodecType::InternT> ret;
372     const typename CodecType::ExternT* from = data;
373 
374     typename CodecType::result r;
375     do
376     {
377         typename CodecType::InternT* to_next = to;
378 
379         const typename CodecType::ExternT* from_next = from;
380         r = codec.in(state, from, from + size, from_next, to, to + sizeof(to)/sizeof(typename CodecType::InternT), to_next);
381 
382         if (r == CodecType::error)
383             throw ConversionError("character conversion failed");
384 
385         if (r == CodecType::partial && from_next == from)
386             throw ConversionError("character conversion failed - unexpected end of input sequence");
387 
388         ret.append(to, to_next);
389 
390         size -= (from_next - from);
391         from = from_next;
392 
393     } while (r == CodecType::partial);
394 
395     return ret;
396 }
397 
/** @brief Helper template function for decoding strings using a codec.

    Convenience overload which passes the content of @a data on to the
    pointer/size overload of decode(). Note that the length is handed over
    as an unsigned value there.

    @code
      std::string utf8data = ...;
      cxxtools::String unicodeString = cxxtools::decode<cxxtools::Utf8Codec>(utf8data);

      std::string base64data = ...;
      std::string data = cxxtools::decode<cxxtools::Base64Codec>(base64data);
    @endcode

    @param data  the text in the external encoding
    @return the text converted to the internal encoding
 */
template <typename CodecType>
std::basic_string<typename CodecType::InternT> decode(const std::basic_string<typename CodecType::ExternT>& data)
{
    const typename CodecType::ExternT* const encoded = data.data();
    return decode<CodecType>(encoded, data.size());
}
416 
417 
418 template <typename CodecType>
encode(const typename CodecType::InternT * data,unsigned size)419 std::basic_string<typename CodecType::ExternT> encode(const typename CodecType::InternT* data, unsigned size)
420 {
421     CodecType codec;
422     char to[64];
423     MBState state;
424 
425     typename CodecType::result r;
426     const typename CodecType::InternT* from = data;
427     std::basic_string<typename CodecType::ExternT> ret;
428 
429     do{
430         const typename CodecType::InternT* from_next;
431 
432         typename CodecType::ExternT* to_next = to;
433         r = codec.out(state, from, from + size, from_next, to, to + sizeof(to), to_next);
434 
435         if (r == CodecType::error)
436             throw ConversionError("character conversion failed");
437 
438         ret.append(to, to_next);
439 
440         size -= (from_next - from);
441         from = from_next;
442 
443     } while (r == CodecType::partial);
444 
445     typename CodecType::ExternT* to_next = to;
446     r = codec.unshift(state, to, to + sizeof(to), to_next);
447     if (r == CodecType::error)
448         throw ConversionError("character conversion failed");
449 
450     ret.append(to, to_next);
451 
452     return ret;
453 }
454 
/** @brief Helper template function for encoding strings using a codec.

    Convenience overload which passes the content of @a data on to the
    pointer/size overload of encode(). Note that the length is handed over
    as an unsigned value there.

    @code
      cxxtools::String unicodeString = ...;
      std::string utf8data = cxxtools::encode<cxxtools::Utf8Codec>(unicodeString);

      std::string base64data = cxxtools::encode<cxxtools::Base64Codec>("some data");
    @endcode

    @param data  the text in the internal encoding
    @return the text converted to the external encoding
 */
template <typename CodecType>
std::basic_string<typename CodecType::ExternT> encode(const std::basic_string<typename CodecType::InternT>& data)
{
    const typename CodecType::InternT* const plain = data.data();
    return encode<CodecType>(plain, data.size());
}
472 
473 }
474 
475 #endif
476