1 /********************************************************************************
2 * *
3 * U T F - 3 2 T e x t C o d e c *
4 * *
5 *********************************************************************************
6 * Copyright (C) 2002,2006 by L.Johnson & J.van der Zijp. All Rights Reserved. *
7 *********************************************************************************
8 * This library is free software; you can redistribute it and/or *
9 * modify it under the terms of the GNU Lesser General Public *
10 * License as published by the Free Software Foundation; either *
11 * version 2.1 of the License, or (at your option) any later version. *
12 * *
13 * This library is distributed in the hope that it will be useful, *
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU *
16 * Lesser General Public License for more details. *
17 * *
18 * You should have received a copy of the GNU Lesser General Public *
19 * License along with this library; if not, write to the Free Software *
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. *
21 *********************************************************************************
22 * $Id: FXUTF32Codec.cpp,v 1.16 2006/01/22 17:58:50 fox Exp $ *
23 ********************************************************************************/
24 #include "xincs.h"
25 #include "fxver.h"
26 #include "fxdefs.h"
27 #include "FXHash.h"
28 #include "FXStream.h"
29 #include "FXDict.h"
30 #include "FXString.h"
31 #include "FXTextCodec.h"
32 #include "FXUTF32Codec.h"
33
34
35 /*
36 Notes:
37 */
38
39 /*******************************************************************************/
40
41 namespace FX {
42
43
44 const FXwchar BOM_BE=0x0000FEFF;
45 const FXwchar BOM_LE=0xFFFE0000;
46
47
48 FXIMPLEMENT(FXUTF32BECodec,FXTextCodec,NULL,0)
49
50
51 // Convert from utf32be
mb2wc(FXwchar & wc,const FXchar * src,FXint nsrc) const52 FXint FXUTF32BECodec::mb2wc(FXwchar& wc,const FXchar* src,FXint nsrc) const {
53 if(nsrc<4) return -4;
54 #if FOX_BIGENDIAN
55 ((FXuchar*)&wc)[0]=src[0];
56 ((FXuchar*)&wc)[1]=src[1];
57 ((FXuchar*)&wc)[2]=src[2];
58 ((FXuchar*)&wc)[3]=src[3];
59 #else
60 ((FXuchar*)&wc)[3]=src[0];
61 ((FXuchar*)&wc)[2]=src[1];
62 ((FXuchar*)&wc)[1]=src[2];
63 ((FXuchar*)&wc)[0]=src[3];
64 #endif
65 return 4;
66 }
67
68
69 // Convert to utf32be
wc2mb(FXchar * dst,FXint ndst,FXwchar wc) const70 FXint FXUTF32BECodec::wc2mb(FXchar* dst,FXint ndst,FXwchar wc) const {
71 if(ndst<4) return -4;
72 #if FOX_BIGENDIAN
73 dst[0]=((FXuchar*)&wc)[0];
74 dst[1]=((FXuchar*)&wc)[1];
75 dst[2]=((FXuchar*)&wc)[2];
76 dst[3]=((FXuchar*)&wc)[3];
77 #else
78 dst[0]=((FXuchar*)&wc)[3];
79 dst[1]=((FXuchar*)&wc)[2];
80 dst[2]=((FXuchar*)&wc)[1];
81 dst[3]=((FXuchar*)&wc)[0];
82 #endif
83 return 4;
84 }
85
86
87 // Return name
name() const88 const FXchar* FXUTF32BECodec::name() const {
89 return "UTF-32BE";
90 }
91
92
93 // Return the IANA mime name for this codec
mimeName() const94 const FXchar* FXUTF32BECodec::mimeName() const {
95 return "UTF-32BE";
96 }
97
98
99 // Return code for UTF-32
mibEnum() const100 FXint FXUTF32BECodec::mibEnum() const {
101 return 1018;
102 }
103
104
105 // Return aliases
aliases() const106 const FXchar* const* FXUTF32BECodec::aliases() const {
107 static const FXchar *const list[]={"UTF-32BE",NULL};
108 return list;
109 }
110
111
112 /*******************************************************************************/
113
114 FXIMPLEMENT(FXUTF32LECodec,FXTextCodec,NULL,0)
115
116
117 // Convert from utf32le
mb2wc(FXwchar & wc,const FXchar * src,FXint nsrc) const118 FXint FXUTF32LECodec::mb2wc(FXwchar& wc,const FXchar* src,FXint nsrc) const {
119 if(nsrc<4) return -4;
120 #if FOX_BIGENDIAN
121 ((FXuchar*)&wc)[0]=src[3];
122 ((FXuchar*)&wc)[1]=src[2];
123 ((FXuchar*)&wc)[2]=src[1];
124 ((FXuchar*)&wc)[3]=src[0];
125 #else
126 ((FXuchar*)&wc)[3]=src[3];
127 ((FXuchar*)&wc)[2]=src[2];
128 ((FXuchar*)&wc)[1]=src[1];
129 ((FXuchar*)&wc)[0]=src[0];
130 #endif
131 return 4;
132 }
133
134
135 // Convert to utf32le
wc2mb(FXchar * dst,FXint ndst,FXwchar wc) const136 FXint FXUTF32LECodec::wc2mb(FXchar* dst,FXint ndst,FXwchar wc) const {
137 if(ndst<4) return -4;
138 #if FOX_BIGENDIAN
139 dst[3]=((FXuchar*)&wc)[0];
140 dst[2]=((FXuchar*)&wc)[1];
141 dst[1]=((FXuchar*)&wc)[2];
142 dst[0]=((FXuchar*)&wc)[3];
143 #else
144 dst[3]=((FXuchar*)&wc)[3];
145 dst[2]=((FXuchar*)&wc)[2];
146 dst[1]=((FXuchar*)&wc)[1];
147 dst[0]=((FXuchar*)&wc)[0];
148 #endif
149 return 4;
150 }
151
152
153 // Return name
name() const154 const FXchar* FXUTF32LECodec::name() const {
155 return "UTF-32LE";
156 }
157
158
159 // Return the IANA mime name for this codec
mimeName() const160 const FXchar* FXUTF32LECodec::mimeName() const {
161 return "UTF-32LE";
162 }
163
164
165 // Return code for UTF-32
mibEnum() const166 FXint FXUTF32LECodec::mibEnum() const {
167 return 1019;
168 }
169
170
171 // Return aliases
aliases() const172 const FXchar* const* FXUTF32LECodec::aliases() const {
173 static const FXchar *const list[]={"UTF-32LE",NULL};
174 return list;
175 }
176
177
178 /*******************************************************************************/
179
180 FXIMPLEMENT(FXUTF32Codec,FXTextCodec,NULL,0)
181
182
183 // Convert utf32
mb2wc(FXwchar & wc,const FXchar * src,FXint nsrc) const184 FXint FXUTF32Codec::mb2wc(FXwchar& wc,const FXchar* src,FXint nsrc) const {
185 if(nsrc<4) return -4;
186 #if FOX_BIGENDIAN
187 ((FXuchar*)&wc)[0]=src[0];
188 ((FXuchar*)&wc)[1]=src[1];
189 ((FXuchar*)&wc)[2]=src[2];
190 ((FXuchar*)&wc)[3]=src[3];
191 #else
192 ((FXuchar*)&wc)[3]=src[0];
193 ((FXuchar*)&wc)[2]=src[1];
194 ((FXuchar*)&wc)[1]=src[2];
195 ((FXuchar*)&wc)[0]=src[3];
196 #endif
197 if(wc==BOM_BE){
198 if(nsrc<8) return -8;
199 #if FOX_BIGENDIAN
200 ((FXuchar*)&wc)[0]=src[4];
201 ((FXuchar*)&wc)[1]=src[5];
202 ((FXuchar*)&wc)[2]=src[6];
203 ((FXuchar*)&wc)[3]=src[7];
204 #else
205 ((FXuchar*)&wc)[3]=src[4];
206 ((FXuchar*)&wc)[2]=src[5];
207 ((FXuchar*)&wc)[1]=src[6];
208 ((FXuchar*)&wc)[0]=src[7];
209 #endif
210 return 8;
211 }
212 if(wc==BOM_LE){
213 if(nsrc<8) return -8;
214 #if FOX_BIGENDIAN
215 ((FXuchar*)&wc)[0]=src[7];
216 ((FXuchar*)&wc)[1]=src[6];
217 ((FXuchar*)&wc)[2]=src[5];
218 ((FXuchar*)&wc)[3]=src[4];
219 #else
220 ((FXuchar*)&wc)[3]=src[7];
221 ((FXuchar*)&wc)[2]=src[6];
222 ((FXuchar*)&wc)[1]=src[5];
223 ((FXuchar*)&wc)[0]=src[4];
224 #endif
225 return 8;
226 }
227 return 4;
228 }
229
230
231 // Number of bytes for wide character
utflen(FXwchar w)232 static inline FXint utflen(FXwchar w){
233 if(w<0x80) return 1;
234 if(w<0x800) return 2;
235 if(w<0x10000) return 3;
236 if(w<0x200000) return 4;
237 if(w<0x4000000) return 5;
238 return 6;
239 }
240
241
242 // Count number of utf8 characters needed to convert multi-byte characters from src
mb2utflen(const FXchar * src,FXint nsrc) const243 FXint FXUTF32Codec::mb2utflen(const FXchar* src,FXint nsrc) const {
244 register FXint len=0;
245 FXwchar w;
246 if(src && 0<nsrc){
247 if(nsrc<4) return -4;
248 #if FOX_BIGENDIAN
249 ((FXuchar*)&w)[0]=src[0];
250 ((FXuchar*)&w)[1]=src[1];
251 ((FXuchar*)&w)[2]=src[2];
252 ((FXuchar*)&w)[3]=src[3];
253 #else
254 ((FXuchar*)&w)[3]=src[0];
255 ((FXuchar*)&w)[2]=src[1];
256 ((FXuchar*)&w)[1]=src[2];
257 ((FXuchar*)&w)[0]=src[3];
258 #endif
259 if(w!=BOM_LE){ // Big-endian (default)
260 if(w==BOM_BE){
261 src+=4;
262 nsrc-=4;
263 }
264 while(0<nsrc){
265 if(nsrc<4) return -4;
266 #if FOX_BIGENDIAN
267 ((FXuchar*)&w)[0]=src[0];
268 ((FXuchar*)&w)[1]=src[1];
269 ((FXuchar*)&w)[2]=src[2];
270 ((FXuchar*)&w)[3]=src[3];
271 #else
272 ((FXuchar*)&w)[3]=src[0];
273 ((FXuchar*)&w)[2]=src[1];
274 ((FXuchar*)&w)[1]=src[2];
275 ((FXuchar*)&w)[0]=src[3];
276 #endif
277 src+=4;
278 nsrc-=4;
279 len+=utflen(w);
280 }
281 }
282 else{ // Little-endian
283 src+=4;
284 nsrc-=4;
285 while(0<nsrc){
286 if(nsrc<4) return -4;
287 #if FOX_BIGENDIAN
288 ((FXuchar*)&w)[0]=src[3];
289 ((FXuchar*)&w)[1]=src[2];
290 ((FXuchar*)&w)[2]=src[1];
291 ((FXuchar*)&w)[3]=src[0];
292 #else
293 ((FXuchar*)&w)[3]=src[3];
294 ((FXuchar*)&w)[2]=src[2];
295 ((FXuchar*)&w)[1]=src[1];
296 ((FXuchar*)&w)[0]=src[0];
297 #endif
298 src+=4;
299 nsrc-=4;
300 len+=utflen(w);
301 }
302 }
303 }
304 return len;
305 }
306
307
308 // Convert multi-byte characters from src to utf8 characters at dst
mb2utf(FXchar * dst,FXint ndst,const FXchar * src,FXint nsrc) const309 FXint FXUTF32Codec::mb2utf(FXchar* dst,FXint ndst,const FXchar* src,FXint nsrc) const {
310 register FXint nw,len=0;
311 FXwchar w;
312 if(dst && src && 0<nsrc){
313 if(nsrc<4) return -4;
314 #if FOX_BIGENDIAN
315 ((FXuchar*)&w)[0]=src[0];
316 ((FXuchar*)&w)[1]=src[1];
317 ((FXuchar*)&w)[2]=src[2];
318 ((FXuchar*)&w)[3]=src[3];
319 #else
320 ((FXuchar*)&w)[3]=src[0];
321 ((FXuchar*)&w)[2]=src[1];
322 ((FXuchar*)&w)[1]=src[2];
323 ((FXuchar*)&w)[0]=src[3];
324 #endif
325 if(w!=BOM_LE){ // Big-endian (default)
326 if(w==BOM_BE){
327 src+=4;
328 nsrc-=4;
329 }
330 while(0<nsrc){
331 if(nsrc<4) return -4;
332 #if FOX_BIGENDIAN
333 ((FXuchar*)&w)[0]=src[0];
334 ((FXuchar*)&w)[1]=src[1];
335 ((FXuchar*)&w)[2]=src[2];
336 ((FXuchar*)&w)[3]=src[3];
337 #else
338 ((FXuchar*)&w)[3]=src[0];
339 ((FXuchar*)&w)[2]=src[1];
340 ((FXuchar*)&w)[1]=src[2];
341 ((FXuchar*)&w)[0]=src[3];
342 #endif
343 src+=4;
344 nsrc-=4;
345 nw=wc2utf(dst,ndst,w);
346 if(nw<=0) return nw;
347 len+=nw;
348 dst+=nw;
349 ndst-=nw;
350 }
351 }
352 else{ // Little-endian
353 src+=4;
354 nsrc-=4;
355 while(0<nsrc){
356 if(nsrc<4) return -4;
357 #if FOX_BIGENDIAN
358 ((FXuchar*)&w)[0]=src[3];
359 ((FXuchar*)&w)[1]=src[2];
360 ((FXuchar*)&w)[2]=src[1];
361 ((FXuchar*)&w)[3]=src[0];
362 #else
363 ((FXuchar*)&w)[3]=src[3];
364 ((FXuchar*)&w)[2]=src[2];
365 ((FXuchar*)&w)[1]=src[1];
366 ((FXuchar*)&w)[0]=src[0];
367 #endif
368 src+=4;
369 nsrc-=4;
370 nw=wc2utf(dst,ndst,w);
371 if(nw<=0) return nw;
372 len+=nw;
373 dst+=nw;
374 ndst-=nw;
375 }
376 }
377 }
378 return len;
379 }
380
381
382 // Convert to utf32
wc2mb(FXchar * dst,FXint ndst,FXwchar wc) const383 FXint FXUTF32Codec::wc2mb(FXchar* dst,FXint ndst,FXwchar wc) const {
384 if(ndst<4) return -4;
385 #if FOX_BIGENDIAN
386 dst[0]=((FXuchar*)&wc)[0];
387 dst[1]=((FXuchar*)&wc)[1];
388 dst[2]=((FXuchar*)&wc)[2];
389 dst[3]=((FXuchar*)&wc)[3];
390 #else
391 dst[0]=((FXuchar*)&wc)[3];
392 dst[1]=((FXuchar*)&wc)[2];
393 dst[2]=((FXuchar*)&wc)[1];
394 dst[3]=((FXuchar*)&wc)[0];
395 #endif
396 return 4;
397 }
398
399
400 // Count multi-byte characters characters needed to convert utf8 from src
utf2mblen(const FXchar * src,FXint nsrc) const401 FXint FXUTF32Codec::utf2mblen(const FXchar* src,FXint nsrc) const {
402 register FXint nr,len=0;
403 FXwchar w;
404 if(src && 0<nsrc){
405 len+=4;
406 while(0<nsrc){
407 nr=utf2wc(w,src,nsrc);
408 if(nr<=0) return nr;
409 src+=nr;
410 nsrc-=nr;
411 len+=4;
412 }
413 }
414 return len;
415 }
416
417
418 // Convert utf8 characters at src to multi-byte characters at dst
utf2mb(FXchar * dst,FXint ndst,const FXchar * src,FXint nsrc) const419 FXint FXUTF32Codec::utf2mb(FXchar* dst,FXint ndst,const FXchar* src,FXint nsrc) const {
420 register FXint nr,len=0;
421 FXwchar w;
422 if(dst && src && 0<nsrc){
423 if(ndst<4) return -4;
424 dst[0]='\0';
425 dst[1]='\0';
426 dst[2]='\xFE';
427 dst[3]='\xFF';
428 dst+=4;
429 len+=4;
430 while(0<nsrc){
431 nr=utf2wc(w,src,nsrc);
432 if(nr<=0) return nr;
433 src+=nr;
434 nsrc-=nr;
435 if(ndst<4) return -4;
436 #if FOX_BIGENDIAN
437 dst[0]=((FXuchar*)&w)[0];
438 dst[1]=((FXuchar*)&w)[1];
439 dst[2]=((FXuchar*)&w)[2];
440 dst[3]=((FXuchar*)&w)[3];
441 #else
442 dst[0]=((FXuchar*)&w)[3];
443 dst[1]=((FXuchar*)&w)[2];
444 dst[2]=((FXuchar*)&w)[1];
445 dst[3]=((FXuchar*)&w)[0];
446 #endif
447 len+=4;
448 dst+=4;
449 ndst-=4;
450 }
451 }
452 return len;
453 }
454
455
456 // Return name
name() const457 const FXchar* FXUTF32Codec::name() const {
458 return "UTF-32";
459 }
460
461
462 // Return the IANA mime name for this codec
mimeName() const463 const FXchar* FXUTF32Codec::mimeName() const {
464 return "UTF-32";
465 }
466
467
468 // Return code for UTF-32
mibEnum() const469 FXint FXUTF32Codec::mibEnum() const {
470 return 1017;
471 }
472
473
474 // Return aliases
aliases() const475 const FXchar* const* FXUTF32Codec::aliases() const {
476 static const FXchar *const list[]={"UTF-32",NULL};
477 return list;
478 }
479
480 }
481
482