1 /********************************************************************************
2 * *
3 * U T F - 3 2 T e x t C o d e c *
4 * *
5 *********************************************************************************
6 * Copyright (C) 2002,2021 by L.Johnson & J.van der Zijp. All Rights Reserved. *
7 *********************************************************************************
8 * This library is free software; you can redistribute it and/or modify *
9 * it under the terms of the GNU Lesser General Public License as published by *
10 * the Free Software Foundation; either version 3 of the License, or *
11 * (at your option) any later version. *
12 * *
13 * This library is distributed in the hope that it will be useful, *
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
16 * GNU Lesser General Public License for more details. *
17 * *
18 * You should have received a copy of the GNU Lesser General Public License *
19 * along with this program. If not, see <http://www.gnu.org/licenses/> *
20 ********************************************************************************/
21 #include "xincs.h"
22 #include "fxver.h"
23 #include "fxdefs.h"
24 #include "fxmath.h"
25 #include "FXArray.h"
26 #include "FXHash.h"
27 #include "FXStream.h"
28 #include "FXString.h"
29 #include "FXTextCodec.h"
30 #include "FXUTF32Codec.h"
31
32
33 /*
34 Notes:
35 */
36
37 /*******************************************************************************/
38
39 namespace FX {
40
41
42 const FXwchar BOM_BE=0x0000FEFF;
43 const FXwchar BOM_LE=0xFFFE0000;
44
45
46 FXIMPLEMENT(FXUTF32BECodec,FXTextCodec,NULL,0)
47
48
49 // Convert from utf32be
mb2wc(FXwchar & wc,const FXchar * src,FXint nsrc) const50 FXint FXUTF32BECodec::mb2wc(FXwchar& wc,const FXchar* src,FXint nsrc) const {
51 if(nsrc<4) return -4;
52 #if (FOX_BIGENDIAN == 1)
53 ((FXuchar*)&wc)[0]=src[0];
54 ((FXuchar*)&wc)[1]=src[1];
55 ((FXuchar*)&wc)[2]=src[2];
56 ((FXuchar*)&wc)[3]=src[3];
57 #else
58 ((FXuchar*)&wc)[3]=src[0];
59 ((FXuchar*)&wc)[2]=src[1];
60 ((FXuchar*)&wc)[1]=src[2];
61 ((FXuchar*)&wc)[0]=src[3];
62 #endif
63 return 4;
64 }
65
66
67 // Convert to utf32be
wc2mb(FXchar * dst,FXint ndst,FXwchar wc) const68 FXint FXUTF32BECodec::wc2mb(FXchar* dst,FXint ndst,FXwchar wc) const {
69 if(ndst<4) return -4;
70 #if (FOX_BIGENDIAN == 1)
71 dst[0]=((FXuchar*)&wc)[0];
72 dst[1]=((FXuchar*)&wc)[1];
73 dst[2]=((FXuchar*)&wc)[2];
74 dst[3]=((FXuchar*)&wc)[3];
75 #else
76 dst[0]=((FXuchar*)&wc)[3];
77 dst[1]=((FXuchar*)&wc)[2];
78 dst[2]=((FXuchar*)&wc)[1];
79 dst[3]=((FXuchar*)&wc)[0];
80 #endif
81 return 4;
82 }
83
84
85 // Return name
name() const86 const FXchar* FXUTF32BECodec::name() const {
87 return "UTF-32BE";
88 }
89
90
91 // Return the IANA mime name for this codec
mimeName() const92 const FXchar* FXUTF32BECodec::mimeName() const {
93 return "UTF-32BE";
94 }
95
96
97 // Return code for UTF-32
mibEnum() const98 FXint FXUTF32BECodec::mibEnum() const {
99 return 1018;
100 }
101
102
103 // Return aliases
aliases() const104 const FXchar* const* FXUTF32BECodec::aliases() const {
105 static const FXchar *const list[]={"UTF-32BE",NULL};
106 return list;
107 }
108
109
110 /*******************************************************************************/
111
112 FXIMPLEMENT(FXUTF32LECodec,FXTextCodec,NULL,0)
113
114
115 // Convert from utf32le
mb2wc(FXwchar & wc,const FXchar * src,FXint nsrc) const116 FXint FXUTF32LECodec::mb2wc(FXwchar& wc,const FXchar* src,FXint nsrc) const {
117 if(nsrc<4) return -4;
118 #if (FOX_BIGENDIAN == 1)
119 ((FXuchar*)&wc)[0]=src[3];
120 ((FXuchar*)&wc)[1]=src[2];
121 ((FXuchar*)&wc)[2]=src[1];
122 ((FXuchar*)&wc)[3]=src[0];
123 #else
124 ((FXuchar*)&wc)[3]=src[3];
125 ((FXuchar*)&wc)[2]=src[2];
126 ((FXuchar*)&wc)[1]=src[1];
127 ((FXuchar*)&wc)[0]=src[0];
128 #endif
129 return 4;
130 }
131
132
133 // Convert to utf32le
wc2mb(FXchar * dst,FXint ndst,FXwchar wc) const134 FXint FXUTF32LECodec::wc2mb(FXchar* dst,FXint ndst,FXwchar wc) const {
135 if(ndst<4) return -4;
136 #if (FOX_BIGENDIAN == 1)
137 dst[3]=((FXuchar*)&wc)[0];
138 dst[2]=((FXuchar*)&wc)[1];
139 dst[1]=((FXuchar*)&wc)[2];
140 dst[0]=((FXuchar*)&wc)[3];
141 #else
142 dst[3]=((FXuchar*)&wc)[3];
143 dst[2]=((FXuchar*)&wc)[2];
144 dst[1]=((FXuchar*)&wc)[1];
145 dst[0]=((FXuchar*)&wc)[0];
146 #endif
147 return 4;
148 }
149
150
151 // Return name
name() const152 const FXchar* FXUTF32LECodec::name() const {
153 return "UTF-32LE";
154 }
155
156
157 // Return the IANA mime name for this codec
mimeName() const158 const FXchar* FXUTF32LECodec::mimeName() const {
159 return "UTF-32LE";
160 }
161
162
163 // Return code for UTF-32
mibEnum() const164 FXint FXUTF32LECodec::mibEnum() const {
165 return 1019;
166 }
167
168
169 // Return aliases
aliases() const170 const FXchar* const* FXUTF32LECodec::aliases() const {
171 static const FXchar *const list[]={"UTF-32LE",NULL};
172 return list;
173 }
174
175
176 /*******************************************************************************/
177
178 FXIMPLEMENT(FXUTF32Codec,FXTextCodec,NULL,0)
179
180
181 // Convert utf32
mb2wc(FXwchar & wc,const FXchar * src,FXint nsrc) const182 FXint FXUTF32Codec::mb2wc(FXwchar& wc,const FXchar* src,FXint nsrc) const {
183 if(nsrc<4) return -4;
184 #if (FOX_BIGENDIAN == 1)
185 ((FXuchar*)&wc)[0]=src[0];
186 ((FXuchar*)&wc)[1]=src[1];
187 ((FXuchar*)&wc)[2]=src[2];
188 ((FXuchar*)&wc)[3]=src[3];
189 #else
190 ((FXuchar*)&wc)[3]=src[0];
191 ((FXuchar*)&wc)[2]=src[1];
192 ((FXuchar*)&wc)[1]=src[2];
193 ((FXuchar*)&wc)[0]=src[3];
194 #endif
195 if(wc==BOM_BE){
196 if(nsrc<8) return -8;
197 #if (FOX_BIGENDIAN == 1)
198 ((FXuchar*)&wc)[0]=src[4];
199 ((FXuchar*)&wc)[1]=src[5];
200 ((FXuchar*)&wc)[2]=src[6];
201 ((FXuchar*)&wc)[3]=src[7];
202 #else
203 ((FXuchar*)&wc)[3]=src[4];
204 ((FXuchar*)&wc)[2]=src[5];
205 ((FXuchar*)&wc)[1]=src[6];
206 ((FXuchar*)&wc)[0]=src[7];
207 #endif
208 return 8;
209 }
210 if(wc==BOM_LE){
211 if(nsrc<8) return -8;
212 #if (FOX_BIGENDIAN == 1)
213 ((FXuchar*)&wc)[0]=src[7];
214 ((FXuchar*)&wc)[1]=src[6];
215 ((FXuchar*)&wc)[2]=src[5];
216 ((FXuchar*)&wc)[3]=src[4];
217 #else
218 ((FXuchar*)&wc)[3]=src[7];
219 ((FXuchar*)&wc)[2]=src[6];
220 ((FXuchar*)&wc)[1]=src[5];
221 ((FXuchar*)&wc)[0]=src[4];
222 #endif
223 return 8;
224 }
225 return 4;
226 }
227
228
229 // Number of bytes for wide character
utflen(FXwchar w)230 static inline FXint utflen(FXwchar w){
231 if(w<0x80) return 1;
232 if(w<0x800) return 2;
233 if(w<0x10000) return 3;
234 if(w<0x200000) return 4;
235 if(w<0x4000000) return 5;
236 return 6;
237 }
238
239
240 // Count number of utf8 characters needed to convert multi-byte characters from src
mb2utflen(const FXchar * src,FXint nsrc) const241 FXint FXUTF32Codec::mb2utflen(const FXchar* src,FXint nsrc) const {
242 FXint len=0;
243 FXwchar w;
244 if(src && 0<nsrc){
245 if(nsrc<4) return -4;
246 #if (FOX_BIGENDIAN == 1)
247 ((FXuchar*)&w)[0]=src[0];
248 ((FXuchar*)&w)[1]=src[1];
249 ((FXuchar*)&w)[2]=src[2];
250 ((FXuchar*)&w)[3]=src[3];
251 #else
252 ((FXuchar*)&w)[3]=src[0];
253 ((FXuchar*)&w)[2]=src[1];
254 ((FXuchar*)&w)[1]=src[2];
255 ((FXuchar*)&w)[0]=src[3];
256 #endif
257 if(w!=BOM_LE){ // Big-endian (default)
258 if(w==BOM_BE){
259 src+=4;
260 nsrc-=4;
261 }
262 while(0<nsrc){
263 if(nsrc<4) return -4;
264 #if (FOX_BIGENDIAN == 1)
265 ((FXuchar*)&w)[0]=src[0];
266 ((FXuchar*)&w)[1]=src[1];
267 ((FXuchar*)&w)[2]=src[2];
268 ((FXuchar*)&w)[3]=src[3];
269 #else
270 ((FXuchar*)&w)[3]=src[0];
271 ((FXuchar*)&w)[2]=src[1];
272 ((FXuchar*)&w)[1]=src[2];
273 ((FXuchar*)&w)[0]=src[3];
274 #endif
275 src+=4;
276 nsrc-=4;
277 len+=utflen(w);
278 }
279 }
280 else{ // Little-endian
281 src+=4;
282 nsrc-=4;
283 while(0<nsrc){
284 if(nsrc<4) return -4;
285 #if (FOX_BIGENDIAN == 1)
286 ((FXuchar*)&w)[0]=src[3];
287 ((FXuchar*)&w)[1]=src[2];
288 ((FXuchar*)&w)[2]=src[1];
289 ((FXuchar*)&w)[3]=src[0];
290 #else
291 ((FXuchar*)&w)[3]=src[3];
292 ((FXuchar*)&w)[2]=src[2];
293 ((FXuchar*)&w)[1]=src[1];
294 ((FXuchar*)&w)[0]=src[0];
295 #endif
296 src+=4;
297 nsrc-=4;
298 len+=utflen(w);
299 }
300 }
301 }
302 return len;
303 }
304
305
306 // Convert multi-byte characters from src to utf8 characters at dst
mb2utf(FXchar * dst,FXint ndst,const FXchar * src,FXint nsrc) const307 FXint FXUTF32Codec::mb2utf(FXchar* dst,FXint ndst,const FXchar* src,FXint nsrc) const {
308 FXint nw,len=0;
309 FXwchar w;
310 if(dst && src && 0<nsrc){
311 if(nsrc<4) return -4;
312 #if (FOX_BIGENDIAN == 1)
313 ((FXuchar*)&w)[0]=src[0];
314 ((FXuchar*)&w)[1]=src[1];
315 ((FXuchar*)&w)[2]=src[2];
316 ((FXuchar*)&w)[3]=src[3];
317 #else
318 ((FXuchar*)&w)[3]=src[0];
319 ((FXuchar*)&w)[2]=src[1];
320 ((FXuchar*)&w)[1]=src[2];
321 ((FXuchar*)&w)[0]=src[3];
322 #endif
323 if(w!=BOM_LE){ // Big-endian (default)
324 if(w==BOM_BE){
325 src+=4;
326 nsrc-=4;
327 }
328 while(0<nsrc){
329 if(nsrc<4) return -4;
330 #if (FOX_BIGENDIAN == 1)
331 ((FXuchar*)&w)[0]=src[0];
332 ((FXuchar*)&w)[1]=src[1];
333 ((FXuchar*)&w)[2]=src[2];
334 ((FXuchar*)&w)[3]=src[3];
335 #else
336 ((FXuchar*)&w)[3]=src[0];
337 ((FXuchar*)&w)[2]=src[1];
338 ((FXuchar*)&w)[1]=src[2];
339 ((FXuchar*)&w)[0]=src[3];
340 #endif
341 if(FX::wc2utf(w)>ndst) break;
342 nw=FX::wc2utf(dst,w);
343 src+=4;
344 nsrc-=4;
345 len+=nw;
346 dst+=nw;
347 ndst-=nw;
348 }
349 }
350 else{ // Little-endian
351 src+=4;
352 nsrc-=4;
353 while(0<nsrc){
354 if(nsrc<4) return -4;
355 #if (FOX_BIGENDIAN == 1)
356 ((FXuchar*)&w)[0]=src[3];
357 ((FXuchar*)&w)[1]=src[2];
358 ((FXuchar*)&w)[2]=src[1];
359 ((FXuchar*)&w)[3]=src[0];
360 #else
361 ((FXuchar*)&w)[3]=src[3];
362 ((FXuchar*)&w)[2]=src[2];
363 ((FXuchar*)&w)[1]=src[1];
364 ((FXuchar*)&w)[0]=src[0];
365 #endif
366 if(FX::wc2utf(w)>ndst) break;
367 nw=FX::wc2utf(dst,w);
368 src+=4;
369 nsrc-=4;
370 len+=nw;
371 dst+=nw;
372 ndst-=nw;
373 }
374 }
375 }
376 return len;
377 }
378
379
380 // Convert to utf32
wc2mb(FXchar * dst,FXint ndst,FXwchar wc) const381 FXint FXUTF32Codec::wc2mb(FXchar* dst,FXint ndst,FXwchar wc) const {
382 if(ndst<4) return 0;
383 #if (FOX_BIGENDIAN == 1)
384 dst[0]=((FXuchar*)&wc)[0];
385 dst[1]=((FXuchar*)&wc)[1];
386 dst[2]=((FXuchar*)&wc)[2];
387 dst[3]=((FXuchar*)&wc)[3];
388 #else
389 dst[0]=((FXuchar*)&wc)[3];
390 dst[1]=((FXuchar*)&wc)[2];
391 dst[2]=((FXuchar*)&wc)[1];
392 dst[3]=((FXuchar*)&wc)[0];
393 #endif
394 return 4;
395 }
396
397
398 // Count multi-byte characters characters needed to convert utf8 from src
utf2mblen(const FXchar * src,FXint nsrc) const399 FXint FXUTF32Codec::utf2mblen(const FXchar* src,FXint nsrc) const {
400 FXint nr,len=0;
401 if(src && 0<nsrc){
402 len+=4;
403 while(0<nsrc){
404 nr=FX::wclen(src);
405 if(nr>nsrc) break;
406 src+=nr;
407 nsrc-=nr;
408 len+=4;
409 }
410 }
411 return len;
412 }
413
414
415 // Convert utf8 characters at src to multi-byte characters at dst
utf2mb(FXchar * dst,FXint ndst,const FXchar * src,FXint nsrc) const416 FXint FXUTF32Codec::utf2mb(FXchar* dst,FXint ndst,const FXchar* src,FXint nsrc) const {
417 FXint nr,len=0;
418 FXwchar w;
419 if(dst && src && 0<nsrc){
420 if(ndst<4) return 0;
421 dst[0]='\0';
422 dst[1]='\0';
423 dst[2]='\xFE';
424 dst[3]='\xFF';
425 dst+=4;
426 len+=4;
427 while(0<nsrc){
428 nr=FX::wclen(src);
429 if(nr>nsrc) break;
430 w=wc(src);
431 if(ndst<4) break;
432 #if (FOX_BIGENDIAN == 1)
433 dst[0]=((FXuchar*)&w)[0];
434 dst[1]=((FXuchar*)&w)[1];
435 dst[2]=((FXuchar*)&w)[2];
436 dst[3]=((FXuchar*)&w)[3];
437 #else
438 dst[0]=((FXuchar*)&w)[3];
439 dst[1]=((FXuchar*)&w)[2];
440 dst[2]=((FXuchar*)&w)[1];
441 dst[3]=((FXuchar*)&w)[0];
442 #endif
443 src+=nr;
444 nsrc-=nr;
445 len+=4;
446 dst+=4;
447 ndst-=4;
448 }
449 }
450 return len;
451 }
452
453
454 // Return name
name() const455 const FXchar* FXUTF32Codec::name() const {
456 return "UTF-32";
457 }
458
459
460 // Return the IANA mime name for this codec
mimeName() const461 const FXchar* FXUTF32Codec::mimeName() const {
462 return "UTF-32";
463 }
464
465
466 // Return code for UTF-32
mibEnum() const467 FXint FXUTF32Codec::mibEnum() const {
468 return 1017;
469 }
470
471
472 // Return aliases
aliases() const473 const FXchar* const* FXUTF32Codec::aliases() const {
474 static const FXchar *const list[]={"UTF-32",NULL};
475 return list;
476 }
477
478 }
479
480