1 /* Copyright (C) 2000-2012 by George Williams */
2 /*
3 * Redistribution and use in source and binary forms, with or without
4 * modification, are permitted provided that the following conditions are met:
5
6 * Redistributions of source code must retain the above copyright notice, this
7 * list of conditions and the following disclaimer.
8
9 * Redistributions in binary form must reproduce the above copyright notice,
10 * this list of conditions and the following disclaimer in the documentation
11 * and/or other materials provided with the distribution.
12
13 * The name of the author may not be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
17 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
18 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
19 * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
20 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
22 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
24 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
25 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 #include <config.h> /* FF config file */
29 #include <gwwiconv.h>
30 #include <stddef.h>
31 #include <ustring.h>
32 #include <utype.h>
33 #include <charset.h>
34 #include <chardata.h>
35
/* Encoding assumed for "default" (locale) strings by the def2u_*/u2def_*
 * routines whenever the iconv path is unavailable or not configured. */
enum encoding local_encoding = e_iso8859_1;
#if HAVE_ICONV_H
/* Name handed to iconv_open() for the local encoding. While this is NULL
 * my_iconv_setup() reports failure and the table driven converters are used. */
char *iconv_local_encoding_name = NULL;
#endif

/* Set once the "Unexpected encoding" warning has been printed so that the
 * message appears at most one time. */
static int bad_enc_warn = false;
42
43 /* Does not handle conversions to Extended unix */
44
/*
 * Convert the NUL-terminated string _from, encoded as cs, into unichar_t
 * values stored in uto.
 *
 *   uto    destination buffer with room for at least n unichar_t values
 *   _from  source string (read as an array of unichar_t for e_unicode and
 *          e_unicode_backwards)
 *   n      maximum number of unichar_t values to store
 *   cs     encoding of _from
 *
 * Returns uto. As with strncpy, the result is NUL terminated only when
 * fewer than n values were produced. An encoding this routine does not know
 * is reported once on stderr and then treated as ISO 8859-1.
 *
 * NOTE(review): the multi-byte table lookups are not range checked; they
 * rely on the input being well formed for the given encoding.
 */
unichar_t *encoding2u_strncpy(unichar_t *uto, const char *_from, int n, enum encoding cs) {
    unichar_t *upt=uto;
    const unichar_t *table;
    int offset;
    const unsigned char *from = (const unsigned char *) _from;

    if ( cs<e_first2byte ) {
        /* Single byte encodings: either a straight copy or one table lookup */
        table = unicode_from_alphabets[cs];
        if ( table==NULL ) {
            while ( *from && n>0 ) {
                *upt++ = *from++;
                --n;
            }
        } else {
            while ( *from && n>0 ) {
                *upt++ = table[*from++];
                --n;
            }
        }
    } else if ( cs<e_unicode ) {
        /* Only touch the buffer when the caller gave us room */
        if ( n>0 )
            *uto = '\0';
        switch ( cs ) {
          default:
            if ( !bad_enc_warn ) {
                bad_enc_warn = true;
                fprintf( stderr, "Unexpected encoding %d, I'll pretend it's latin1\n", cs );
            }
            return( encoding2u_strncpy(uto,_from,n,e_iso8859_1));
          case e_johab: case e_big5: case e_big5hkscs:
            if ( cs==e_big5 ) {
                offset = 0xa100;
                table = unicode_from_big5;
            } else if ( cs==e_big5hkscs ) {
                offset = 0x8100;
                table = unicode_from_big5hkscs;
            } else {
                offset = 0x8400;
                table = unicode_from_johab;
            }
            while ( *from && n>0 ) {
                /* A lead byte needs a trail byte; otherwise copy it as is */
                if ( *from>=(offset>>8) && from[1]!='\0' ) {
                    *upt++ = table[ ((*from<<8) | from[1]) - offset ];
                    from += 2;
                } else
                    *upt++ = *from++;
                --n;
            }
            break;
          case e_wansung:
            while ( *from && n>0 ) {
                if ( *from>=0xa1 && from[1]>=0xa1 ) {
                    *upt++ = unicode_from_ksc5601[ (*from-0xa1)*94+(from[1]-0xa1) ];
                    from += 2;
                } else
                    *upt++ = *from++;
                --n;
            }
            break;
          case e_jisgb:
            while ( *from && n>0 ) {
                if ( *from>=0xa1 && from[1]>=0xa1 ) {
                    *upt++ = unicode_from_gb2312[ (*from-0xa1)*94+(from[1]-0xa1) ];
                    from += 2;
                } else
                    *upt++ = *from++;
                --n;
            }
            break;
          case e_sjis:
            while ( *from && n>0 ) {
                if ( *from<127 || ( *from>=161 && *from<=223 )) {
                    *upt++ = unicode_from_jis201[*from++];
                } else {
                    int ch1 = *from++;
                    int ch2;
                    /* A lead byte at the very end of the string has no */
                    /*  trail byte: stop instead of reading past the NUL */
                    if ( *from=='\0' )
                        break;
                    ch2 = *from++;
                    if ( ch1 >= 129 && ch1<= 159 )
                        ch1 -= 112;
                    else
                        ch1 -= 176;
                    ch1 <<= 1;
                    if ( ch2>=159 )
                        ch2-= 126;
                    else if ( ch2>127 ) {
                        --ch1;
                        ch2 -= 32;
                    } else {
                        --ch1;
                        ch2 -= 31;
                    }
                    *upt++ = unicode_from_jis208[(ch1-0x21)*94+(ch2-0x21)];
                }
                --n;
            }
            break;
        }
    } else if ( cs==e_unicode ) {
        unichar_t *ufrom = (unichar_t *) from;
        while ( *ufrom && n>0 ) {
            *upt++ = *ufrom++;
            --n;
        }
    } else if ( cs==e_unicode_backwards ) {
        unichar_t *ufrom = (unichar_t *) from;
        while ( *ufrom && n>0 ) {
            /* Reverse the byte order of one unichar_t, whatever its width */
            unichar_t in = *ufrom, ch = 0;
            size_t i;
            for ( i=0; i<sizeof(unichar_t); ++i ) {
                ch = (ch<<8) | (in&0xff);
                in >>= 8;
            }
            *upt++ = ch;
            ++ufrom;
            --n;
        }
    } else if ( cs==e_utf8 ) {
        while ( *from && n>0 ) {
            if ( *from<=127 )
                *upt = *from++;
            else if ( *from<=0xdf ) {
                if ( from[1]>=0x80 ) {
                    *upt = ((*from&0x1f)<<6) | (from[1]&0x3f);
                    from += 2;
                } else {
                    ++from;			/* Badly formed utf */
                    *upt = 0xfffd;
                }
            } else if ( *from<=0xef ) {
                if ( from[1]>=0x80 && from[2]>=0x80 ) {
                    *upt = ((*from&0xf)<<12) | ((from[1]&0x3f)<<6) | (from[2]&0x3f);
                    from += 3;
                } else {
                    ++from;			/* Badly formed utf */
                    *upt = 0xfffd;
                }
            } else if ( from[1]>=0x80 && from[2]>=0x80 && from[3]>=0x80 ) {
                /* Four byte sequence, stored as a surrogate pair which */
                /*  takes two output slots */
                int w;
                if ( n<2 )
                    break;			/* no space for surrogate */
                w = ( ((*from&0x7)<<2) | ((from[1]&0x30)>>4) )-1;
                *upt++ = 0xd800 | (w<<6) | ((from[1]&0xf)<<2) | ((from[2]&0x30)>>4);
                --n;
                *upt = 0xdc00 | ((from[2]&0xf)<<6) | (from[3]&0x3f);
                from += 4;
            } else {
                ++from;				/* Badly formed utf */
                *upt = 0xfffd;
            }
            ++upt;
            --n;
        }
    } else {
        if ( !bad_enc_warn ) {
            bad_enc_warn = true;
            fprintf( stderr, "Unexpected encoding %d, I'll pretend it's latin1\n", cs );
        }
        return( encoding2u_strncpy(uto,_from,n,e_iso8859_1));
    }

    if ( n>0 )
        *upt = '\0';

    return( uto );
}
203
u2encoding_strncpy(char * to,const unichar_t * ufrom,int n,enum encoding cs)204 char *u2encoding_strncpy(char *to, const unichar_t *ufrom, int n, enum encoding cs) {
205 char *pt = to;
206
207 /* we just ignore anything that doesn't fit in the encoding we look at */
208 if ( cs<e_first2byte ) {
209 struct charmap *table = NULL;
210 unsigned char *plane;
211 table = alphabets_from_unicode[cs];
212 if ( table==NULL ) { /* ASCII */
213 while ( *ufrom && n>0 ) {
214 int ch = *ufrom;
215 if ( ch<127 ) {
216 *pt++ = ch;
217 --n;
218 }
219 ++ufrom;
220 }
221 } else {
222 while ( *ufrom && n>0 ) {
223 int highch = *ufrom>>8, ch;
224 if ( highch>=table->first && highch<=table->last &&
225 (plane = table->table[highch])!=NULL &&
226 (ch=plane[*ufrom&0xff])!=0 ) {
227 *pt++ = ch;
228 --n;
229 }
230 ++ufrom;
231 }
232 }
233 if ( n>0 )
234 *pt = '\0';
235 } else if ( cs<e_unicode ) {
236 struct charmap2 *table;
237 unsigned short *plane;
238 unsigned char *plane1;
239
240 *to = '\0';
241 switch ( cs ) {
242 default:
243 if ( !bad_enc_warn ) {
244 bad_enc_warn = true;
245 fprintf( stderr, "Unexpected encoding %d, I'll pretend it's latin1\n", cs );
246 }
247 return( u2encoding_strncpy(to,ufrom,n,e_iso8859_1));
248 }
249 if ( n>0 )
250 *pt = '\0';
251 } else if ( cs==e_unicode ) {
252 unichar_t *uto = (unichar_t *) to;
253 while ( *ufrom && n>1 ) {
254 *uto++ = *ufrom++;
255 n-=sizeof(unichar_t);
256 }
257 if ( n>1 )
258 *uto = '\0';
259 } else if ( cs==e_unicode_backwards ) {
260 unichar_t *uto = (unichar_t *) to;
261 while ( *ufrom && n>sizeof(unichar_t)-1 ) {
262 unichar_t ch = (*ufrom>>24)|((*ufrom>>8)&0xff00)|
263 ((*ufrom<<8)&0xff0000)|(*ufrom<<24);
264 *uto++ = ch;
265 ++ufrom;
266 n-=sizeof(unichar_t);
267 }
268 if ( n>1 )
269 *uto = '\0';
270 } else if ( cs==e_utf8 ) {
271 while ( *ufrom ) {
272 if ( *ufrom<0x80 ) {
273 if ( n<=1 )
274 break;
275 *pt++ = *ufrom;
276 --n;
277 } else if ( *ufrom<0x800 ) {
278 if ( n<=2 )
279 break;
280 *pt++ = 0xc0 | (*ufrom>>6);
281 *pt++ = 0x80 | (*ufrom&0x3f);
282 n -= 2;
283 } else if ( *ufrom>=0xd800 && *ufrom<0xdc00 && ufrom[1]>=0xdc00 && ufrom[1]<0xe000 ) {
284 int u = ((*ufrom>>6)&0xf)+1, y = ((*ufrom&3)<<4) | ((ufrom[1]>>6)&0xf);
285 if ( n<=4 )
286 break;
287 *pt++ = 0xf0 | (u>>2);
288 *pt++ = 0x80 | ((u&3)<<4) | ((*ufrom>>2)&0xf);
289 *pt++ = 0x80 | y;
290 *pt++ = 0x80 | (ufrom[1]&0x3f);
291 n -= 4;
292 } else {
293 if ( n<=3 )
294 break;
295 *pt++ = 0xe0 | (*ufrom>>12);
296 *pt++ = 0x80 | ((*ufrom>>6)&0x3f);
297 *pt++ = 0x80 | (*ufrom&0x3f);
298 }
299 ++ufrom;
300 }
301 if ( n>1 )
302 *pt = '\0';
303 } else {
304 if ( !bad_enc_warn ) {
305 bad_enc_warn = true;
306 fprintf( stderr, "Unexpected encoding %d, I'll pretend it's latin1\n", cs );
307 }
308 return( u2encoding_strncpy(to,ufrom,n,e_iso8859_1));
309 }
310
311 return( to );
312 }
313
314 #if HAVE_ICONV_H
/* Copy of the iconv_local_encoding_name the converters below were last set */
/*  up for; my_iconv_setup() compares against it to detect a change */
static char *old_local_name=NULL;
/* Converters between the local encoding and unichar_t strings; */
/*  (iconv_t) -1 means "not open" */
static iconv_t to_unicode=(iconv_t) (-1), from_unicode=(iconv_t) (-1);
/* Converters between the local encoding and UTF-8 */
static iconv_t to_utf8=(iconv_t) (-1), from_utf8=(iconv_t) (-1);
/* Candidate iconv names for 4 byte Unicode. The endian specific lists are */
/*  tried first; names[] is the fallback and its byte order gets verified */
static char *names[] = { "UCS-4-INTERNAL", "UCS-4", "UCS4", "ISO-10646-UCS-4", "UTF-32", NULL };
static char *namesle[] = { "UCS-4LE", "UTF-32LE", NULL };
static char *namesbe[] = { "UCS-4BE", "UTF-32BE", NULL };
/* The first candidate name iconv_open() accepted, or NULL if none yet */
static char *unicode_name = NULL;
/* Set when the accepted generic name did not produce native byte order */
static int byteswapped = false;
323
BytesNormal(iconv_t latin1_2_unicode)324 static int BytesNormal(iconv_t latin1_2_unicode) {
325 union {
326 int32 s;
327 char c[4];
328 } u[8];
329 char *from = "A", *to = &u[0].c[0];
330 size_t in_left = 1, out_left = sizeof(u);
331 memset(u,0,sizeof(u));
332 iconv( latin1_2_unicode, (iconv_arg2_t) &from, &in_left, &to, &out_left);
333 if ( u[0].s=='A' )
334 return( true );
335
336 return( false );
337 }
338
my_iconv_setup(void)339 static int my_iconv_setup(void) {
340 char **testnames;
341 int i;
342 union {
343 short s;
344 char c[2];
345 } u;
346 iconv_t test;
347
348 if ( iconv_local_encoding_name==NULL ) {
349 if ( to_unicode!=(iconv_t) (-1) ) {
350 iconv_close(to_unicode);
351 iconv_close(from_unicode);
352 to_unicode = from_unicode = (iconv_t) (-1);
353 }
354 return(false);
355 }
356 if ( old_local_name!=NULL && strcmp(old_local_name,iconv_local_encoding_name)==0 )
357 return( to_unicode!=(iconv_t) (-1) );
358
359 free(old_local_name);
360 old_local_name = xstrdup(iconv_local_encoding_name);
361 to_utf8 = iconv_open("UTF-8",iconv_local_encoding_name);
362 from_utf8 = iconv_open(iconv_local_encoding_name,"UTF-8");
363
364 if ( unicode_name==NULL ) {
365 u.c[0] = 0x1; u.c[1] = 0x2;
366 if ( u.s==0x201 ) { /* Little endian */
367 testnames = namesle;
368 } else {
369 testnames = namesbe;
370 }
371 for ( i=0; testnames[i]!=NULL; ++i ) {
372 test = iconv_open(testnames[i],"ISO-8859-1");
373 if ( test!=(iconv_t) -1 && test!=NULL ) {
374 iconv_close(test);
375 unicode_name = testnames[i];
376 break;
377 }
378 }
379 if ( unicode_name==NULL ) {
380 for ( i=0; names[i]!=NULL; ++i ) {
381 test = iconv_open(names[i],"ISO-8859-1");
382 if ( test!=(iconv_t) -1 && test!=NULL ) {
383 byteswapped = !BytesNormal(test);
384 iconv_close(test);
385 unicode_name = names[i];
386 break;
387 }
388 }
389 }
390 }
391 if ( unicode_name == NULL ) {
392 fprintf( stderr, "Could not find a name for Unicode which iconv could understand.\n" );
393 return( false );
394 } else if ( byteswapped ) {
395 fprintf( stderr, "The only name for Unicode that iconv understood produced unexpected results.\nPerhaps %s was byte swapped.\n", unicode_name );
396 return( false );
397 }
398
399 to_unicode = iconv_open(unicode_name,iconv_local_encoding_name);
400 from_unicode = iconv_open(iconv_local_encoding_name,unicode_name);
401 if ( to_unicode == (iconv_t) (-1) || to_utf8 == (iconv_t) (-1) ) {
402 fprintf( stderr, "iconv failed to understand encoding %s\n",
403 iconv_local_encoding_name);
404 return( false );
405 }
406 return( true );
407 }
408 #endif
409
/*
 * Convert the NUL-terminated string from, in the local (default) encoding,
 * into at most n unichar_t values stored in uto. Returns uto.
 *
 * iconv is used when it has been set up for the local encoding; otherwise
 * the conversion is done by encoding2u_strncpy() with local_encoding.
 * On the iconv path the result is terminated only if a whole unichar_t of
 * space remains after the converted text.
 */
unichar_t *def2u_strncpy(unichar_t *uto, const char *from, int n) {
#if HAVE_ICONV_H
    /* A non-positive n must not be turned into a huge size_t */
    if ( n>0 && my_iconv_setup() ) {
        /* Hand iconv only the string itself, never bytes beyond its NUL */
        size_t in_left = strlen(from), out_left = sizeof(unichar_t)*(size_t) n;
        char *cto = (char *) uto;
        iconv(to_unicode, (iconv_arg2_t) &from, &in_left, &cto, &out_left);
        /* Write a complete terminator or none at all */
        if ( out_left>=sizeof(unichar_t) )
            memset(cto,0,sizeof(unichar_t));
        return( uto );
    }
#endif
    return( encoding2u_strncpy(uto,from,n,local_encoding));
}
425
u2def_strncpy(char * to,const unichar_t * ufrom,int n)426 char *u2def_strncpy(char *to, const unichar_t *ufrom, int n) {
427 #if HAVE_ICONV_H
428 if ( my_iconv_setup() ) {
429 size_t in_left = sizeof(unichar_t)*n, out_left = n;
430 char *cfrom = (char *) ufrom, *cto=to;
431 iconv(from_unicode, (iconv_arg2_t) &cfrom, &in_left, &cto, &out_left);
432 if ( cto<to+n ) *cto++ = '\0';
433 if ( cto<to+n ) *cto++ = '\0';
434 if ( cto<to+n ) *cto++ = '\0';
435 if ( cto<to+n ) *cto++ = '\0';
436 return( to );
437 }
438 #endif
439 return( u2encoding_strncpy(to,ufrom,n,local_encoding));
440 }
441
def2u_copy(const char * from)442 unichar_t *def2u_copy(const char *from) {
443 int len;
444 unichar_t *uto, *ret;
445
446 if ( from==NULL ) return( NULL );
447 len = strlen(from);
448 uto = (unichar_t *) malloc((len+1)*sizeof(unichar_t));
449 if ( uto==NULL ) return( NULL );
450 #if HAVE_ICONV_H
451 if ( my_iconv_setup() ) {
452 size_t in_left = len, out_left = sizeof(unichar_t)*len;
453 char *cto = (char *) uto;
454 iconv(to_unicode, (iconv_arg2_t) &from, &in_left, &cto, &out_left);
455 *cto++ = '\0';
456 *cto++ = '\0';
457 *cto++ = '\0';
458 *cto++ = '\0';
459 return( uto );
460 }
461 #endif
462 ret = encoding2u_strncpy(uto,from,len,local_encoding);
463 if ( ret==NULL )
464 free( uto );
465 else
466 uto[len] = '\0';
467 return( ret );
468 }
469
u2def_copy(const unichar_t * ufrom)470 char *u2def_copy(const unichar_t *ufrom) {
471 int len;
472 char *to, *ret;
473
474 if ( ufrom==NULL ) return( NULL );
475 len = u_strlen(ufrom);
476 #if HAVE_ICONV_H
477 if ( my_iconv_setup() ) {
478 size_t in_left = sizeof(unichar_t)*len, out_left = 3*len;
479 char *cfrom = (char *) ufrom, *cto;
480 cto = to = (char *) malloc(3*len+2);
481 if ( cto==NULL ) return( NULL );
482 iconv(from_unicode, (iconv_arg2_t) &cfrom, &in_left, &cto, &out_left);
483 *cto++ = '\0';
484 *cto++ = '\0';
485 *cto++ = '\0';
486 *cto++ = '\0';
487 return( to );
488 }
489 #endif
490 if ( local_encoding==e_utf8 )
491 len *= 3;
492 if ( local_encoding>=e_first2byte )
493 len *= 2;
494 to = (char *) malloc(len+sizeof(unichar_t));
495 if ( to==NULL ) return( NULL );
496 ret = u2encoding_strncpy(to,ufrom,len,local_encoding);
497 if ( ret==NULL )
498 free( to );
499 else if ( local_encoding<e_first2byte )
500 to[len] = '\0';
501 else {
502 to[len] = '\0';
503 to[len+1] = '\0';
504 }
505 return( ret );
506 }
507
def2utf8_copy(const char * from)508 char *def2utf8_copy(const char *from) {
509 int len;
510 char *ret;
511 unichar_t *temp, *uto;
512
513 if ( from==NULL ) return( NULL );
514 len = strlen(from);
515 #if HAVE_ICONV_H
516 if ( my_iconv_setup() ) {
517 size_t in_left = len, out_left = 3*(len+1);
518 char *cto = (char *) malloc(3*(len+1)), *cret = cto;
519 if ( cto==NULL ) return( NULL );
520 iconv(to_utf8, (iconv_arg2_t) &from, &in_left, &cto, &out_left);
521 *cto++ = '\0';
522 *cto++ = '\0';
523 *cto++ = '\0';
524 *cto++ = '\0';
525 return( cret );
526 }
527 #endif
528 uto = (unichar_t *) malloc(sizeof(unichar_t)*(len+1));
529 if ( uto==NULL ) return( NULL );
530 temp = encoding2u_strncpy(uto,from,len,local_encoding);
531 if ( temp==NULL ) {
532 free( uto );
533 return( NULL );
534 }
535 uto[len] = '\0';
536 ret = u2utf8_copy(uto);
537 free(uto);
538 return( ret );
539 }
540
utf82def_copy(const char * ufrom)541 char *utf82def_copy(const char *ufrom) {
542 int len;
543 char *ret;
544 unichar_t *u2from;
545
546 if ( ufrom==NULL ) return( NULL );
547 len = strlen(ufrom);
548 #if HAVE_ICONV_H
549 if ( my_iconv_setup() ) {
550 size_t in_left = len, out_left = 3*len;
551 char *cfrom = (char *) ufrom, *cto, *to;
552 cto = to = (char *) malloc(3*len+2);
553 if ( cto==NULL ) return( NULL );
554 iconv(from_utf8, (iconv_arg2_t) &cfrom, &in_left, &cto, &out_left);
555 *cto++ = '\0';
556 *cto++ = '\0';
557 *cto++ = '\0';
558 *cto++ = '\0';
559 return( to );
560 }
561 #endif
562 if ( local_encoding==e_utf8 )
563 return( xstrdup( ufrom )); /* Well, that's easy */
564 u2from = utf82u_copy(ufrom);
565 ret = u2def_copy(u2from);
566 free(u2from);
567 return( ret );
568 }
569