1 /* Copyright (C) 2000-2012 by George Williams */
2 /*
3  * Redistribution and use in source and binary forms, with or without
4  * modification, are permitted provided that the following conditions are met:
5 
6  * Redistributions of source code must retain the above copyright notice, this
7  * list of conditions and the following disclaimer.
8 
9  * Redistributions in binary form must reproduce the above copyright notice,
10  * this list of conditions and the following disclaimer in the documentation
11  * and/or other materials provided with the distribution.
12 
13  * The name of the author may not be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15 
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
17  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
18  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
19  * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
20  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
22  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
24  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
25  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #include <config.h>		/* FF config file */
29 #include <gwwiconv.h>
30 #include <stddef.h>
31 #include <ustring.h>
32 #include <utype.h>
33 #include <charset.h>
34 #include <chardata.h>
35 
/* Encoding handed to the table-driven converters (encoding2u_strncpy and
 * u2encoding_strncpy) by the def2u / u2def helpers in this file whenever the
 * iconv path is not taken. */
enum encoding local_encoding = e_iso8859_1;
#if HAVE_ICONV_H
/* Name of the local encoding as passed to iconv_open(). While this is NULL
 * my_iconv_setup() reports failure and the table-driven converters are used. */
char *iconv_local_encoding_name = NULL;
#endif

/* Set once the "Unexpected encoding" message has been printed, so that the
 * warning is emitted at most once per process. */
static int bad_enc_warn = false;
42 
/* Does not handle conversions to Extended Unix Code (EUC) encodings */
44 
/*
 * Convert the NUL-terminated byte string _from, encoded as cs, into
 * unichar_t units stored at uto.
 *
 *   uto    destination buffer with room for at least n unichar_t units
 *   _from  NUL-terminated source; for e_unicode and e_unicode_backwards it is
 *          reinterpreted as an array of unichar_t
 *   n      maximum number of unichar_t units to store
 *   cs     encoding of _from
 *
 * Returns uto. As with strncpy, the result is NUL-terminated only when fewer
 * than n units were produced. Encodings for which there is no converter here
 * are treated as latin1 after a one-time warning on stderr.
 */
unichar_t *encoding2u_strncpy(unichar_t *uto, const char *_from, int n, enum encoding cs) {
    unichar_t *upt = uto;
    const unichar_t *table;
    int offset;
    const unsigned char *from = (const unsigned char *) _from;

    if ( cs<e_first2byte ) {
        /* Single-byte encodings. A NULL table means bytes map straight through. */
        table = unicode_from_alphabets[cs];
        if ( table==NULL ) {
            while ( *from && n>0 ) {
                *upt++ = *from++;
                --n;
            }
        } else {
            while ( *from && n>0 ) {
                *upt++ = table[*from++];
                --n;
            }
        }
    } else if ( cs<e_unicode ) {
        /* Never touch a zero-capacity buffer */
        if ( n>0 )
            *uto = '\0';
        switch ( cs ) {
          default:
            if ( !bad_enc_warn ) {
                bad_enc_warn = true;
                fprintf( stderr, "Unexpected encoding %d, I'll pretend it's latin1\n", cs );
            }
            return( encoding2u_strncpy(uto,_from,n,e_iso8859_1));
          case e_johab: case e_big5: case e_big5hkscs:
            /* offset is the value of the first two-byte code held in the table */
            if ( cs==e_big5 ) {
                offset = 0xa100;
                table = unicode_from_big5;
            } else if ( cs==e_big5hkscs ) {
                offset = 0x8100;
                table = unicode_from_big5hkscs;
            } else {
                offset = 0x8400;
                table = unicode_from_johab;
            }
            while ( *from && n>0 ) {
                if ( *from>=(offset>>8) && from[1]!='\0' ) {
                    *upt++ = table[ ((*from<<8) | from[1]) - offset ];
                    from += 2;
                } else
                    *upt++ = *from++;
                --n;
            }
            break;
          case e_wansung:
            while ( *from && n>0 ) {
                if ( *from>=0xa1 && from[1]>=0xa1 ) {
                    *upt++ = unicode_from_ksc5601[ (*from-0xa1)*94+(from[1]-0xa1) ];
                    from += 2;
                } else
                    *upt++ = *from++;
                --n;
            }
            break;
          case e_jisgb:
            while ( *from && n>0 ) {
                if ( *from>=0xa1 && from[1]>=0xa1 ) {
                    *upt++ = unicode_from_gb2312[ (*from-0xa1)*94+(from[1]-0xa1) ];
                    from += 2;
                } else
                    *upt++ = *from++;
                --n;
            }
            break;
          case e_sjis:
            while ( *from && n>0 ) {
                if ( *from<127 || ( *from>=161 && *from<=223 )) {
                    *upt++ = unicode_from_jis201[*from++];
                } else {
                    int ch1, ch2, row, col;

                    /* A lead byte directly followed by the terminator is a
                     * truncated character: stop rather than step over the NUL. */
                    if ( from[1]=='\0' )
                        break;
                    ch1 = *from++;
                    ch2 = *from++;
                    if ( ch1>=129 && ch1<=159 )
                        ch1 -= 112;
                    else
                        ch1 -= 176;
                    ch1 <<= 1;
                    if ( ch2>=159 )
                        ch2 -= 126;
                    else if ( ch2>127 ) {
                        --ch1;
                        ch2 -= 32;
                    } else {
                        --ch1;
                        ch2 -= 31;
                    }
                    /* Invalid byte pairs would index outside the 94x94 layout
                     * used below; map them to U+FFFD instead. */
                    row = ch1-0x21;
                    col = ch2-0x21;
                    if ( row>=0 && row<94 && col>=0 && col<94 )
                        *upt++ = unicode_from_jis208[row*94+col];
                    else
                        *upt++ = 0xfffd;
                }
                --n;
            }
            break;
        }
    } else if ( cs==e_unicode ) {
        const unichar_t *ufrom = (const unichar_t *) _from;
        while ( *ufrom && n>0 ) {
            *upt++ = *ufrom++;
            --n;
        }
    } else if ( cs==e_unicode_backwards ) {
        const unichar_t *ufrom = (const unichar_t *) _from;
        while ( *ufrom && n>0 ) {
            /* Reverse the byte order of the whole unit, whatever its width */
            unichar_t in = *ufrom++, swapped = 0;
            size_t i;
            for ( i=0; i<sizeof(unichar_t); ++i ) {
                swapped = (swapped<<8) | (in&0xff);
                in >>= 8;
            }
            *upt++ = swapped;
            --n;
        }
    } else if ( cs==e_utf8 ) {
        while ( *from && n>0 ) {
            if ( *from<=127 ) {
                *upt++ = *from++;
            } else if ( *from<=0xdf ) {
                if ( from[1]>=0x80 ) {
                    *upt++ = ((*from&0x1f)<<6) | (from[1]&0x3f);
                    from += 2;
                } else {
                    ++from;	/* Badly formed utf */
                    *upt++ = 0xfffd;
                }
            } else if ( *from<=0xef ) {
                if ( from[1]>=0x80 && from[2]>=0x80 ) {
                    *upt++ = ((*from&0xf)<<12) | ((from[1]&0x3f)<<6) | (from[2]&0x3f);
                    from += 3;
                } else {
                    ++from;	/* Badly formed utf */
                    *upt++ = 0xfffd;
                }
            } else if ( from[1]>=0x80 && from[2]>=0x80 && from[3]>=0x80 ) {
                /* Four-byte sequence: stored as a surrogate pair, two units */
                int w;

                if ( n<2 )
                    break;	/* no space for surrogate: truncate here */
                w = ( ((*from&0x7)<<2) | ((from[1]&0x30)>>4) )-1;
                *upt++ = 0xd800 | (w<<6) | ((from[1]&0xf)<<2) | ((from[2]&0x30)>>4);
                *upt++ = 0xdc00 | ((from[2]&0xf)<<6) | (from[3]&0x3f);
                from += 4;
                --n;		/* the second unit is counted below */
            } else {
                ++from;		/* Badly formed utf */
                *upt++ = 0xfffd;
            }
            --n;
        }
    } else {
        if ( !bad_enc_warn ) {
            bad_enc_warn = true;
            fprintf( stderr, "Unexpected encoding %d, I'll pretend it's latin1\n", cs );
        }
        return( encoding2u_strncpy(uto,_from,n,e_iso8859_1));
    }

    if ( n>0 )
        *upt = '\0';

    return( uto );
}
203 
u2encoding_strncpy(char * to,const unichar_t * ufrom,int n,enum encoding cs)204 char *u2encoding_strncpy(char *to, const unichar_t *ufrom, int n, enum encoding cs) {
205     char *pt = to;
206 
207     /* we just ignore anything that doesn't fit in the encoding we look at */
208     if ( cs<e_first2byte ) {
209 	struct charmap *table = NULL;
210 	unsigned char *plane;
211 	table = alphabets_from_unicode[cs];
212 	if ( table==NULL ) {	/* ASCII */
213 	    while ( *ufrom && n>0 ) {
214 		int ch = *ufrom;
215 		if ( ch<127 ) {
216 		    *pt++ = ch;
217 		    --n;
218 		}
219 		++ufrom;
220 	    }
221 	} else {
222 	    while ( *ufrom && n>0 ) {
223 		int highch = *ufrom>>8, ch;
224 		if ( highch>=table->first && highch<=table->last &&
225 			    (plane = table->table[highch])!=NULL &&
226 			    (ch=plane[*ufrom&0xff])!=0 ) {
227 		    *pt++ = ch;
228 		    --n;
229 		}
230 		++ufrom;
231 	    }
232 	}
233 	if ( n>0 )
234 	    *pt = '\0';
235     } else if ( cs<e_unicode ) {
236 	struct charmap2 *table;
237 	unsigned short *plane;
238 	unsigned char *plane1;
239 
240 	*to = '\0';
241 	switch ( cs ) {
242 	  default:
243 	    if ( !bad_enc_warn ) {
244 		bad_enc_warn = true;
245 		fprintf( stderr, "Unexpected encoding %d, I'll pretend it's latin1\n", cs );
246 	    }
247 return( u2encoding_strncpy(to,ufrom,n,e_iso8859_1));
248 	}
249 	if ( n>0 )
250 	    *pt = '\0';
251     } else if ( cs==e_unicode ) {
252 	unichar_t *uto = (unichar_t *) to;
253 	while ( *ufrom && n>1 ) {
254 	    *uto++ = *ufrom++;
255 	    n-=sizeof(unichar_t);
256 	}
257 	if ( n>1 )
258 	    *uto = '\0';
259     } else if ( cs==e_unicode_backwards ) {
260 	unichar_t *uto = (unichar_t *) to;
261 	while ( *ufrom && n>sizeof(unichar_t)-1 ) {
262 	    unichar_t ch = (*ufrom>>24)|((*ufrom>>8)&0xff00)|
263 		    ((*ufrom<<8)&0xff0000)|(*ufrom<<24);
264 	    *uto++ = ch;
265 	    ++ufrom;
266 	    n-=sizeof(unichar_t);
267 	}
268 	if ( n>1 )
269 	    *uto = '\0';
270     } else if ( cs==e_utf8 ) {
271 	while ( *ufrom ) {
272 	    if ( *ufrom<0x80 ) {
273 		if ( n<=1 )
274 	break;
275 		*pt++ = *ufrom;
276 		--n;
277 	    } else if ( *ufrom<0x800 ) {
278 		if ( n<=2 )
279 	break;
280 		*pt++ = 0xc0 | (*ufrom>>6);
281 		*pt++ = 0x80 | (*ufrom&0x3f);
282 		n -= 2;
283 	    } else if ( *ufrom>=0xd800 && *ufrom<0xdc00 && ufrom[1]>=0xdc00 && ufrom[1]<0xe000 ) {
284 		int u = ((*ufrom>>6)&0xf)+1, y = ((*ufrom&3)<<4) | ((ufrom[1]>>6)&0xf);
285 		if ( n<=4 )
286 	    break;
287 		*pt++ = 0xf0 | (u>>2);
288 		*pt++ = 0x80 | ((u&3)<<4) | ((*ufrom>>2)&0xf);
289 		*pt++ = 0x80 | y;
290 		*pt++ = 0x80 | (ufrom[1]&0x3f);
291 		n -= 4;
292 	    } else {
293 		if ( n<=3 )
294 	    break;
295 		*pt++ = 0xe0 | (*ufrom>>12);
296 		*pt++ = 0x80 | ((*ufrom>>6)&0x3f);
297 		*pt++ = 0x80 | (*ufrom&0x3f);
298 	    }
299 	    ++ufrom;
300 	}
301 	if ( n>1 )
302 	    *pt = '\0';
303     } else {
304 	if ( !bad_enc_warn ) {
305 	    bad_enc_warn = true;
306 	    fprintf( stderr, "Unexpected encoding %d, I'll pretend it's latin1\n", cs );
307 	}
308 return( u2encoding_strncpy(to,ufrom,n,e_iso8859_1));
309     }
310 
311 return( to );
312 }
313 
314 #if HAVE_ICONV_H
/* Copy of iconv_local_encoding_name as it was when my_iconv_setup() last
 * opened its converters; used to detect that the local encoding changed. */
static char *old_local_name=NULL;
/* Converters between the local encoding and unicode_name; (iconv_t) -1 while
 * not open. */
static iconv_t to_unicode=(iconv_t) (-1), from_unicode=(iconv_t) (-1);
/* Converters between the local encoding and "UTF-8"; (iconv_t) -1 while not
 * open. */
static iconv_t to_utf8=(iconv_t) (-1), from_utf8=(iconv_t) (-1);
/* Names without an explicit byte order; only tried when none of the
 * endian-specific names is accepted, and then verified with BytesNormal(). */
static char *names[] = { "UCS-4-INTERNAL", "UCS-4", "UCS4", "ISO-10646-UCS-4", "UTF-32", NULL };
/* Names tried first when the host is little endian */
static char *namesle[] = { "UCS-4LE", "UTF-32LE", NULL };
/* Names tried first when the host is big endian */
static char *namesbe[] = { "UCS-4BE", "UTF-32BE", NULL };
/* The entry of the lists above that iconv_open() accepted, or NULL if none
 * has been found yet. */
static char *unicode_name = NULL;
/* True when the name picked from names[] did not produce 'A' in host byte
 * order in BytesNormal(). */
static int byteswapped = false;
323 
BytesNormal(iconv_t latin1_2_unicode)324 static int BytesNormal(iconv_t latin1_2_unicode) {
325     union {
326 	int32 s;
327 	char c[4];
328     } u[8];
329     char *from = "A", *to = &u[0].c[0];
330     size_t in_left = 1, out_left = sizeof(u);
331     memset(u,0,sizeof(u));
332     iconv( latin1_2_unicode, (iconv_arg2_t) &from, &in_left, &to, &out_left);
333     if ( u[0].s=='A' )
334 return( true );
335 
336 return( false );
337 }
338 
my_iconv_setup(void)339 static int my_iconv_setup(void) {
340     char **testnames;
341     int i;
342     union {
343 	short s;
344 	char c[2];
345     } u;
346     iconv_t test;
347 
348     if ( iconv_local_encoding_name==NULL ) {
349 	if ( to_unicode!=(iconv_t) (-1) ) {
350 	    iconv_close(to_unicode);
351 	    iconv_close(from_unicode);
352 	    to_unicode = from_unicode = (iconv_t) (-1);
353 	}
354 return(false);
355     }
356     if ( old_local_name!=NULL && strcmp(old_local_name,iconv_local_encoding_name)==0 )
357 return( to_unicode!=(iconv_t) (-1) );
358 
359     free(old_local_name);
360     old_local_name = xstrdup(iconv_local_encoding_name);
361     to_utf8 = iconv_open("UTF-8",iconv_local_encoding_name);
362     from_utf8 = iconv_open(iconv_local_encoding_name,"UTF-8");
363 
364     if ( unicode_name==NULL ) {
365 	u.c[0] = 0x1; u.c[1] = 0x2;
366 	if ( u.s==0x201 ) {		/* Little endian */
367 	    testnames = namesle;
368 	} else {
369 	    testnames = namesbe;
370 	}
371 	for ( i=0; testnames[i]!=NULL; ++i ) {
372 	    test = iconv_open(testnames[i],"ISO-8859-1");
373 	    if ( test!=(iconv_t) -1 && test!=NULL ) {
374 		iconv_close(test);
375 		unicode_name = testnames[i];
376 	break;
377 	    }
378 	}
379 	if ( unicode_name==NULL ) {
380 	    for ( i=0; names[i]!=NULL; ++i ) {
381 		test = iconv_open(names[i],"ISO-8859-1");
382 		if ( test!=(iconv_t) -1 && test!=NULL ) {
383 		    byteswapped = !BytesNormal(test);
384 		    iconv_close(test);
385 		    unicode_name = names[i];
386 	    break;
387 		}
388 	    }
389 	}
390     }
391     if ( unicode_name == NULL ) {
392 	fprintf( stderr, "Could not find a name for Unicode which iconv could understand.\n" );
393 return( false );
394     } else if ( byteswapped ) {
395 	fprintf( stderr, "The only name for Unicode that iconv understood produced unexpected results.\nPerhaps %s was byte swapped.\n", unicode_name );
396 return( false );
397     }
398 
399     to_unicode = iconv_open(unicode_name,iconv_local_encoding_name);
400     from_unicode = iconv_open(iconv_local_encoding_name,unicode_name);
401     if ( to_unicode == (iconv_t) (-1) || to_utf8 == (iconv_t) (-1) ) {
402 	fprintf( stderr, "iconv failed to understand encoding %s\n",
403 		iconv_local_encoding_name);
404 return( false );
405     }
406 return( true );
407 }
408 #endif
409 
/*
 * Convert at most n bytes of the local-encoding string from into unichar_t
 * units stored at uto, which must have room for n units.
 *
 * Uses iconv when my_iconv_setup() succeeds, otherwise the table-driven
 * encoding2u_strncpy() with local_encoding. Returns uto. A terminating zero
 * unit is written only if a whole unit still fits after the converted text.
 */
unichar_t *def2u_strncpy(unichar_t *uto, const char *from, int n) {
#if HAVE_ICONV_H
    if ( my_iconv_setup() ) {
        size_t in_left = 0, out_left;
        char *cto = (char *) uto;

        if ( n<=0 )
            return( uto );
        /* Hand iconv only the bytes before the terminator, never more than n */
        while ( in_left<(size_t) n && from[in_left]!='\0' )
            ++in_left;
        out_left = sizeof(unichar_t)*(size_t) n;
        iconv(to_unicode, (iconv_arg2_t) &from, &in_left, &cto, &out_left);
        /* Terminate with a complete unit or not at all */
        if ( out_left>=sizeof(unichar_t) )
            memset(cto,0,sizeof(unichar_t));
        return( uto );
    }
#endif
    return( encoding2u_strncpy(uto,from,n,local_encoding));
}
425 
u2def_strncpy(char * to,const unichar_t * ufrom,int n)426 char *u2def_strncpy(char *to, const unichar_t *ufrom, int n) {
427 #if HAVE_ICONV_H
428     if ( my_iconv_setup() ) {
429 	size_t in_left = sizeof(unichar_t)*n, out_left = n;
430 	char *cfrom = (char *) ufrom, *cto=to;
431 	iconv(from_unicode, (iconv_arg2_t) &cfrom, &in_left, &cto, &out_left);
432 	if ( cto<to+n ) *cto++ = '\0';
433 	if ( cto<to+n ) *cto++ = '\0';
434 	if ( cto<to+n ) *cto++ = '\0';
435 	if ( cto<to+n ) *cto++ = '\0';
436 return( to );
437     }
438 #endif
439 return( u2encoding_strncpy(to,ufrom,n,local_encoding));
440 }
441 
def2u_copy(const char * from)442 unichar_t *def2u_copy(const char *from) {
443     int len;
444     unichar_t *uto, *ret;
445 
446     if ( from==NULL ) return( NULL );
447     len = strlen(from);
448     uto = (unichar_t *) malloc((len+1)*sizeof(unichar_t));
449     if ( uto==NULL ) return( NULL );
450 #if HAVE_ICONV_H
451     if ( my_iconv_setup() ) {
452 	size_t in_left = len, out_left = sizeof(unichar_t)*len;
453 	char *cto = (char *) uto;
454 	iconv(to_unicode, (iconv_arg2_t) &from, &in_left, &cto, &out_left);
455 	*cto++ = '\0';
456 	*cto++ = '\0';
457 	*cto++ = '\0';
458 	*cto++ = '\0';
459 	return( uto );
460     }
461 #endif
462     ret = encoding2u_strncpy(uto,from,len,local_encoding);
463     if ( ret==NULL )
464 	free( uto );
465     else
466 	uto[len] = '\0';
467     return( ret );
468 }
469 
u2def_copy(const unichar_t * ufrom)470 char *u2def_copy(const unichar_t *ufrom) {
471     int len;
472     char *to, *ret;
473 
474     if ( ufrom==NULL ) return( NULL );
475     len = u_strlen(ufrom);
476 #if HAVE_ICONV_H
477     if ( my_iconv_setup() ) {
478 	size_t in_left = sizeof(unichar_t)*len, out_left = 3*len;
479 	char *cfrom = (char *) ufrom, *cto;
480 	cto = to = (char *) malloc(3*len+2);
481 	if ( cto==NULL ) return( NULL );
482 	iconv(from_unicode, (iconv_arg2_t) &cfrom, &in_left, &cto, &out_left);
483 	*cto++ = '\0';
484 	*cto++ = '\0';
485 	*cto++ = '\0';
486 	*cto++ = '\0';
487 	return( to );
488     }
489 #endif
490     if ( local_encoding==e_utf8 )
491 	len *= 3;
492     if ( local_encoding>=e_first2byte )
493 	len *= 2;
494     to = (char *) malloc(len+sizeof(unichar_t));
495     if ( to==NULL ) return( NULL );
496     ret = u2encoding_strncpy(to,ufrom,len,local_encoding);
497     if ( ret==NULL )
498 	free( to );
499     else if ( local_encoding<e_first2byte )
500 	to[len] = '\0';
501     else {
502 	to[len] = '\0';
503 	to[len+1] = '\0';
504     }
505     return( ret );
506 }
507 
def2utf8_copy(const char * from)508 char *def2utf8_copy(const char *from) {
509     int len;
510     char *ret;
511     unichar_t *temp, *uto;
512 
513     if ( from==NULL ) return( NULL );
514     len = strlen(from);
515 #if HAVE_ICONV_H
516     if ( my_iconv_setup() ) {
517 	size_t in_left = len, out_left = 3*(len+1);
518 	char *cto = (char *) malloc(3*(len+1)), *cret = cto;
519 	if ( cto==NULL ) return( NULL );
520 	iconv(to_utf8, (iconv_arg2_t) &from, &in_left, &cto, &out_left);
521 	*cto++ = '\0';
522 	*cto++ = '\0';
523 	*cto++ = '\0';
524 	*cto++ = '\0';
525 	return( cret );
526     }
527 #endif
528     uto = (unichar_t *) malloc(sizeof(unichar_t)*(len+1));
529     if ( uto==NULL ) return( NULL );
530     temp = encoding2u_strncpy(uto,from,len,local_encoding);
531     if ( temp==NULL ) {
532 	free( uto );
533 	return( NULL );
534     }
535     uto[len] = '\0';
536     ret = u2utf8_copy(uto);
537     free(uto);
538     return( ret );
539 }
540 
utf82def_copy(const char * ufrom)541 char *utf82def_copy(const char *ufrom) {
542     int len;
543     char *ret;
544     unichar_t *u2from;
545 
546     if ( ufrom==NULL ) return( NULL );
547     len = strlen(ufrom);
548 #if HAVE_ICONV_H
549     if ( my_iconv_setup() ) {
550 	size_t in_left = len, out_left = 3*len;
551 	char *cfrom = (char *) ufrom, *cto, *to;
552 	cto = to = (char *) malloc(3*len+2);
553 	if ( cto==NULL ) return( NULL );
554 	iconv(from_utf8, (iconv_arg2_t) &cfrom, &in_left, &cto, &out_left);
555 	*cto++ = '\0';
556 	*cto++ = '\0';
557 	*cto++ = '\0';
558 	*cto++ = '\0';
559 	return( to );
560     }
561 #endif
562     if ( local_encoding==e_utf8 )
563        return( xstrdup( ufrom )); /* Well, that's easy */
564     u2from = utf82u_copy(ufrom);
565     ret = u2def_copy(u2from);
566     free(u2from);
567     return( ret );
568 }
569