/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
21 
/*
 * Copyright (c) 1995, by Sun Microsystems, Inc.
 * All rights reserved.
 */
26 
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <sys/types.h>
30 #include <sys/isa_defs.h>
31 #include <errno.h>
32 #include "unicode_big5.h"	/* UTF8 to Big-5 mapping table */
33 #include "common_defs.h"
34 
/* Bit masks applied to single bytes of input. */
#define	MSB		0x80	/* most significant bit of a byte */
#define	ONEBYTE		0xff	/* mask selecting the right-most byte */

/* Written twice in place of a character that has no Big-5 mapping. */
#define	NON_ID_CHAR	'?'	/* non-identified character */
39 
40 typedef struct _icv_state {
41 	char	keepc[6];	/* maximum # byte of UTF8 code */
42 	short	ustate;
43 	int	_errno;		/* internal errno */
44         boolean little_endian;
45         boolean bom_written;
46 } _iconv_st;
47 
/*
 * States of the input state machine in _icv_iconv(); the current one is
 * kept in _iconv_st.ustate.
 */
enum _USTATE {
	U0,	/* between characters: next byte starts a new character */
	U1,	/* have 1st byte of a 2 byte character, need the 2nd */
	U2,	/* have 1st byte of a 3 byte character, need the 2nd */
	U3,	/* have two bytes of a 3 byte character, need the 3rd */
	U4,	/* character complete: look it up and write Big-5 */
	U5,	/* have 1st byte of a 4 byte character, need the 2nd */
	U6,	/* have two bytes of a 4 byte character, need the 3rd */
	U7	/* have three bytes of a 4 byte character, need the 4th */
};
49 
50 static int get_big5_by_utf(uint_t, int *, unsigned long *);
51 static int utf8_to_big5(int, unsigned long, char *, size_t, int *);
52 static int binsearch(unsigned long, utf_big5[], int);
53 
54 
55 /*
56  * Open; called from iconv_open()
57  */
58 void *
_icv_open()59 _icv_open()
60 {
61 	_iconv_st *st;
62 
63 	if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) {
64 		errno = ENOMEM;
65 		return ((void *) -1);
66 	}
67 
68 	st->ustate = U0;
69 	st->_errno = 0;
70         st->little_endian = false;
71         st->bom_written = false;
72 #if defined(UCS_2LE)
73         st->little_endian = true;
74         st->bom_written = true;
75 #endif
76 	return ((void *) st);
77 }
78 
79 
80 /*
81  * Close; called from iconv_close()
82  */
83 void
_icv_close(_iconv_st * st)84 _icv_close(_iconv_st *st)
85 {
86 	if (!st)
87 		errno = EBADF;
88 	else
89 		free(st);
90 }
91 
92 
93 /*
94  * Actual conversion; called from iconv()
95  */
96 /*=========================================================
97  *
98  *       State Machine for interpreting UTF8 code
99  *
100  *=========================================================
101  *                        2nd byte  3rd byte  4th byte
102  *          +----->------->------>U5----->U6------------>U7
103  *          |                                            |
104  *          |     3 byte unicode                         |
105  *          +----->------->-------+                      |
106  *          |                     |                      |
107  *          ^                     v                      |
108  *          |  2 byte             U2 ---> U3             |
109  *          |  unicode                    v              |
110  * +------> U0 -------> U1                +-------->U4---+
111  * ^  ascii |           |                           ^    |
112  * |        |           +-------->--------->--------+    |
113  * |        v                                            v
114  * +----<---+-----<------------<------------<------------+
115  *
116  *=========================================================*/
117 size_t
_icv_iconv(_iconv_st * st,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)118 _icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft,
119 				char **outbuf, size_t *outbytesleft)
120 {
121 	int		n, unidx;
122 	unsigned long	big5code;
123 	int		uconv_num = 0;
124 	int		utf8_len = 0;
125 	uint_t          ucs;
126 
127 #ifdef DEBUG
128     fprintf(stderr, "==========     iconv(): UTF2 --> Big-5     ==========\n");
129 #endif
130 	if (st == NULL) {
131 		errno = EBADF;
132 		return ((size_t) -1);
133 	}
134 
135 	if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
136 		st->ustate = U0;
137 		st->_errno = 0;
138 		return ((size_t) 0);
139 	}
140 
141 	st->_errno = 0;		/* reset internal errno */
142 	errno = 0;		/* reset external errno */
143 
144 	/* a state machine for interpreting UTF8 code */
145 	while (*inbytesleft > 0 && *outbytesleft > 0) {
146 
147 	        uchar_t  first_byte;
148 		int	 uconv_num_internal = 0;
149 
150 		switch (st->ustate) {
151 		case U0:		/* assuming ASCII in the beginning */
152                        /*
153                         * Code converion for UCS-2LE to support Samba
154                         */
155                         if (st->little_endian) {
156                           st->ustate = U1;
157                           st->keepc[0] = **inbuf;
158                         }
159 			else if ((**inbuf & MSB) == 0) {	/* ASCII */
160 				**outbuf = **inbuf;
161 				(*outbuf)++;
162 				(*outbytesleft)--;
163 			} else {	/* Chinese character */
164 				if ((**inbuf & 0xe0) == 0xc0) {	/* 2 byte unicode 0xc2..0xdf */
165 
166 				        /* invalid sequence if the first char is either 0xc0 or 0xc1 */
167 				        if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
168 				            st->_errno = errno = EILSEQ;
169 				        else {
170 					    st->ustate = U1;
171 					    st->keepc[0] = **inbuf;
172 					}
173 				} else if ((**inbuf & 0xf0) == 0xe0) {	/* 3 byte 0xe0..0xef */
174 					st->ustate = U2;
175 					st->keepc[0] = **inbuf;
176 				} else {
177 				        /* four bytes of UTF-8 sequences */
178 				        if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
179 					   st->_errno = errno = EILSEQ;
180 				        else {
181 					   st->ustate = U5;
182 					   st->keepc[0] = **inbuf;
183 					}
184 				}
185 			}
186 			break;
187 		case U1:		/* 2 byte unicode */
188 			if ((**inbuf & 0xc0) == MSB || st->little_endian) {
189 				utf8_len = 2;
190 				st->keepc[1] = **inbuf;
191 
192                                 /*
193                                  * Code conversion for UCS-2LE to support Samba
194                                  */
195                                 if  (st->little_endian) {
196                                   /*
197                                    * It's ASCII
198                                    */
199                                   if (st->keepc[1] == 0 && (st->keepc[0] & 0x80) == 0) {
200                                     *(*outbuf)++ = st->keepc[0];
201 				    (*outbytesleft)--;
202                                     st->ustate = U0;
203                                     break;
204                                   }
205 
206                                   ucs = ((st->keepc[1] & 0xff) << 8) | (st->keepc[0] & 0xff);
207 
208                                 } else
209 				  convert_utf8_to_ucs4((uchar_t*)(&st->keepc[0]), utf8_len, &ucs);
210 
211 				st->ustate = U4;
212 #ifdef DEBUG
213     fprintf(stderr, "UTF8: %02x%02x   --> ",
214 	st->keepc[0]&ONEBYTE, st->keepc[1]&ONEBYTE);
215 #endif
216 				continue;	/* should not advance *inbuf */
217 			} else {
218 				st->_errno = errno = EILSEQ;
219 			}
220 			break;
221 		case U2:		/* 3 byte unicode - 2nd byte */
222 
223 		        first_byte = st->keepc[0];
224 
225 		        /* if the first byte is 0xed, it is illegal sequence if the second
226 			 * one is between 0xa0 and 0xbf because surrogate section is ill-formed
227 			 */
228 		        if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
229 			    ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] )
230 		            st->_errno = errno = EILSEQ;
231 			else {
232 				st->ustate = U3;
233 				st->keepc[1] = **inbuf;
234 			}
235 			break;
236 		case U3:		/* 3 byte unicode - 3rd byte */
237 			if ((**inbuf & 0xc0) == MSB) {
238 				st->ustate = U4;
239 				utf8_len = 3;
240 				st->keepc[2] = **inbuf;
241 
242 				convert_utf8_to_ucs4((uchar_t*)(&st->keepc[0]), utf8_len, &ucs);
243 #ifdef DEBUG
244     fprintf(stderr, "UTF8: %02x%02x%02x --> ", st->keepc[0]&ONEBYTE,
245 		st->keepc[1]&ONEBYTE, **inbuf&ONEBYTE);
246 #endif
247 				continue;	/* should not advance *inbuf */
248 			} else {
249 				st->_errno = errno = EILSEQ;
250 			}
251 			break;
252 		case U4:
253 
254 			n = get_big5_by_utf(ucs, &unidx, &big5code);
255 
256 		        if ( n == -1 )
257 		         {  /* unicode is either 0xfffe or 0xffff */
258 			    st->_errno = errno = EILSEQ;
259 			    break;
260 			 }
261 
262 /* comment the following lines out to ignore the non-Big5 characters
263 			if (n != 0) {	* legal unicode;illegal Big5 *
264 				st->_errno = errno = EILSEQ;
265 				break;
266 			}
267 */
268 
269 			n = utf8_to_big5(unidx, big5code,
270 					*outbuf, *outbytesleft, &uconv_num_internal);
271 			if (n > 0) {
272 				(*outbuf) += n;
273 				(*outbytesleft) -= n;
274 
275 				uconv_num += uconv_num_internal;
276 
277 				st->ustate = U0;
278 			} else {
279 				st->_errno = errno = E2BIG;
280 			}
281 			break;
282 		case U5:
283 
284 		        first_byte = st->keepc[0];
285 
286 		        /* if the first byte is 0xf0, it is illegal sequence if
287 			 * the second one is between 0x80 and 0x8f
288 			 * for Four-Byte UTF: U+10000..U+10FFFF
289 			 */
290 		        if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
291 			    ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] )
292 		            st->_errno = errno = EILSEQ;
293 		        else
294 		           {
295 			      st->ustate = U6;
296 			      st->keepc[1] = **inbuf;
297 		           }
298 		        break;
299 		case U6:
300 		        if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */
301 		          {
302 			     st->ustate = U7;
303 			     st->keepc[2] = **inbuf;
304 			  }
305 		        else
306 		          st->_errno = errno = EILSEQ;
307 		        break;
308 		case U7:
309 		        if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */
310 		          {  /* replace with double NON_ID_CHARs */
311 
312 			     utf8_len = 4;
313 			     st->keepc[3] = **inbuf;
314 
315 			     convert_utf8_to_ucs4((uchar_t*)(&st->keepc[0]), utf8_len, &ucs);
316 
317 			     st->ustate = U4;
318 			     continue;
319 
320 #if 0
321 			     if ( *outbytesleft < 2 )
322 			        st->_errno = errno = E2BIG;
323 			     else
324 			       {
325 				  **outbuf = NON_ID_CHAR;
326 				  *(*outbuf+1) = NON_ID_CHAR;
327 				  (*outbytesleft) -= 2;
328 
329 				  uconv_num++;
330 
331 				  st->ustate = U0;
332 			       }
333 #endif
334 			  }
335 		        else
336 		          st->_errno = errno = EILSEQ;
337 		        break;
338 		default:			/* should never come here */
339 			st->_errno = errno = EILSEQ;
340 			st->ustate = U0;	/* reset state */
341 			break;
342 		}
343 
344 		if (st->_errno) {
345 #ifdef DEBUG
346     fprintf(stderr, "!!!!!\tst->_errno = %d\tst->ustate = %d\n",
347 		st->_errno, st->ustate);
348 #endif
349 			break;
350 		}
351 
352 		(*inbuf)++;
353 		(*inbytesleft)--;
354 
355 	}
356 
357         if (*inbytesleft == 0 && st->ustate != U0)
358                 errno = EINVAL;
359 
360 	if (*inbytesleft > 0 && *outbytesleft == 0)
361 		errno = E2BIG;
362 
363 	if (errno) {
364 		int num_reversed_bytes = 0;
365 
366 		switch (st->ustate)
367 	        {
368 		 case U1:
369 		   num_reversed_bytes = 1;
370 		   break;
371 		 case U2:
372 		   num_reversed_bytes = 1;
373 		   break;
374 		 case U3:
375 		   num_reversed_bytes = 2;
376 		   break;
377 		 case U4:
378 		   num_reversed_bytes = utf8_len - 1;
379 		   break;
380 		 case U5:
381 		   num_reversed_bytes = 1;
382 		   break;
383 		 case U6:
384 		   num_reversed_bytes = 2;
385 		   break;
386 		 case U7:
387 		   num_reversed_bytes = 3;
388 		   break;
389 	        }
390 
391 		/*
392 		 * if error, *inbuf points to the byte following the last byte
393 		 * successfully used in the conversion.
394 		 */
395 		*inbuf -= num_reversed_bytes;
396 		*inbytesleft += num_reversed_bytes;
397 		st->ustate = U0;
398 		return ((size_t) -1);
399 	}
400 
401 	return uconv_num;
402 }
403 
404 /*
405  * Match Big-5 code by UTF8 code;
406  * Return: = 0 - match from Unicode to Big-5 found
407  *         = 1 - match from Unicode to Big-5 NOT found
408  *         =-1 - illegal sequence
409  *
410  * Since binary search of the UTF8 to Big-5 table is necessary, might as well
411  * return index and Big-5 code matching to the unicode.
412  */
get_big5_by_utf(uint_t ucs,int * unidx,unsigned long * big5code)413 static int get_big5_by_utf(uint_t ucs, int *unidx, unsigned long *big5code)
414 {
415         /* 0xfffe and 0xffff should not be allowed */
416         if ( ucs == 0xFFFE || ucs == 0xFFFF ) return -1;
417 
418 	*unidx = binsearch(ucs, utf_big5_tab, MAX_BIG5_NUM);
419 	if ((*unidx) >= 0)
420 		*big5code = utf_big5_tab[*unidx].big5code;
421 	else
422 		return(1);	/* match from UTF8 to Big-5 not found */
423 #ifdef DEBUG
424     fprintf(stderr, "Unicode=%04x, idx=%5d, Big-5=%x ", ucs, *unidx, *big5code);
425 #endif
426 
427 	return(0);
428 }
429 
430 
431 /*
432  * ISO/IEC 10646 (Unicode) --> Big-5
433  * Unicode --> UTF8 (FSS-UTF)
434  *             (File System Safe Universal Character Set Transformation Format)
435  * Return: > 0 - converted with enough space in output buffer
436  *         = 0 - no space in outbuf
437  */
utf8_to_big5(int unidx,unsigned long big5code,char * buf,size_t buflen,int * uconv_num)438 static int utf8_to_big5(int unidx, unsigned long big5code, char *buf, size_t buflen, int *uconv_num)
439 {
440 	unsigned long	val;		/* Big-5 value */
441 	char		c1, c2, big5_str[3];
442 
443 	if (buflen < 2) {
444 		errno = E2BIG;
445 		return(0);
446 	}
447 
448 	if (unidx < 0) {	/* no match from UTF8 to Big-5 */
449 		*buf = *(buf+1) = NON_ID_CHAR;
450 
451 		/* non-identical conversion */
452 		*uconv_num = 1;
453 
454 	} else {
455 		val = big5code & 0xffff;
456 		c1 = (char) ((val & 0xff00) >> 8);
457 		c2 = (char) (val & 0xff);
458 
459 	*buf = big5_str[0] = c1;
460 	*(buf+1) = big5_str[1] = c2;
461 	big5_str[2] = '\0';
462 	}
463 
464 #ifdef DEBUG
465     fprintf(stderr, "\t->%x %x<-\n", *buf, *(buf+1));
466 #endif
467 
468 	return(2);
469 }
470 
471 
472 /* binsearch: find x in v[0] <= v[1] <= ... <= v[n-1] */
binsearch(unsigned long x,utf_big5 v[],int n)473 static int binsearch(unsigned long x, utf_big5 v[], int n)
474 {
475 	int low, high, mid;
476 
477 	low = 0;
478 	high = n - 1;
479 	while (low <= high) {
480 		mid = (low + high) / 2;
481 		if (x < v[mid].unicode)
482 			high = mid - 1;
483 		else if (x > v[mid].unicode)
484 			low = mid + 1;
485 		else	/* found match */
486 			return mid;
487 	}
488 	return (-1);	/* no match */
489 }
490