1*16d86563SAlexander Pyhalov /*
2*16d86563SAlexander Pyhalov  * CDDL HEADER START
3*16d86563SAlexander Pyhalov  *
4*16d86563SAlexander Pyhalov  * The contents of this file are subject to the terms of the
5*16d86563SAlexander Pyhalov  * Common Development and Distribution License (the "License").
6*16d86563SAlexander Pyhalov  * You may not use this file except in compliance with the License.
7*16d86563SAlexander Pyhalov  *
8*16d86563SAlexander Pyhalov  * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9*16d86563SAlexander Pyhalov  * or http://www.opensolaris.org/os/licensing.
10*16d86563SAlexander Pyhalov  * See the License for the specific language governing permissions
11*16d86563SAlexander Pyhalov  * and limitations under the License.
12*16d86563SAlexander Pyhalov  *
13*16d86563SAlexander Pyhalov  * When distributing Covered Code, include this CDDL HEADER in each
14*16d86563SAlexander Pyhalov  * file and include the License file at src/OPENSOLARIS.LICENSE.
15*16d86563SAlexander Pyhalov  * If applicable, add the following below this CDDL HEADER, with the
16*16d86563SAlexander Pyhalov  * fields enclosed by brackets "[]" replaced with your own identifying
17*16d86563SAlexander Pyhalov  * information: Portions Copyright [yyyy] [name of copyright owner]
18*16d86563SAlexander Pyhalov  *
19*16d86563SAlexander Pyhalov  * CDDL HEADER END
20*16d86563SAlexander Pyhalov  */
/*
 * Copyright (c) 1998-1999, 2001 by Sun Microsystems, Inc.
 * All rights reserved.
 *
 * Following is how we process BOM and subsequent bytes in this program:
 * - UCS-2BE, UTF-16BE, UCS-4BE, UTF-32BE, UCS-2LE, UTF-16LE, UCS-4LE, and
 *   UTF-32LE don't care about BOM. From the beginning, they are properly
 *   serialized without the BOM character.
 * - In other encodings, UCS-2, UCS-4, UTF-16, and UTF-32, the initial byte
 *   ordering is of the current processor's byte ordering. During the first
 *   iconv() call, if BOM appears as the first character of the entire
 *   iconv input stream, the byte order will be changed accordingly.
 *   We will use the '_bom_written' data field of the conversion descriptor
 *   to save this particular information, in other words, whether we have
 *   already examined the first character for a BOM.
 */
37*16d86563SAlexander Pyhalov 
38*16d86563SAlexander Pyhalov 
39*16d86563SAlexander Pyhalov #include <stdlib.h>
40*16d86563SAlexander Pyhalov #include <errno.h>
41*16d86563SAlexander Pyhalov #include <sys/types.h>
42*16d86563SAlexander Pyhalov #include <sys/isa_defs.h>
43*16d86563SAlexander Pyhalov #include "ucs_to_unihan.h"
44*16d86563SAlexander Pyhalov #include "common_def.h"
45*16d86563SAlexander Pyhalov #include "common_han.h"
46*16d86563SAlexander Pyhalov 
47*16d86563SAlexander Pyhalov typedef struct {
48*16d86563SAlexander Pyhalov   int         _magic;
49*16d86563SAlexander Pyhalov   boolean     _need_byte_swap;
50*16d86563SAlexander Pyhalov   boolean     _bom_written;
51*16d86563SAlexander Pyhalov   boolean     _is_little_endian;
52*16d86563SAlexander Pyhalov 
53*16d86563SAlexander Pyhalov } _icv_state_t;
54*16d86563SAlexander Pyhalov 
55*16d86563SAlexander Pyhalov static hcode_type ucs_to_unihan (uint_t ucs_char);
56*16d86563SAlexander Pyhalov extern hcode_type _utf8_to_unified_hangul (hcode_type);
57*16d86563SAlexander Pyhalov 
58*16d86563SAlexander Pyhalov void *
_icv_open()59*16d86563SAlexander Pyhalov _icv_open()
60*16d86563SAlexander Pyhalov {
61*16d86563SAlexander Pyhalov   _icv_state_t *cd = (_icv_state_t *)calloc(1, sizeof(_icv_state_t));
62*16d86563SAlexander Pyhalov 
63*16d86563SAlexander Pyhalov   if (cd == (_icv_state_t *)NULL) {
64*16d86563SAlexander Pyhalov     errno = ENOMEM;
65*16d86563SAlexander Pyhalov     return((void *)-1);
66*16d86563SAlexander Pyhalov   }
67*16d86563SAlexander Pyhalov 
68*16d86563SAlexander Pyhalov   cd->_magic = MAGIC_NUMBER;
69*16d86563SAlexander Pyhalov 
70*16d86563SAlexander Pyhalov #if defined(UTF_16BE) || defined(UCS_2BE) || defined(UCS_4BE) || \
71*16d86563SAlexander Pyhalov 	defined(UTF_32BE)
72*16d86563SAlexander Pyhalov   cd->_is_little_endian = false;
73*16d86563SAlexander Pyhalov   cd->_bom_written = true;
74*16d86563SAlexander Pyhalov #elif defined(UTF_16LE) || defined(UCS_2LE) || defined(UCS_4LE) || \
75*16d86563SAlexander Pyhalov 	defined(UTF_32LE)
76*16d86563SAlexander Pyhalov   cd->_is_little_endian = true;
77*16d86563SAlexander Pyhalov   cd->_bom_written = true;
78*16d86563SAlexander Pyhalov #elif defined(__IS_LITTLE_ENDIAN)
79*16d86563SAlexander Pyhalov   cd->_is_little_endian = true;
80*16d86563SAlexander Pyhalov #endif
81*16d86563SAlexander Pyhalov 
82*16d86563SAlexander Pyhalov   cd->_need_byte_swap = false;
83*16d86563SAlexander Pyhalov 
84*16d86563SAlexander Pyhalov   return((void *)cd);
85*16d86563SAlexander Pyhalov }
86*16d86563SAlexander Pyhalov 
87*16d86563SAlexander Pyhalov 
88*16d86563SAlexander Pyhalov void
_icv_close(_icv_state_t * cd)89*16d86563SAlexander Pyhalov _icv_close(_icv_state_t *cd)
90*16d86563SAlexander Pyhalov {
91*16d86563SAlexander Pyhalov   if (! cd)
92*16d86563SAlexander Pyhalov     errno = EBADF;
93*16d86563SAlexander Pyhalov   else
94*16d86563SAlexander Pyhalov     free((void *)cd);
95*16d86563SAlexander Pyhalov }
96*16d86563SAlexander Pyhalov 
97*16d86563SAlexander Pyhalov 
98*16d86563SAlexander Pyhalov size_t
_icv_iconv(_icv_state_t * cd,char ** inbuf,size_t * inbufleft,char ** outbuf,size_t * outbufleft)99*16d86563SAlexander Pyhalov _icv_iconv(_icv_state_t *cd, char **inbuf, size_t *inbufleft, char **outbuf,
100*16d86563SAlexander Pyhalov 	   size_t *outbufleft)
101*16d86563SAlexander Pyhalov {
102*16d86563SAlexander Pyhalov   size_t ret_val = 0;
103*16d86563SAlexander Pyhalov   uchar_t *ib;
104*16d86563SAlexander Pyhalov   uchar_t *ob;
105*16d86563SAlexander Pyhalov   uchar_t *ibtail;
106*16d86563SAlexander Pyhalov   uchar_t *obtail;
107*16d86563SAlexander Pyhalov   uint_t u4;
108*16d86563SAlexander Pyhalov   uint_t u4_2;
109*16d86563SAlexander Pyhalov   register int i;
110*16d86563SAlexander Pyhalov 
111*16d86563SAlexander Pyhalov   hcode_type unihan;
112*16d86563SAlexander Pyhalov   unihan.code = 0x00;
113*16d86563SAlexander Pyhalov 
114*16d86563SAlexander Pyhalov   if (! cd) {
115*16d86563SAlexander Pyhalov     errno = EBADF;
116*16d86563SAlexander Pyhalov     return((size_t)-1);
117*16d86563SAlexander Pyhalov   }
118*16d86563SAlexander Pyhalov 
119*16d86563SAlexander Pyhalov   if (!inbuf || !(*inbuf))
120*16d86563SAlexander Pyhalov     return((size_t)0);
121*16d86563SAlexander Pyhalov 
122*16d86563SAlexander Pyhalov   ib = (uchar_t *)*inbuf;
123*16d86563SAlexander Pyhalov   ob = (uchar_t *)*outbuf;
124*16d86563SAlexander Pyhalov   ibtail = ib + *inbufleft;
125*16d86563SAlexander Pyhalov   obtail = ob + *outbufleft;
126*16d86563SAlexander Pyhalov 
127*16d86563SAlexander Pyhalov #if defined(UCS_2) || defined(UCS_4) || defined(UTF_16) || defined(UTF_32)
128*16d86563SAlexander Pyhalov   if (! cd->_bom_written) {
129*16d86563SAlexander Pyhalov     if ((ibtail - ib) < ICV_FETCH_UCS_SIZE) {
130*16d86563SAlexander Pyhalov       errno = EINVAL;
131*16d86563SAlexander Pyhalov       ret_val = (size_t)-1;
132*16d86563SAlexander Pyhalov       goto need_more_input_err;
133*16d86563SAlexander Pyhalov     }
134*16d86563SAlexander Pyhalov 
135*16d86563SAlexander Pyhalov     for (u4 = 0, i = 0; i < ICV_FETCH_UCS_SIZE; i++)
136*16d86563SAlexander Pyhalov       u4 = (u4 << 8) | ((uint_t)(*(ib + i)));
137*16d86563SAlexander Pyhalov 
138*16d86563SAlexander Pyhalov     /* Big endian, Little endian, or, not specified?? */
139*16d86563SAlexander Pyhalov     if (u4 == ICV_BOM_IN_BIG_ENDIAN) {
140*16d86563SAlexander Pyhalov       ib += ICV_FETCH_UCS_SIZE;
141*16d86563SAlexander Pyhalov       cd->_is_little_endian = false;
142*16d86563SAlexander Pyhalov     } else if (u4 == ICV_BOM_IN__IS_LITTLE_ENDIAN) {
143*16d86563SAlexander Pyhalov       ib += ICV_FETCH_UCS_SIZE;
144*16d86563SAlexander Pyhalov       cd->_is_little_endian = true;
145*16d86563SAlexander Pyhalov     }
146*16d86563SAlexander Pyhalov   }
147*16d86563SAlexander Pyhalov   /*
148*16d86563SAlexander Pyhalov    * Once BOM checking is done, regardless of whether we had the BOM or
149*16d86563SAlexander Pyhalov    * not, we treat the BOM sequence as a ZWNBSP character from now on.
150*16d86563SAlexander Pyhalov    */
151*16d86563SAlexander Pyhalov   cd->_bom_written = true;
152*16d86563SAlexander Pyhalov #endif
153*16d86563SAlexander Pyhalov 
154*16d86563SAlexander Pyhalov   while (ib < ibtail) {
155*16d86563SAlexander Pyhalov     if ((ibtail - ib) < ICV_FETCH_UCS_SIZE) {
156*16d86563SAlexander Pyhalov       errno = EINVAL;
157*16d86563SAlexander Pyhalov       ret_val = (size_t)-1;
158*16d86563SAlexander Pyhalov       break;
159*16d86563SAlexander Pyhalov     }
160*16d86563SAlexander Pyhalov 
161*16d86563SAlexander Pyhalov     u4 = u4_2 = 0;
162*16d86563SAlexander Pyhalov     if (cd->_is_little_endian) {
163*16d86563SAlexander Pyhalov       for (i = ICV_FETCH_UCS_SIZE - 1; i >= 0; i--)
164*16d86563SAlexander Pyhalov 	u4 = (u4 << 8) | ((uint_t)(*(ib + i)));
165*16d86563SAlexander Pyhalov     } else {
166*16d86563SAlexander Pyhalov       for (i = 0; i < ICV_FETCH_UCS_SIZE; i++)
167*16d86563SAlexander Pyhalov 	u4 = (u4 << 8) | ((uint_t)(*(ib + i)));
168*16d86563SAlexander Pyhalov     }
169*16d86563SAlexander Pyhalov 
170*16d86563SAlexander Pyhalov #if defined(UCS_2) || defined(UCS_2BE) || defined(UCS_2LE)
171*16d86563SAlexander Pyhalov     if (u4 >= 0x00fffe || (u4 >= 0x00d800 && u4 <= 0x00dfff)) {
172*16d86563SAlexander Pyhalov       errno = EILSEQ;
173*16d86563SAlexander Pyhalov       ret_val = (size_t)-1;
174*16d86563SAlexander Pyhalov       break;
175*16d86563SAlexander Pyhalov     }
176*16d86563SAlexander Pyhalov #elif defined(UTF_16) || defined(UTF_16BE) || defined(UTF_16LE)
177*16d86563SAlexander Pyhalov     if ((u4 >= 0x00dc00 && u4 <= 0x00dfff) || u4 >= 0x00fffe) {
178*16d86563SAlexander Pyhalov       errno = EILSEQ;
179*16d86563SAlexander Pyhalov       ret_val = (size_t)-1;
180*16d86563SAlexander Pyhalov       break;
181*16d86563SAlexander Pyhalov     }
182*16d86563SAlexander Pyhalov 
183*16d86563SAlexander Pyhalov     if (u4 >= 0x00d800 && u4 <= 0x00dbff) {
184*16d86563SAlexander Pyhalov       if ((ibtail - ib) < ICV_FETCH_UCS_SIZE_TWO) {
185*16d86563SAlexander Pyhalov 	errno = EINVAL;
186*16d86563SAlexander Pyhalov 	ret_val = (size_t)-1;
187*16d86563SAlexander Pyhalov 	break;
188*16d86563SAlexander Pyhalov       }
189*16d86563SAlexander Pyhalov 
190*16d86563SAlexander Pyhalov       if (cd->_is_little_endian) {
191*16d86563SAlexander Pyhalov 	for (i = ICV_FETCH_UCS_SIZE_TWO - 1;
192*16d86563SAlexander Pyhalov 	     i >= ICV_FETCH_UCS_SIZE;
193*16d86563SAlexander Pyhalov 	     i--)
194*16d86563SAlexander Pyhalov 	  u4_2 = (u4_2<<8)|((uint_t)(*(ib + i)));
195*16d86563SAlexander Pyhalov       } else {
196*16d86563SAlexander Pyhalov 	for (i = ICV_FETCH_UCS_SIZE;
197*16d86563SAlexander Pyhalov 	     i < ICV_FETCH_UCS_SIZE_TWO;
198*16d86563SAlexander Pyhalov 	     i++)
199*16d86563SAlexander Pyhalov 	  u4_2 = (u4_2<<8)|((uint_t)(*(ib + i)));
200*16d86563SAlexander Pyhalov       }
201*16d86563SAlexander Pyhalov 
202*16d86563SAlexander Pyhalov       if (u4_2 < 0x00dc00 || u4_2 > 0x00dfff) {
203*16d86563SAlexander Pyhalov 	errno = EILSEQ;
204*16d86563SAlexander Pyhalov 	ret_val = (size_t)-1;
205*16d86563SAlexander Pyhalov 	break;
206*16d86563SAlexander Pyhalov       }
207*16d86563SAlexander Pyhalov 
208*16d86563SAlexander Pyhalov       u4 = ((((u4 - 0x00d800) * 0x400) +
209*16d86563SAlexander Pyhalov 	     (u4_2 - 0x00dc00)) & 0x0fffff) + 0x010000;
210*16d86563SAlexander Pyhalov     }
211*16d86563SAlexander Pyhalov #elif defined(UTF_32) || defined(UTF_32BE) || defined(UTF_32LE)
212*16d86563SAlexander Pyhalov     if (u4 == 0x00fffe || u4 == 0x00ffff || u4 > 0x10ffff ||
213*16d86563SAlexander Pyhalov 	(u4 >= 0x00d800 && u4 <= 0x00dfff)) {
214*16d86563SAlexander Pyhalov       errno = EILSEQ;
215*16d86563SAlexander Pyhalov       ret_val = (size_t)-1;
216*16d86563SAlexander Pyhalov       break;
217*16d86563SAlexander Pyhalov     }
218*16d86563SAlexander Pyhalov #elif defined(UCS_4) || defined(UCS_4BE) || defined(UCS_4LE)
219*16d86563SAlexander Pyhalov     if (u4 == 0x00fffe || u4 == 0x00ffff || u4 > 0x7fffffff ||
220*16d86563SAlexander Pyhalov 	(u4 >= 0x00d800 && u4 <= 0x00dfff)) {
221*16d86563SAlexander Pyhalov       errno = EILSEQ;
222*16d86563SAlexander Pyhalov       ret_val = (size_t)-1;
223*16d86563SAlexander Pyhalov       break;
224*16d86563SAlexander Pyhalov     }
225*16d86563SAlexander Pyhalov #else
226*16d86563SAlexander Pyhalov #error	"Fatal: one of the UCS macros need to be defined."
227*16d86563SAlexander Pyhalov #endif
228*16d86563SAlexander Pyhalov 
229*16d86563SAlexander Pyhalov     /*
230*16d86563SAlexander Pyhalov      * Once we reach here, the "u4" contains a valid character
231*16d86563SAlexander Pyhalov      * and thus we don't do any other error checking in
232*16d86563SAlexander Pyhalov      * the below.
233*16d86563SAlexander Pyhalov      */
234*16d86563SAlexander Pyhalov 
235*16d86563SAlexander Pyhalov     unihan = ucs_to_unihan (u4);
236*16d86563SAlexander Pyhalov     if(unihan.byte.byte1 == '\0' && unihan.byte.byte2 == '\0' && unihan.byte.byte3 == '\0')
237*16d86563SAlexander Pyhalov     {
238*16d86563SAlexander Pyhalov 	*ob++ = unihan.byte.byte4;
239*16d86563SAlexander Pyhalov 	ib += ((u4_2) ? ICV_FETCH_UCS_SIZE_TWO : ICV_FETCH_UCS_SIZE);
240*16d86563SAlexander Pyhalov 	continue;
241*16d86563SAlexander Pyhalov     }
242*16d86563SAlexander Pyhalov     if (cd->_need_byte_swap){
243*16d86563SAlexander Pyhalov       *ob++ = (uchar_t) unihan.byte.byte4;
244*16d86563SAlexander Pyhalov       *ob++ = (uchar_t) unihan.byte.byte3;
245*16d86563SAlexander Pyhalov     } else {
246*16d86563SAlexander Pyhalov       *ob++ = (uchar_t) unihan.byte.byte3;
247*16d86563SAlexander Pyhalov       *ob++ = (uchar_t) unihan.byte.byte4;
248*16d86563SAlexander Pyhalov     }
249*16d86563SAlexander Pyhalov 
250*16d86563SAlexander Pyhalov     ib += ((u4_2) ? ICV_FETCH_UCS_SIZE_TWO : ICV_FETCH_UCS_SIZE);
251*16d86563SAlexander Pyhalov   }
252*16d86563SAlexander Pyhalov 
253*16d86563SAlexander Pyhalov #if defined(UCS_2) || defined(UCS_4) || defined(UTF_16) || defined(UTF_32)
254*16d86563SAlexander Pyhalov  need_more_input_err:
255*16d86563SAlexander Pyhalov #endif
256*16d86563SAlexander Pyhalov   *inbuf = (char *)ib;
257*16d86563SAlexander Pyhalov   *inbufleft = ibtail - ib;
258*16d86563SAlexander Pyhalov   *outbuf = (char *)ob;
259*16d86563SAlexander Pyhalov   *outbufleft = obtail - ob;
260*16d86563SAlexander Pyhalov 
261*16d86563SAlexander Pyhalov   return(ret_val);
262*16d86563SAlexander Pyhalov }
263*16d86563SAlexander Pyhalov 
264*16d86563SAlexander Pyhalov static hcode_type
ucs_to_unihan(uint_t ucs_char)265*16d86563SAlexander Pyhalov ucs_to_unihan (uint_t ucs_char)
266*16d86563SAlexander Pyhalov {
267*16d86563SAlexander Pyhalov   hcode_type unihan_char;
268*16d86563SAlexander Pyhalov   hcode_type utf8_char;
269*16d86563SAlexander Pyhalov   unihan_char.code = 0x00;
270*16d86563SAlexander Pyhalov 
271*16d86563SAlexander Pyhalov   if (ucs_char <= 0x7f) {
272*16d86563SAlexander Pyhalov     utf8_char.code = ucs_char;
273*16d86563SAlexander Pyhalov 
274*16d86563SAlexander Pyhalov   } else if (ucs_char <= 0x7ff) {
275*16d86563SAlexander Pyhalov     utf8_char.byte.byte3 = (uchar_t)(0xc0 | ((ucs_char & 0x07c0) >> 6));
276*16d86563SAlexander Pyhalov     utf8_char.byte.byte4 = (uchar_t)(0x80 |  (ucs_char & 0x003f));
277*16d86563SAlexander Pyhalov 
278*16d86563SAlexander Pyhalov   } else if (ucs_char <= 0x00ffff) {
279*16d86563SAlexander Pyhalov     utf8_char.byte.byte2 = (uchar_t)(0xe0 | ((ucs_char & 0x0f000) >> 12));
280*16d86563SAlexander Pyhalov     utf8_char.byte.byte3 = (uchar_t)(0x80 | ((ucs_char & 0x00fc0) >> 6));
281*16d86563SAlexander Pyhalov     utf8_char.byte.byte4 = (uchar_t)(0x80 |  (ucs_char & 0x0003f));
282*16d86563SAlexander Pyhalov   } else if (ucs_char <= 0x1fffff) {
283*16d86563SAlexander Pyhalov     utf8_char.byte.byte1 = (uchar_t)(0xf0 | ((ucs_char & 0x01c0000) >> 18));
284*16d86563SAlexander Pyhalov     utf8_char.byte.byte2 = (uchar_t)(0x80 | ((ucs_char & 0x003f000) >> 12));
285*16d86563SAlexander Pyhalov     utf8_char.byte.byte3 = (uchar_t)(0x80 | ((ucs_char & 0x0000fc0) >> 6));
286*16d86563SAlexander Pyhalov     utf8_char.byte.byte4 = (uchar_t)(0x80 |  (ucs_char & 0x000003f));
287*16d86563SAlexander Pyhalov   } else
288*16d86563SAlexander Pyhalov     utf8_char.code = 0x00;
289*16d86563SAlexander Pyhalov 
290*16d86563SAlexander Pyhalov   unihan_char = _utf8_to_unified_hangul (utf8_char);
291*16d86563SAlexander Pyhalov   return unihan_char;
292*16d86563SAlexander Pyhalov }
293