1*16d86563SAlexander Pyhalov /*
2*16d86563SAlexander Pyhalov * CDDL HEADER START
3*16d86563SAlexander Pyhalov *
4*16d86563SAlexander Pyhalov * The contents of this file are subject to the terms of the
5*16d86563SAlexander Pyhalov * Common Development and Distribution License (the "License").
6*16d86563SAlexander Pyhalov * You may not use this file except in compliance with the License.
7*16d86563SAlexander Pyhalov *
8*16d86563SAlexander Pyhalov * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9*16d86563SAlexander Pyhalov * or http://www.opensolaris.org/os/licensing.
10*16d86563SAlexander Pyhalov * See the License for the specific language governing permissions
11*16d86563SAlexander Pyhalov * and limitations under the License.
12*16d86563SAlexander Pyhalov *
13*16d86563SAlexander Pyhalov * When distributing Covered Code, include this CDDL HEADER in each
14*16d86563SAlexander Pyhalov * file and include the License file at src/OPENSOLARIS.LICENSE.
15*16d86563SAlexander Pyhalov * If applicable, add the following below this CDDL HEADER, with the
16*16d86563SAlexander Pyhalov * fields enclosed by brackets "[]" replaced with your own identifying
17*16d86563SAlexander Pyhalov * information: Portions Copyright [yyyy] [name of copyright owner]
18*16d86563SAlexander Pyhalov *
19*16d86563SAlexander Pyhalov * CDDL HEADER END
20*16d86563SAlexander Pyhalov */
/*
 * Copyright (c) 1998-1999, 2001 by Sun Microsystems, Inc.
 * All rights reserved.
 *
 * Following is how we process BOM and subsequent bytes in this program:
 * - UCS-2BE, UTF-16BE, UCS-4BE, UTF-32BE, UCS-2LE, UTF-16LE, UCS-4LE, and
 *   UTF-32LE don't care about BOM. From the beginning, they are properly
 *   serialized without the BOM character.
 * - In other encodings, UCS-2, UCS-4, UTF-16, and UTF-32, the initial byte
 *   ordering is of the current processor's byte ordering. During the first
 *   iconv() call, if BOM appears as the first character of the entire
 *   iconv input stream, the byte order will be changed accordingly.
 *   We will use 'bom_written' data field of the conversion descriptor to
 *   save this particular information, in other words, whether we have
 *   already checked the first character of the stream for the BOM.
 */
37*16d86563SAlexander Pyhalov
38*16d86563SAlexander Pyhalov
39*16d86563SAlexander Pyhalov #include <stdlib.h>
40*16d86563SAlexander Pyhalov #include <errno.h>
41*16d86563SAlexander Pyhalov #include <sys/types.h>
42*16d86563SAlexander Pyhalov #include <sys/isa_defs.h>
43*16d86563SAlexander Pyhalov #include "ucs_to_unihan.h"
44*16d86563SAlexander Pyhalov #include "common_def.h"
45*16d86563SAlexander Pyhalov #include "common_han.h"
46*16d86563SAlexander Pyhalov
47*16d86563SAlexander Pyhalov typedef struct {
48*16d86563SAlexander Pyhalov int _magic;
49*16d86563SAlexander Pyhalov boolean _need_byte_swap;
50*16d86563SAlexander Pyhalov boolean _bom_written;
51*16d86563SAlexander Pyhalov boolean _is_little_endian;
52*16d86563SAlexander Pyhalov
53*16d86563SAlexander Pyhalov } _icv_state_t;
54*16d86563SAlexander Pyhalov
55*16d86563SAlexander Pyhalov static hcode_type ucs_to_unihan (uint_t ucs_char);
56*16d86563SAlexander Pyhalov extern hcode_type _utf8_to_unified_hangul (hcode_type);
57*16d86563SAlexander Pyhalov
58*16d86563SAlexander Pyhalov void *
_icv_open()59*16d86563SAlexander Pyhalov _icv_open()
60*16d86563SAlexander Pyhalov {
61*16d86563SAlexander Pyhalov _icv_state_t *cd = (_icv_state_t *)calloc(1, sizeof(_icv_state_t));
62*16d86563SAlexander Pyhalov
63*16d86563SAlexander Pyhalov if (cd == (_icv_state_t *)NULL) {
64*16d86563SAlexander Pyhalov errno = ENOMEM;
65*16d86563SAlexander Pyhalov return((void *)-1);
66*16d86563SAlexander Pyhalov }
67*16d86563SAlexander Pyhalov
68*16d86563SAlexander Pyhalov cd->_magic = MAGIC_NUMBER;
69*16d86563SAlexander Pyhalov
70*16d86563SAlexander Pyhalov #if defined(UTF_16BE) || defined(UCS_2BE) || defined(UCS_4BE) || \
71*16d86563SAlexander Pyhalov defined(UTF_32BE)
72*16d86563SAlexander Pyhalov cd->_is_little_endian = false;
73*16d86563SAlexander Pyhalov cd->_bom_written = true;
74*16d86563SAlexander Pyhalov #elif defined(UTF_16LE) || defined(UCS_2LE) || defined(UCS_4LE) || \
75*16d86563SAlexander Pyhalov defined(UTF_32LE)
76*16d86563SAlexander Pyhalov cd->_is_little_endian = true;
77*16d86563SAlexander Pyhalov cd->_bom_written = true;
78*16d86563SAlexander Pyhalov #elif defined(__IS_LITTLE_ENDIAN)
79*16d86563SAlexander Pyhalov cd->_is_little_endian = true;
80*16d86563SAlexander Pyhalov #endif
81*16d86563SAlexander Pyhalov
82*16d86563SAlexander Pyhalov cd->_need_byte_swap = false;
83*16d86563SAlexander Pyhalov
84*16d86563SAlexander Pyhalov return((void *)cd);
85*16d86563SAlexander Pyhalov }
86*16d86563SAlexander Pyhalov
87*16d86563SAlexander Pyhalov
88*16d86563SAlexander Pyhalov void
_icv_close(_icv_state_t * cd)89*16d86563SAlexander Pyhalov _icv_close(_icv_state_t *cd)
90*16d86563SAlexander Pyhalov {
91*16d86563SAlexander Pyhalov if (! cd)
92*16d86563SAlexander Pyhalov errno = EBADF;
93*16d86563SAlexander Pyhalov else
94*16d86563SAlexander Pyhalov free((void *)cd);
95*16d86563SAlexander Pyhalov }
96*16d86563SAlexander Pyhalov
97*16d86563SAlexander Pyhalov
98*16d86563SAlexander Pyhalov size_t
_icv_iconv(_icv_state_t * cd,char ** inbuf,size_t * inbufleft,char ** outbuf,size_t * outbufleft)99*16d86563SAlexander Pyhalov _icv_iconv(_icv_state_t *cd, char **inbuf, size_t *inbufleft, char **outbuf,
100*16d86563SAlexander Pyhalov size_t *outbufleft)
101*16d86563SAlexander Pyhalov {
102*16d86563SAlexander Pyhalov size_t ret_val = 0;
103*16d86563SAlexander Pyhalov uchar_t *ib;
104*16d86563SAlexander Pyhalov uchar_t *ob;
105*16d86563SAlexander Pyhalov uchar_t *ibtail;
106*16d86563SAlexander Pyhalov uchar_t *obtail;
107*16d86563SAlexander Pyhalov uint_t u4;
108*16d86563SAlexander Pyhalov uint_t u4_2;
109*16d86563SAlexander Pyhalov register int i;
110*16d86563SAlexander Pyhalov
111*16d86563SAlexander Pyhalov hcode_type unihan;
112*16d86563SAlexander Pyhalov unihan.code = 0x00;
113*16d86563SAlexander Pyhalov
114*16d86563SAlexander Pyhalov if (! cd) {
115*16d86563SAlexander Pyhalov errno = EBADF;
116*16d86563SAlexander Pyhalov return((size_t)-1);
117*16d86563SAlexander Pyhalov }
118*16d86563SAlexander Pyhalov
119*16d86563SAlexander Pyhalov if (!inbuf || !(*inbuf))
120*16d86563SAlexander Pyhalov return((size_t)0);
121*16d86563SAlexander Pyhalov
122*16d86563SAlexander Pyhalov ib = (uchar_t *)*inbuf;
123*16d86563SAlexander Pyhalov ob = (uchar_t *)*outbuf;
124*16d86563SAlexander Pyhalov ibtail = ib + *inbufleft;
125*16d86563SAlexander Pyhalov obtail = ob + *outbufleft;
126*16d86563SAlexander Pyhalov
127*16d86563SAlexander Pyhalov #if defined(UCS_2) || defined(UCS_4) || defined(UTF_16) || defined(UTF_32)
128*16d86563SAlexander Pyhalov if (! cd->_bom_written) {
129*16d86563SAlexander Pyhalov if ((ibtail - ib) < ICV_FETCH_UCS_SIZE) {
130*16d86563SAlexander Pyhalov errno = EINVAL;
131*16d86563SAlexander Pyhalov ret_val = (size_t)-1;
132*16d86563SAlexander Pyhalov goto need_more_input_err;
133*16d86563SAlexander Pyhalov }
134*16d86563SAlexander Pyhalov
135*16d86563SAlexander Pyhalov for (u4 = 0, i = 0; i < ICV_FETCH_UCS_SIZE; i++)
136*16d86563SAlexander Pyhalov u4 = (u4 << 8) | ((uint_t)(*(ib + i)));
137*16d86563SAlexander Pyhalov
138*16d86563SAlexander Pyhalov /* Big endian, Little endian, or, not specified?? */
139*16d86563SAlexander Pyhalov if (u4 == ICV_BOM_IN_BIG_ENDIAN) {
140*16d86563SAlexander Pyhalov ib += ICV_FETCH_UCS_SIZE;
141*16d86563SAlexander Pyhalov cd->_is_little_endian = false;
142*16d86563SAlexander Pyhalov } else if (u4 == ICV_BOM_IN__IS_LITTLE_ENDIAN) {
143*16d86563SAlexander Pyhalov ib += ICV_FETCH_UCS_SIZE;
144*16d86563SAlexander Pyhalov cd->_is_little_endian = true;
145*16d86563SAlexander Pyhalov }
146*16d86563SAlexander Pyhalov }
147*16d86563SAlexander Pyhalov /*
148*16d86563SAlexander Pyhalov * Once BOM checking is done, regardless of whether we had the BOM or
149*16d86563SAlexander Pyhalov * not, we treat the BOM sequence as a ZWNBSP character from now on.
150*16d86563SAlexander Pyhalov */
151*16d86563SAlexander Pyhalov cd->_bom_written = true;
152*16d86563SAlexander Pyhalov #endif
153*16d86563SAlexander Pyhalov
154*16d86563SAlexander Pyhalov while (ib < ibtail) {
155*16d86563SAlexander Pyhalov if ((ibtail - ib) < ICV_FETCH_UCS_SIZE) {
156*16d86563SAlexander Pyhalov errno = EINVAL;
157*16d86563SAlexander Pyhalov ret_val = (size_t)-1;
158*16d86563SAlexander Pyhalov break;
159*16d86563SAlexander Pyhalov }
160*16d86563SAlexander Pyhalov
161*16d86563SAlexander Pyhalov u4 = u4_2 = 0;
162*16d86563SAlexander Pyhalov if (cd->_is_little_endian) {
163*16d86563SAlexander Pyhalov for (i = ICV_FETCH_UCS_SIZE - 1; i >= 0; i--)
164*16d86563SAlexander Pyhalov u4 = (u4 << 8) | ((uint_t)(*(ib + i)));
165*16d86563SAlexander Pyhalov } else {
166*16d86563SAlexander Pyhalov for (i = 0; i < ICV_FETCH_UCS_SIZE; i++)
167*16d86563SAlexander Pyhalov u4 = (u4 << 8) | ((uint_t)(*(ib + i)));
168*16d86563SAlexander Pyhalov }
169*16d86563SAlexander Pyhalov
170*16d86563SAlexander Pyhalov #if defined(UCS_2) || defined(UCS_2BE) || defined(UCS_2LE)
171*16d86563SAlexander Pyhalov if (u4 >= 0x00fffe || (u4 >= 0x00d800 && u4 <= 0x00dfff)) {
172*16d86563SAlexander Pyhalov errno = EILSEQ;
173*16d86563SAlexander Pyhalov ret_val = (size_t)-1;
174*16d86563SAlexander Pyhalov break;
175*16d86563SAlexander Pyhalov }
176*16d86563SAlexander Pyhalov #elif defined(UTF_16) || defined(UTF_16BE) || defined(UTF_16LE)
177*16d86563SAlexander Pyhalov if ((u4 >= 0x00dc00 && u4 <= 0x00dfff) || u4 >= 0x00fffe) {
178*16d86563SAlexander Pyhalov errno = EILSEQ;
179*16d86563SAlexander Pyhalov ret_val = (size_t)-1;
180*16d86563SAlexander Pyhalov break;
181*16d86563SAlexander Pyhalov }
182*16d86563SAlexander Pyhalov
183*16d86563SAlexander Pyhalov if (u4 >= 0x00d800 && u4 <= 0x00dbff) {
184*16d86563SAlexander Pyhalov if ((ibtail - ib) < ICV_FETCH_UCS_SIZE_TWO) {
185*16d86563SAlexander Pyhalov errno = EINVAL;
186*16d86563SAlexander Pyhalov ret_val = (size_t)-1;
187*16d86563SAlexander Pyhalov break;
188*16d86563SAlexander Pyhalov }
189*16d86563SAlexander Pyhalov
190*16d86563SAlexander Pyhalov if (cd->_is_little_endian) {
191*16d86563SAlexander Pyhalov for (i = ICV_FETCH_UCS_SIZE_TWO - 1;
192*16d86563SAlexander Pyhalov i >= ICV_FETCH_UCS_SIZE;
193*16d86563SAlexander Pyhalov i--)
194*16d86563SAlexander Pyhalov u4_2 = (u4_2<<8)|((uint_t)(*(ib + i)));
195*16d86563SAlexander Pyhalov } else {
196*16d86563SAlexander Pyhalov for (i = ICV_FETCH_UCS_SIZE;
197*16d86563SAlexander Pyhalov i < ICV_FETCH_UCS_SIZE_TWO;
198*16d86563SAlexander Pyhalov i++)
199*16d86563SAlexander Pyhalov u4_2 = (u4_2<<8)|((uint_t)(*(ib + i)));
200*16d86563SAlexander Pyhalov }
201*16d86563SAlexander Pyhalov
202*16d86563SAlexander Pyhalov if (u4_2 < 0x00dc00 || u4_2 > 0x00dfff) {
203*16d86563SAlexander Pyhalov errno = EILSEQ;
204*16d86563SAlexander Pyhalov ret_val = (size_t)-1;
205*16d86563SAlexander Pyhalov break;
206*16d86563SAlexander Pyhalov }
207*16d86563SAlexander Pyhalov
208*16d86563SAlexander Pyhalov u4 = ((((u4 - 0x00d800) * 0x400) +
209*16d86563SAlexander Pyhalov (u4_2 - 0x00dc00)) & 0x0fffff) + 0x010000;
210*16d86563SAlexander Pyhalov }
211*16d86563SAlexander Pyhalov #elif defined(UTF_32) || defined(UTF_32BE) || defined(UTF_32LE)
212*16d86563SAlexander Pyhalov if (u4 == 0x00fffe || u4 == 0x00ffff || u4 > 0x10ffff ||
213*16d86563SAlexander Pyhalov (u4 >= 0x00d800 && u4 <= 0x00dfff)) {
214*16d86563SAlexander Pyhalov errno = EILSEQ;
215*16d86563SAlexander Pyhalov ret_val = (size_t)-1;
216*16d86563SAlexander Pyhalov break;
217*16d86563SAlexander Pyhalov }
218*16d86563SAlexander Pyhalov #elif defined(UCS_4) || defined(UCS_4BE) || defined(UCS_4LE)
219*16d86563SAlexander Pyhalov if (u4 == 0x00fffe || u4 == 0x00ffff || u4 > 0x7fffffff ||
220*16d86563SAlexander Pyhalov (u4 >= 0x00d800 && u4 <= 0x00dfff)) {
221*16d86563SAlexander Pyhalov errno = EILSEQ;
222*16d86563SAlexander Pyhalov ret_val = (size_t)-1;
223*16d86563SAlexander Pyhalov break;
224*16d86563SAlexander Pyhalov }
225*16d86563SAlexander Pyhalov #else
226*16d86563SAlexander Pyhalov #error "Fatal: one of the UCS macros need to be defined."
227*16d86563SAlexander Pyhalov #endif
228*16d86563SAlexander Pyhalov
229*16d86563SAlexander Pyhalov /*
230*16d86563SAlexander Pyhalov * Once we reach here, the "u4" contains a valid character
231*16d86563SAlexander Pyhalov * and thus we don't do any other error checking in
232*16d86563SAlexander Pyhalov * the below.
233*16d86563SAlexander Pyhalov */
234*16d86563SAlexander Pyhalov
235*16d86563SAlexander Pyhalov unihan = ucs_to_unihan (u4);
236*16d86563SAlexander Pyhalov if(unihan.byte.byte1 == '\0' && unihan.byte.byte2 == '\0' && unihan.byte.byte3 == '\0')
237*16d86563SAlexander Pyhalov {
238*16d86563SAlexander Pyhalov *ob++ = unihan.byte.byte4;
239*16d86563SAlexander Pyhalov ib += ((u4_2) ? ICV_FETCH_UCS_SIZE_TWO : ICV_FETCH_UCS_SIZE);
240*16d86563SAlexander Pyhalov continue;
241*16d86563SAlexander Pyhalov }
242*16d86563SAlexander Pyhalov if (cd->_need_byte_swap){
243*16d86563SAlexander Pyhalov *ob++ = (uchar_t) unihan.byte.byte4;
244*16d86563SAlexander Pyhalov *ob++ = (uchar_t) unihan.byte.byte3;
245*16d86563SAlexander Pyhalov } else {
246*16d86563SAlexander Pyhalov *ob++ = (uchar_t) unihan.byte.byte3;
247*16d86563SAlexander Pyhalov *ob++ = (uchar_t) unihan.byte.byte4;
248*16d86563SAlexander Pyhalov }
249*16d86563SAlexander Pyhalov
250*16d86563SAlexander Pyhalov ib += ((u4_2) ? ICV_FETCH_UCS_SIZE_TWO : ICV_FETCH_UCS_SIZE);
251*16d86563SAlexander Pyhalov }
252*16d86563SAlexander Pyhalov
253*16d86563SAlexander Pyhalov #if defined(UCS_2) || defined(UCS_4) || defined(UTF_16) || defined(UTF_32)
254*16d86563SAlexander Pyhalov need_more_input_err:
255*16d86563SAlexander Pyhalov #endif
256*16d86563SAlexander Pyhalov *inbuf = (char *)ib;
257*16d86563SAlexander Pyhalov *inbufleft = ibtail - ib;
258*16d86563SAlexander Pyhalov *outbuf = (char *)ob;
259*16d86563SAlexander Pyhalov *outbufleft = obtail - ob;
260*16d86563SAlexander Pyhalov
261*16d86563SAlexander Pyhalov return(ret_val);
262*16d86563SAlexander Pyhalov }
263*16d86563SAlexander Pyhalov
264*16d86563SAlexander Pyhalov static hcode_type
ucs_to_unihan(uint_t ucs_char)265*16d86563SAlexander Pyhalov ucs_to_unihan (uint_t ucs_char)
266*16d86563SAlexander Pyhalov {
267*16d86563SAlexander Pyhalov hcode_type unihan_char;
268*16d86563SAlexander Pyhalov hcode_type utf8_char;
269*16d86563SAlexander Pyhalov unihan_char.code = 0x00;
270*16d86563SAlexander Pyhalov
271*16d86563SAlexander Pyhalov if (ucs_char <= 0x7f) {
272*16d86563SAlexander Pyhalov utf8_char.code = ucs_char;
273*16d86563SAlexander Pyhalov
274*16d86563SAlexander Pyhalov } else if (ucs_char <= 0x7ff) {
275*16d86563SAlexander Pyhalov utf8_char.byte.byte3 = (uchar_t)(0xc0 | ((ucs_char & 0x07c0) >> 6));
276*16d86563SAlexander Pyhalov utf8_char.byte.byte4 = (uchar_t)(0x80 | (ucs_char & 0x003f));
277*16d86563SAlexander Pyhalov
278*16d86563SAlexander Pyhalov } else if (ucs_char <= 0x00ffff) {
279*16d86563SAlexander Pyhalov utf8_char.byte.byte2 = (uchar_t)(0xe0 | ((ucs_char & 0x0f000) >> 12));
280*16d86563SAlexander Pyhalov utf8_char.byte.byte3 = (uchar_t)(0x80 | ((ucs_char & 0x00fc0) >> 6));
281*16d86563SAlexander Pyhalov utf8_char.byte.byte4 = (uchar_t)(0x80 | (ucs_char & 0x0003f));
282*16d86563SAlexander Pyhalov } else if (ucs_char <= 0x1fffff) {
283*16d86563SAlexander Pyhalov utf8_char.byte.byte1 = (uchar_t)(0xf0 | ((ucs_char & 0x01c0000) >> 18));
284*16d86563SAlexander Pyhalov utf8_char.byte.byte2 = (uchar_t)(0x80 | ((ucs_char & 0x003f000) >> 12));
285*16d86563SAlexander Pyhalov utf8_char.byte.byte3 = (uchar_t)(0x80 | ((ucs_char & 0x0000fc0) >> 6));
286*16d86563SAlexander Pyhalov utf8_char.byte.byte4 = (uchar_t)(0x80 | (ucs_char & 0x000003f));
287*16d86563SAlexander Pyhalov } else
288*16d86563SAlexander Pyhalov utf8_char.code = 0x00;
289*16d86563SAlexander Pyhalov
290*16d86563SAlexander Pyhalov unihan_char = _utf8_to_unified_hangul (utf8_char);
291*16d86563SAlexander Pyhalov return unihan_char;
292*16d86563SAlexander Pyhalov }
293