1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright(c) 1998 Sun Microsystems, Inc.
23  * All rights reserved.
24  */
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <errno.h>
28 #include <sys/types.h>
29 #include <sys/isa_defs.h>
30 #include <gb2312_unicode.h>
31 #include "common_defs.h"
/* High bit of a byte: clear for ASCII input, set on both bytes of a two-byte character. */
#define MSB	0x80

/*
 * Three-byte UTF-8 sequence EF BF BD (U+FFFD) written to the output when an
 * input character cannot be mapped (counted as a non-identical conversion).
 */
#define UTF8_NON_ID_CHAR1 0xEF
#define UTF8_NON_ID_CHAR2 0xBF
#define UTF8_NON_ID_CHAR3 0xBD

/* Inclusive bounds accepted for the first and second byte of a two-byte input character. */
#define EUC_BYTE1_LOWER   0xA1
#define EUC_BYTE1_UPPER   0xFE
#define EUC_BYTE2_LOWER   EUC_BYTE1_LOWER
#define EUC_BYTE2_UPPER   EUC_BYTE1_UPPER

/* Shorthand used when a byte must be treated as unsigned. */
#define UCHAR unsigned char
44 
/*
 * Conversion state carried between calls for one conversion descriptor.
 */
typedef struct _icv_state {
	char	_lastc;		/* first byte of a two-byte character, saved while in G1 */
	short	_gstate;	/* current parser state: G0 or G1 (enum _GSTATE) */
        boolean little_endian;	/* true: emit 2-byte little-endian units; false: emit UTF-8 */
        boolean bom_written;	/* true once no byte-order mark remains to be emitted */
} _iconv_st;
51 
/*
 * Parser states: G0 - at a character boundary (next byte is ASCII or a first
 * byte); G1 - first byte saved in _lastc, second byte expected.
 */
enum	_GSTATE { G0, G1 };
53 
/* Forward declarations of the helpers used by _icv_iconv(). */
static  int  is_valid_gb2312(UCHAR, UCHAR);
int
gb_to_unicode(_iconv_st *st, char in_byte2, char *buf, int buflen, int *uconv_num);
57 
58 /*
59  * Open; called from iconv_open()
60  */
61 void *
_icv_open()62 _icv_open()
63 {
64 	_iconv_st *st;
65 
66 	if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) {
67 		errno = ENOMEM;
68 		return ((void *) -1);
69 	}
70 
71 	st->_gstate = G0;
72 	st->little_endian = false;
73 	st->bom_written = false;
74 #if defined(UCS_2LE)
75 	st->little_endian = true;
76 	st->bom_written = true;
77 #endif
78 	return ((void *)st);
79 }
80 
81 
82 /*
83  * Close; called from iconv_close()
84  */
85 void
_icv_close(_iconv_st * st)86 _icv_close(_iconv_st *st)
87 {
88 	if (st == NULL)
89 		errno = EBADF;
90 	else
91 		free(st);
92 }
93 
94 
95 /*
96  * Actual conversion; called from iconv()
97  */
98 size_t
_icv_iconv(_iconv_st * st,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)99 _icv_iconv(_iconv_st *st, char **inbuf, size_t*inbytesleft,
100 			char **outbuf, size_t*outbytesleft)
101 {
102 	int	n;
103         int	uconv_num = 0;
104 
105 	if (st == NULL) {
106 		errno = EBADF;
107 		return (size_t)-1;
108 	}
109 	if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
110 		st->_gstate = G0;
111 		return (size_t)0;
112 	}
113 
114 	errno = 0;
115 
116 	while (*inbytesleft > 0 && *outbytesleft > 0) {
117 	    switch (st->_gstate) {
118 	    case G0:
119 		if ( **inbuf & MSB ) {
120 		    st->_lastc = **inbuf;
121 		    st->_gstate = G1;
122 		} else {		/* ASCII */
123 		  /*
124 		   * code conversion for UCS-2LE to support Samba
125 		   */
126 		  if (st->little_endian) {
127 		      if (!st->bom_written) {
128 		         if (*outbytesleft < 4)
129 			    errno = E2BIG;
130 		         else {
131 			    *(*outbuf)++ = (uchar_t)0xff;
132 			    *(*outbuf)++ = (uchar_t)0xfe;
133 
134 			    st->bom_written = true;
135 			    *outbytesleft -= 2;
136 			 }
137 		      }
138 
139 		      if (*outbytesleft < 2)
140 			errno = E2BIG;
141 		      else {
142 			*(*outbuf)++ = **inbuf;
143 			*(*outbuf)++ = (uchar_t)0x0;
144 			*outbytesleft -= 2;
145 		      }
146 		  } else {
147 		    **outbuf = **inbuf;
148 		    (*outbuf)++, (*outbytesleft)--;
149 		  }
150 		}
151 		break;
152 	    case G1:
153 		if (**inbuf & MSB ) {
154 		    int uconv_num_internal = 0;
155 
156 		    /* bugfix - 4669831 iconv from zh_CN.euc to UTF-8 dumps core on Intel. */
157 		    if ( !is_valid_gb2312((UCHAR)st->_lastc, (UCHAR)**inbuf))
158 		     {
159 			errno = EILSEQ;
160 			break;
161 		     }
162 
163 		    n = gb_to_unicode(st, **inbuf, *outbuf,
164 				      *outbytesleft, &uconv_num_internal);
165 		    if (n > 0) {
166 			(*outbuf) += n, (*outbytesleft) -= n;
167 
168 		        uconv_num += uconv_num_internal;
169 
170 			st->_gstate = G0;
171 		    } else {
172 			errno = E2BIG;
173 		    }
174 	        } else {
175 		    errno = EILSEQ;
176 		}
177 		break;
178 	    }
179 
180 	    if (errno) break;
181 
182 	    (*inbuf)++, (*inbytesleft)--;
183 	}
184 
185         if (*inbytesleft == 0 && st->_gstate != G0)
186                 errno = EINVAL;
187 
188 	if (*inbytesleft > 0 && *outbytesleft == 0)
189 	    errno = E2BIG;
190 
191         if (errno) {
192 	     /*
193 	      * if error, *inbuf points to the byte following the last byte
194 	      * successfully used in the conversion.
195 	      */
196 	     *inbuf -= (st->_gstate - G0);
197 	     *inbytesleft += (st->_gstate - G0);
198 	     st->_gstate = G0;
199 	     return ((size_t) -1);
200 	}
201 
202 	return uconv_num;
203 }
204 
205 static int
is_valid_gb2312(UCHAR byte1,UCHAR byte2)206 is_valid_gb2312(UCHAR byte1, UCHAR byte2)
207 {
208    if ( (byte1 < EUC_BYTE1_LOWER || byte1 > EUC_BYTE1_UPPER) ||
209 	(byte2 < EUC_BYTE2_LOWER || byte2 > EUC_BYTE2_UPPER) ) {
210         return 0;
211     }
212 
213    return 1;
214 }
215 
216 
217 /*
218  * return: > 0 - converted with enough space
219  *	   = 0 - no space in outbuf
220  */
221 int
gb_to_unicode(st,in_byte2,buf,buflen,uconv_num)222 gb_to_unicode(st, in_byte2, buf, buflen, uconv_num)
223 _iconv_st *st;
224 char	in_byte2;
225 char	*buf;
226 int	buflen;
227 int	*uconv_num;
228 {
229 	int	idx;
230 	int	unicode;
231 	char    in_byte1 = st->_lastc;
232 
233 	idx = (((in_byte1 & 0xff) - 0xa1) * 94)  + (in_byte2 & 0xff) - 0xa1;
234 	/*
235 	 * code conversion for UCS-2LE to support samba in Solaris
236 	 */
237 	if (st->little_endian) {
238 	   int size = 0;
239 
240 	   if (idx < 0 || idx >= GBMAX) {
241 	      unicode = ICV_CHAR_UCS2_REPLACEMENT;
242 	      *uconv_num = 1;
243 	   } else
244 	      unicode = Unicode[idx];
245 
246 	   if (!st->bom_written) {
247 	      if (buflen < 4)
248 		return 0;
249 
250 	      *(buf + size++) = (uchar_t)0xff;
251 	      *(buf + size++) = (uchar_t)0xfe;
252 	      st->bom_written = true;
253 	   }
254 
255 	   if (buflen < 2)
256 	     return 0;
257 
258 	   *(buf + size++) = (uchar_t)(unicode & 0xff);
259 	   *(buf + size++) = (uchar_t)((unicode >> 8) & 0xff);
260 
261 	   return size;
262 	}
263 
264         /* bugfix - 4669831 iconv from zh_CN.euc to UTF-8 dumps core on Intel. */
265 	if (idx >= 0 && idx < GBMAX ) {
266 		unicode = Unicode[idx];
267 		if (unicode >= 0x0080 && unicode <= 0x07ff) {
268 		    if ( buflen < 2 )
269 			return 0;
270 		    *buf = ((unicode >> 6) & 0x1f) | 0xc0;
271 		    *(buf+1) = (unicode & 0x3f) | MSB;
272 		    return 2;
273 		}
274 		if (unicode >= 0x0800 && unicode <= 0xffff) {
275 		    if ( buflen < 3 )
276 			return 0;
277 		    *buf = ((unicode >> 12) & 0x0f) | 0xe0;
278 		    *(buf+1) = ((unicode >> 6) & 0x3f) | MSB;
279 		    *(buf+2) = (unicode & 0x3f) | MSB;
280 		    return 3;
281 		}
282 	}
283 	if ( buflen < 3 )
284 	    return 0;
285 
286 	*buf     = UTF8_NON_ID_CHAR1;
287 	*(buf+1) = UTF8_NON_ID_CHAR2;
288 	*(buf+2) = UTF8_NON_ID_CHAR3;
289 
290         /* non-identical conversion */
291         *uconv_num = 1;
292 
293 	return 3;
294 }
295