1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/types.h> 27 #include <sys/param.h> 28 #include <sys/sysmacros.h> 29 #include <sys/systm.h> 30 #include <sys/debug.h> 31 #include <sys/kmem.h> 32 #include <sys/sunddi.h> 33 #include <sys/byteorder.h> 34 #include <sys/errno.h> 35 #include <sys/u8_textprep.h> 36 #include <sys/kiconv.h> 37 #include <sys/kiconv_cck_common.h> 38 39 /* 40 * Common kiconv_open method for UTF-8 -> CCK conversion. 41 */ 42 void * 43 kiconv_open_to_cck() 44 { 45 kiconv_state_t st; 46 47 st = (kiconv_state_t)kmem_alloc(sizeof (kiconv_state_data_t), KM_SLEEP); 48 49 st->bom_processed = 0; 50 51 return ((void *)st); 52 } 53 54 /* 55 * Common kiconv_close method for UTF-8 -> CCK conversion. 56 */ 57 int 58 kiconv_close_to_cck(void *kcd) 59 { 60 if (! kcd || kcd == (void *)-1) 61 return (EBADF); 62 63 kmem_free(kcd, sizeof (kiconv_state_data_t)); 64 65 return (0); 66 } 67 68 /* 69 * Common routine to convert UTF-8 sequence to CCK legal character sequence. 70 */ 71 size_t 72 kiconv_utf8_to_cck(void *kcd, char **inbuf, size_t *inbytesleft, 73 char **outbuf, size_t *outbytesleft, int *errno, 74 kiconv_utf8tocck_t ptr_utf8tocck) 75 { 76 uchar_t *ib; 77 uchar_t *ob; 78 uchar_t *ibtail; 79 uchar_t *obtail; 80 uchar_t *oldib; 81 size_t ret_val; 82 size_t i; /* temp variable in for loop */ 83 uint32_t u8; 84 int8_t sz; 85 86 /* Check on the kiconv code conversion descriptor. */ 87 if (! kcd || kcd == (void *)-1) { 88 *errno = EBADF; 89 return ((size_t)-1); 90 } 91 92 /* If this is a state reset request, process and return. */ 93 if (! inbuf || !(*inbuf)) { 94 ((kiconv_state_t)kcd)->bom_processed = 0; 95 return (0); 96 } 97 98 ret_val = 0; 99 ib = (uchar_t *)*inbuf; 100 ob = (uchar_t *)*outbuf; 101 ibtail = ib + *inbytesleft; 102 obtail = ob + *outbytesleft; 103 104 KICONV_CHECK_UTF8_BOM(ib, ibtail); 105 106 while (ib < ibtail) { 107 sz = u8_number_of_bytes[*ib]; 108 109 /* 110 * If it is a 7-bit ASCII character, we don't need to 111 * process further and we just copy the character over. 112 * 113 * If not, we connect the chracter bytes up to four bytes, 114 * validate the bytes, and binary search for the corresponding 115 * table. If we find it from the mapping table, we put that 116 * into the output buffer; otherwise, we put a replacement 117 * character instead as a non-identical conversion. 118 */ 119 if (sz == 1) { 120 if (ob >= obtail) { 121 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 122 } 123 124 *ob++ = *ib++; 125 continue; 126 } 127 128 /* 129 * Issue EILSEQ error if the first byte is a 130 * invalid UTF-8 character leading byte. 131 */ 132 if (sz <= 0) { 133 KICONV_SET_ERRNO_AND_BREAK(EILSEQ); 134 } 135 136 /* 137 * Issue EINVAL error if input buffer has an incomplete 138 * character at the end of the buffer. 139 */ 140 if (ibtail - ib < sz) { 141 KICONV_SET_ERRNO_AND_BREAK(EINVAL); 142 } 143 144 /* 145 * We collect UTF-8 character bytes and also check if this 146 * is a valid UTF-8 character without any bogus bytes based 147 * on the latest UTF-8 binary representation. 148 */ 149 oldib = ib; 150 u8 = *ib++; 151 152 if (KICONV_IS_INVALID_UTF8_SECOND_BYTE(*ib, u8)) 153 goto ILLEGAL_CHAR_PROCESS; 154 u8 = (u8 << 8) | *ib++; 155 156 for (i = 2; i < sz; i++) { 157 if (*ib < 0x80 || *ib > 0xbf) { 158 ILLEGAL_CHAR_PROCESS: 159 *errno = EILSEQ; 160 ret_val = (size_t)-1; 161 ib = oldib; 162 goto ILLEGAL_CHAR_ERR; 163 } 164 165 u8 = (u8 << 8) | *ib++; 166 } 167 168 /* Now we have a valid UTF-8 character. */ 169 sz = ptr_utf8tocck(u8, &ib, ibtail, ob, obtail, &ret_val); 170 if (sz < 0) { 171 ib = oldib; 172 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 173 } 174 175 ob += sz; 176 } 177 178 ILLEGAL_CHAR_ERR: 179 *inbuf = (char *)ib; 180 *inbytesleft = ibtail - ib; 181 *outbuf = (char *)ob; 182 *outbytesleft = obtail - ob; 183 184 return (ret_val); 185 } 186 187 size_t 188 kiconvstr_utf8_to_cck(uchar_t *ib, size_t *inlen, uchar_t *ob, size_t *outlen, 189 int flag, int *errno, kiconv_utf8tocck_t ptr_utf8tocck) 190 { 191 uchar_t *ibtail; 192 uchar_t *obtail; 193 uchar_t *oldib; 194 size_t ret_val; 195 size_t i; /* temp variable in for loop */ 196 uint32_t u8; 197 int8_t sz; 198 boolean_t do_not_ignore_null; 199 200 ret_val = 0; 201 ibtail = ib + *inlen; 202 obtail = ob + *outlen; 203 do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0); 204 205 KICONV_CHECK_UTF8_BOM_WITHOUT_STATE(ib, ibtail); 206 207 while (ib < ibtail) { 208 if (*ib == '\0' && do_not_ignore_null) 209 break; 210 211 sz = u8_number_of_bytes[*ib]; 212 213 if (sz == 1) { 214 if (ob >= obtail) { 215 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 216 } 217 218 *ob++ = *ib++; 219 continue; 220 } 221 222 oldib = ib; 223 224 if (sz <= 0) { 225 KICONV_SET_ERRNO_WITH_FLAG(1, EILSEQ); 226 } 227 228 if (ibtail - ib < sz) { 229 if (flag & KICONV_REPLACE_INVALID) { 230 ib = ibtail; 231 goto REPLACE_INVALID; 232 } 233 234 KICONV_SET_ERRNO_AND_BREAK(EINVAL); 235 } 236 237 u8 = *ib++; 238 239 if (KICONV_IS_INVALID_UTF8_SECOND_BYTE(*ib, u8)) 240 goto ILLEGAL_CHAR_PROCESS; 241 u8 = (u8 << 8) | *ib++; 242 243 for (i = 2; i < sz; i++) { 244 if (*ib < 0x80 || *ib > 0xbf) { 245 ILLEGAL_CHAR_PROCESS: 246 if (flag & KICONV_REPLACE_INVALID) { 247 ib = oldib + sz; 248 goto REPLACE_INVALID; 249 } 250 251 *errno = EILSEQ; 252 ret_val = (size_t)-1; 253 ib = oldib; 254 goto ILLEGAL_CHAR_ERR; 255 } 256 257 u8 = (u8 << 8) | *ib++; 258 } 259 260 /* Now we get a valid character encoded in UTF-8. */ 261 sz = ptr_utf8tocck(u8, &ib, ibtail, ob, obtail, &ret_val); 262 if (sz < 0) { 263 ib = oldib; 264 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 265 } 266 267 ob += sz; 268 continue; 269 270 REPLACE_INVALID: 271 if (ob >= obtail) { 272 ib = oldib; 273 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 274 } 275 276 *ob++ = KICONV_ASCII_REPLACEMENT_CHAR; 277 ret_val++; 278 } 279 280 ILLEGAL_CHAR_ERR: 281 *inlen = ibtail - ib; 282 *outlen = obtail - ob; 283 284 return (ret_val); 285 } 286 287 /* 288 * Search key in tbl[0] <= tbl[1] <= ... <= tbl[n-1]. Return 0 if not found. 289 * tbl[0] is a special element for non-identical conversion. 290 */ 291 size_t 292 kiconv_binsearch(uint32_t key, void *tbl, size_t nitems) 293 { 294 size_t low, high, mid; 295 kiconv_table_t *table; 296 297 low = 1; 298 high = nitems - 1; 299 table = (kiconv_table_t *)tbl; 300 301 while (low <= high) { 302 mid = (low + high) / 2; 303 304 if (key < table[mid].key) 305 high = mid - 1; 306 else if (key > table[mid].key) 307 low = mid + 1; 308 else 309 return (mid); 310 } 311 312 return (0); 313 } 314