1 /* 2 * Copyright 2013 Garrett D'Amore <garrett@damore.org> 3 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 4 * Copyright (c) 2002-2004 Tim J. Robbins. All rights reserved. 5 * Copyright (c) 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * Paul Borman at Krystal Technologies. 10 * 11 * Copyright (c) 2011 The FreeBSD Foundation 12 * All rights reserved. 13 * Portions of this software were developed by David Chisnall 14 * under sponsorship from the FreeBSD Foundation. 15 * 16 * Redistribution and use in source and binary forms, with or without 17 * modification, are permitted provided that the following conditions 18 * are met: 19 * 1. Redistributions of source code must retain the above copyright 20 * notice, this list of conditions and the following disclaimer. 21 * 2. Redistributions in binary form must reproduce the above copyright 22 * notice, this list of conditions and the following disclaimer in the 23 * documentation and/or other materials provided with the distribution. 24 * 3. Neither the name of the University nor the names of its contributors 25 * may be used to endorse or promote products derived from this software 26 * without specific prior written permission. 27 * 28 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 31 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 38 * SUCH DAMAGE. 39 * 40 * @(#)euc.c 8.1 (Berkeley) 6/4/93 41 */ 42 43 #include <sys/param.h> 44 45 #include <errno.h> 46 #include <limits.h> 47 #include <runetype.h> 48 #include <stdlib.h> 49 #include <string.h> 50 #include <wchar.h> 51 #include "mblocal.h" 52 53 static size_t _EUC_mbrtowc_impl(wchar_t * __restrict, const char * __restrict, 54 size_t, mbstate_t * __restrict, uint8_t, uint8_t, uint8_t, uint8_t); 55 static size_t _EUC_wcrtomb_impl(char * __restrict, wchar_t, 56 mbstate_t * __restrict, uint8_t, uint8_t, uint8_t, uint8_t); 57 58 static size_t _EUC_CN_mbrtowc(wchar_t * __restrict, const char * __restrict, 59 size_t, mbstate_t * __restrict); 60 static size_t _EUC_JP_mbrtowc(wchar_t * __restrict, const char * __restrict, 61 size_t, mbstate_t * __restrict); 62 static size_t _EUC_KR_mbrtowc(wchar_t * __restrict, const char * __restrict, 63 size_t, mbstate_t * __restrict); 64 static size_t _EUC_TW_mbrtowc(wchar_t * __restrict, const char * __restrict, 65 size_t, mbstate_t * __restrict); 66 67 static size_t _EUC_CN_wcrtomb(char * __restrict, wchar_t, 68 mbstate_t * __restrict); 69 static size_t _EUC_JP_wcrtomb(char * __restrict, wchar_t, 70 mbstate_t * __restrict); 71 static size_t _EUC_KR_wcrtomb(char * __restrict, wchar_t, 72 mbstate_t * __restrict); 73 static size_t _EUC_TW_wcrtomb(char * __restrict, wchar_t, 74 mbstate_t * __restrict); 75 76 static size_t _EUC_CN_mbsnrtowcs(wchar_t * __restrict, 77 const char ** __restrict, size_t, size_t, 78 mbstate_t * __restrict); 79 static size_t _EUC_JP_mbsnrtowcs(wchar_t * __restrict, 80 const char ** __restrict, size_t, size_t, 81 mbstate_t * __restrict); 82 static size_t _EUC_KR_mbsnrtowcs(wchar_t * __restrict, 83 const char ** __restrict, size_t, size_t, 84 mbstate_t * __restrict); 85 static size_t _EUC_TW_mbsnrtowcs(wchar_t * __restrict, 86 const char ** __restrict, size_t, size_t, 87 mbstate_t * __restrict); 88 89 static size_t _EUC_CN_wcsnrtombs(char * __restrict, 90 const wchar_t ** __restrict, size_t, size_t, 91 mbstate_t * __restrict); 92 static size_t _EUC_JP_wcsnrtombs(char * __restrict, 93 const wchar_t ** __restrict, size_t, size_t, 94 mbstate_t * __restrict); 95 static size_t _EUC_KR_wcsnrtombs(char * __restrict, 96 const wchar_t ** __restrict, size_t, size_t, 97 mbstate_t * __restrict); 98 static size_t _EUC_TW_wcsnrtombs(char * __restrict, 99 const wchar_t ** __restrict, size_t, size_t, 100 mbstate_t * __restrict); 101 102 static int _EUC_mbsinit(const mbstate_t *); 103 104 typedef struct { 105 wchar_t ch; 106 int set; 107 int want; 108 } _EucState; 109 110 static int 111 _EUC_mbsinit(const mbstate_t *ps) 112 { 113 114 return (ps == NULL || ((const _EucState *)ps)->want == 0); 115 } 116 117 /* 118 * EUC-CN uses CS0, CS1 and CS2 (4 bytes). 119 */ 120 int 121 _EUC_CN_init(struct xlocale_ctype *l, _RuneLocale *rl) 122 { 123 l->__mbrtowc = _EUC_CN_mbrtowc; 124 l->__wcrtomb = _EUC_CN_wcrtomb; 125 l->__mbsnrtowcs = _EUC_CN_mbsnrtowcs; 126 l->__wcsnrtombs = _EUC_CN_wcsnrtombs; 127 l->__mbsinit = _EUC_mbsinit; 128 129 l->runes = rl; 130 l->__mb_cur_max = 4; 131 l->__mb_sb_limit = 256; 132 return (0); 133 } 134 135 static size_t 136 _EUC_CN_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, 137 size_t n, mbstate_t * __restrict ps) 138 { 139 return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 4, 0, 0)); 140 } 141 142 static size_t 143 _EUC_CN_mbsnrtowcs(wchar_t * __restrict dst, 144 const char ** __restrict src, 145 size_t nms, size_t len, mbstate_t * __restrict ps) 146 { 147 return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_CN_mbrtowc)); 148 } 149 150 static size_t 151 _EUC_CN_wcrtomb(char * __restrict s, wchar_t wc, 152 mbstate_t * __restrict ps) 153 { 154 return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 4, 0, 0)); 155 } 156 157 static size_t 158 _EUC_CN_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src, 159 size_t nwc, size_t len, mbstate_t * __restrict ps) 160 { 161 return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_CN_wcrtomb)); 162 } 163 164 /* 165 * EUC-KR uses only CS0 and CS1. 166 */ 167 int 168 _EUC_KR_init(struct xlocale_ctype *l, _RuneLocale *rl) 169 { 170 l->__mbrtowc = _EUC_KR_mbrtowc; 171 l->__wcrtomb = _EUC_KR_wcrtomb; 172 l->__mbsnrtowcs = _EUC_KR_mbsnrtowcs; 173 l->__wcsnrtombs = _EUC_KR_wcsnrtombs; 174 l->__mbsinit = _EUC_mbsinit; 175 176 l->runes = rl; 177 l->__mb_cur_max = 2; 178 l->__mb_sb_limit = 128; 179 return (0); 180 } 181 182 static size_t 183 _EUC_KR_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, 184 size_t n, mbstate_t * __restrict ps) 185 { 186 return (_EUC_mbrtowc_impl(pwc, s, n, ps, 0, 0, 0, 0)); 187 } 188 189 static size_t 190 _EUC_KR_mbsnrtowcs(wchar_t * __restrict dst, 191 const char ** __restrict src, 192 size_t nms, size_t len, mbstate_t * __restrict ps) 193 { 194 return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_KR_mbrtowc)); 195 } 196 197 static size_t 198 _EUC_KR_wcrtomb(char * __restrict s, wchar_t wc, 199 mbstate_t * __restrict ps) 200 { 201 return (_EUC_wcrtomb_impl(s, wc, ps, 0, 0, 0, 0)); 202 } 203 204 static size_t 205 _EUC_KR_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src, 206 size_t nwc, size_t len, mbstate_t * __restrict ps) 207 { 208 return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_KR_wcrtomb)); 209 } 210 211 /* 212 * EUC-JP uses CS0, CS1, CS2, and CS3. 213 */ 214 int 215 _EUC_JP_init(struct xlocale_ctype *l, _RuneLocale *rl) 216 { 217 l->__mbrtowc = _EUC_JP_mbrtowc; 218 l->__wcrtomb = _EUC_JP_wcrtomb; 219 l->__mbsnrtowcs = _EUC_JP_mbsnrtowcs; 220 l->__wcsnrtombs = _EUC_JP_wcsnrtombs; 221 l->__mbsinit = _EUC_mbsinit; 222 223 l->runes = rl; 224 l->__mb_cur_max = 3; 225 l->__mb_sb_limit = 196; 226 return (0); 227 } 228 229 static size_t 230 _EUC_JP_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, 231 size_t n, mbstate_t * __restrict ps) 232 { 233 return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 2, SS3, 3)); 234 } 235 236 static size_t 237 _EUC_JP_mbsnrtowcs(wchar_t * __restrict dst, 238 const char ** __restrict src, 239 size_t nms, size_t len, mbstate_t * __restrict ps) 240 { 241 return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_JP_mbrtowc)); 242 } 243 244 static size_t 245 _EUC_JP_wcrtomb(char * __restrict s, wchar_t wc, 246 mbstate_t * __restrict ps) 247 { 248 return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 2, SS3, 3)); 249 } 250 251 static size_t 252 _EUC_JP_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src, 253 size_t nwc, size_t len, mbstate_t * __restrict ps) 254 { 255 return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_JP_wcrtomb)); 256 } 257 258 /* 259 * EUC-TW uses CS0, CS1, and CS2. 260 */ 261 int 262 _EUC_TW_init(struct xlocale_ctype *l, _RuneLocale *rl) 263 { 264 l->__mbrtowc = _EUC_TW_mbrtowc; 265 l->__wcrtomb = _EUC_TW_wcrtomb; 266 l->__mbsnrtowcs = _EUC_TW_mbsnrtowcs; 267 l->__wcsnrtombs = _EUC_TW_wcsnrtombs; 268 l->__mbsinit = _EUC_mbsinit; 269 270 l->runes = rl; 271 l->__mb_cur_max = 4; 272 l->__mb_sb_limit = 256; 273 return (0); 274 } 275 276 static size_t 277 _EUC_TW_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, 278 size_t n, mbstate_t * __restrict ps) 279 { 280 return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 4, 0, 0)); 281 } 282 283 static size_t 284 _EUC_TW_mbsnrtowcs(wchar_t * __restrict dst, 285 const char ** __restrict src, 286 size_t nms, size_t len, mbstate_t * __restrict ps) 287 { 288 return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_TW_mbrtowc)); 289 } 290 291 static size_t 292 _EUC_TW_wcrtomb(char * __restrict s, wchar_t wc, 293 mbstate_t * __restrict ps) 294 { 295 return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 4, 0, 0)); 296 } 297 298 static size_t 299 _EUC_TW_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src, 300 size_t nwc, size_t len, mbstate_t * __restrict ps) 301 { 302 return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_TW_wcrtomb)); 303 } 304 305 /* 306 * Common EUC code. 307 */ 308 309 static size_t 310 _EUC_mbrtowc_impl(wchar_t * __restrict pwc, const char * __restrict s, 311 size_t n, mbstate_t * __restrict ps, 312 uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width) 313 { 314 _EucState *es; 315 int i, want; 316 wchar_t wc = 0; 317 unsigned char ch, chs; 318 319 es = (_EucState *)ps; 320 321 if (es->want < 0 || es->want > MB_CUR_MAX) { 322 errno = EINVAL; 323 return ((size_t)-1); 324 } 325 326 if (s == NULL) { 327 s = ""; 328 n = 1; 329 pwc = NULL; 330 } 331 332 if (n == 0) 333 /* Incomplete multibyte sequence */ 334 return ((size_t)-2); 335 336 if (es->want == 0) { 337 /* Fast path for plain ASCII (CS0) */ 338 if (((ch = (unsigned char)*s) & 0x80) == 0) { 339 if (pwc != NULL) 340 *pwc = ch; 341 return (ch != '\0' ? 1 : 0); 342 } 343 344 if (ch >= 0xa1) { 345 /* CS1 */ 346 want = 2; 347 } else if (ch == cs2) { 348 want = cs2width; 349 } else if (ch == cs3) { 350 want = cs3width; 351 } else { 352 errno = EILSEQ; 353 return ((size_t)-1); 354 } 355 356 357 es->want = want; 358 es->ch = 0; 359 } else { 360 want = es->want; 361 wc = es->ch; 362 } 363 364 for (i = 0; i < MIN(want, n); i++) { 365 wc <<= 8; 366 chs = *s; 367 wc |= chs; 368 s++; 369 } 370 if (i < want) { 371 /* Incomplete multibyte sequence */ 372 es->want = want - i; 373 es->ch = wc; 374 errno = EILSEQ; 375 return ((size_t)-2); 376 } 377 if (pwc != NULL) 378 *pwc = wc; 379 es->want = 0; 380 return (wc == L'\0' ? 0 : want); 381 } 382 383 static size_t 384 _EUC_wcrtomb_impl(char * __restrict s, wchar_t wc, 385 mbstate_t * __restrict ps, 386 uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width) 387 { 388 _EucState *es; 389 int i, len; 390 wchar_t nm; 391 392 es = (_EucState *)ps; 393 394 if (es->want != 0) { 395 errno = EINVAL; 396 return ((size_t)-1); 397 } 398 399 if (s == NULL) 400 /* Reset to initial shift state (no-op) */ 401 return (1); 402 403 if ((wc & ~0x7f) == 0) { 404 /* Fast path for plain ASCII (CS0) */ 405 *s = (char)wc; 406 return (1); 407 } 408 409 /* Determine the "length" */ 410 if ((unsigned)wc > 0xffffff) { 411 len = 4; 412 } else if ((unsigned)wc > 0xffff) { 413 len = 3; 414 } else if ((unsigned)wc > 0xff) { 415 len = 2; 416 } else { 417 len = 1; 418 } 419 420 if (len > MB_CUR_MAX) { 421 errno = EILSEQ; 422 return ((size_t)-1); 423 } 424 425 /* This first check excludes CS1, which is implicitly valid. */ 426 if ((wc < 0xa100) || (wc > 0xffff)) { 427 /* Check for valid CS2 or CS3 */ 428 nm = (wc >> ((len - 1) * 8)); 429 if (nm == cs2) { 430 if (len != cs2width) { 431 errno = EILSEQ; 432 return ((size_t)-1); 433 } 434 } else if (nm == cs3) { 435 if (len != cs3width) { 436 errno = EILSEQ; 437 return ((size_t)-1); 438 } 439 } else { 440 errno = EILSEQ; 441 return ((size_t)-1); 442 } 443 } 444 445 /* Stash the bytes, least significant last */ 446 for (i = len - 1; i >= 0; i--) { 447 s[i] = (wc & 0xff); 448 wc >>= 8; 449 } 450 return (len); 451 } 452