1 /* 2 * Copyright 2013 Garrett D'Amore <garrett@damore.org> 3 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 4 * Copyright (c) 2002-2004 Tim J. Robbins. All rights reserved. 5 * Copyright (c) 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * Paul Borman at Krystal Technologies. 10 * 11 * Copyright (c) 2011 The FreeBSD Foundation 12 * All rights reserved. 13 * Portions of this software were developed by David Chisnall 14 * under sponsorship from the FreeBSD Foundation. 15 * 16 * Redistribution and use in source and binary forms, with or without 17 * modification, are permitted provided that the following conditions 18 * are met: 19 * 1. Redistributions of source code must retain the above copyright 20 * notice, this list of conditions and the following disclaimer. 21 * 2. Redistributions in binary form must reproduce the above copyright 22 * notice, this list of conditions and the following disclaimer in the 23 * documentation and/or other materials provided with the distribution. 24 * 3. Neither the name of the University nor the names of its contributors 25 * may be used to endorse or promote products derived from this software 26 * without specific prior written permission. 27 * 28 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 31 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 38 * SUCH DAMAGE. 39 * 40 * @(#)euc.c 8.1 (Berkeley) 6/4/93 41 */ 42 43 #include <sys/param.h> 44 45 #include <errno.h> 46 #include <limits.h> 47 #include <runetype.h> 48 #include <stdlib.h> 49 #include <string.h> 50 #include <wchar.h> 51 #include "mblocal.h" 52 53 extern int __mb_sb_limit; 54 55 static size_t _EUC_mbrtowc_impl(wchar_t * __restrict, const char * __restrict, 56 size_t, mbstate_t * __restrict, uint8_t, uint8_t, uint8_t, uint8_t); 57 static size_t _EUC_wcrtomb_impl(char * __restrict, wchar_t, 58 mbstate_t * __restrict, uint8_t, uint8_t, uint8_t, uint8_t); 59 60 static size_t _EUC_CN_mbrtowc(wchar_t * __restrict, const char * __restrict, 61 size_t, mbstate_t * __restrict); 62 static size_t _EUC_JP_mbrtowc(wchar_t * __restrict, const char * __restrict, 63 size_t, mbstate_t * __restrict); 64 static size_t _EUC_KR_mbrtowc(wchar_t * __restrict, const char * __restrict, 65 size_t, mbstate_t * __restrict); 66 static size_t _EUC_TW_mbrtowc(wchar_t * __restrict, const char * __restrict, 67 size_t, mbstate_t * __restrict); 68 69 static size_t _EUC_CN_wcrtomb(char * __restrict, wchar_t, 70 mbstate_t * __restrict); 71 static size_t _EUC_JP_wcrtomb(char * __restrict, wchar_t, 72 mbstate_t * __restrict); 73 static size_t _EUC_KR_wcrtomb(char * __restrict, wchar_t, 74 mbstate_t * __restrict); 75 static size_t _EUC_TW_wcrtomb(char * __restrict, wchar_t, 76 mbstate_t * __restrict); 77 78 static size_t _EUC_CN_mbsnrtowcs(wchar_t * __restrict, 79 const char ** __restrict, size_t, size_t, 80 mbstate_t * __restrict); 81 static size_t _EUC_JP_mbsnrtowcs(wchar_t * __restrict, 82 const char ** __restrict, size_t, size_t, 83 mbstate_t * __restrict); 84 static size_t _EUC_KR_mbsnrtowcs(wchar_t * __restrict, 85 const char ** __restrict, size_t, size_t, 86 mbstate_t * __restrict); 87 static size_t _EUC_TW_mbsnrtowcs(wchar_t * __restrict, 88 const char ** __restrict, size_t, size_t, 89 mbstate_t * __restrict); 90 91 static size_t _EUC_CN_wcsnrtombs(char * __restrict, 92 const wchar_t ** __restrict, size_t, size_t, 93 mbstate_t * __restrict); 94 static size_t _EUC_JP_wcsnrtombs(char * __restrict, 95 const wchar_t ** __restrict, size_t, size_t, 96 mbstate_t * __restrict); 97 static size_t _EUC_KR_wcsnrtombs(char * __restrict, 98 const wchar_t ** __restrict, size_t, size_t, 99 mbstate_t * __restrict); 100 static size_t _EUC_TW_wcsnrtombs(char * __restrict, 101 const wchar_t ** __restrict, size_t, size_t, 102 mbstate_t * __restrict); 103 104 static int _EUC_mbsinit(const mbstate_t *); 105 106 typedef struct { 107 wchar_t ch; 108 int set; 109 int want; 110 } _EucState; 111 112 static int 113 _EUC_mbsinit(const mbstate_t *ps) 114 { 115 116 return (ps == NULL || ((const _EucState *)ps)->want == 0); 117 } 118 119 /* 120 * EUC-CN uses CS0, CS1 and CS2 (4 bytes). 121 */ 122 int 123 _EUC_CN_init(struct xlocale_ctype *l, _RuneLocale *rl) 124 { 125 l->__mbrtowc = _EUC_CN_mbrtowc; 126 l->__wcrtomb = _EUC_CN_wcrtomb; 127 l->__mbsnrtowcs = _EUC_CN_mbsnrtowcs; 128 l->__wcsnrtombs = _EUC_CN_wcsnrtombs; 129 l->__mbsinit = _EUC_mbsinit; 130 131 l->runes = rl; 132 l->__mb_cur_max = 4; 133 l->__mb_sb_limit = 256; 134 return (0); 135 } 136 137 static size_t 138 _EUC_CN_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, 139 size_t n, mbstate_t * __restrict ps) 140 { 141 return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 4, 0, 0)); 142 } 143 144 static size_t 145 _EUC_CN_mbsnrtowcs(wchar_t * __restrict dst, 146 const char ** __restrict src, 147 size_t nms, size_t len, mbstate_t * __restrict ps) 148 { 149 return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_CN_mbrtowc)); 150 } 151 152 static size_t 153 _EUC_CN_wcrtomb(char * __restrict s, wchar_t wc, 154 mbstate_t * __restrict ps) 155 { 156 return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 4, 0, 0)); 157 } 158 159 static size_t 160 _EUC_CN_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src, 161 size_t nwc, size_t len, mbstate_t * __restrict ps) 162 { 163 return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_CN_wcrtomb)); 164 } 165 166 /* 167 * EUC-KR uses only CS0 and CS1. 168 */ 169 int 170 _EUC_KR_init(struct xlocale_ctype *l, _RuneLocale *rl) 171 { 172 l->__mbrtowc = _EUC_KR_mbrtowc; 173 l->__wcrtomb = _EUC_KR_wcrtomb; 174 l->__mbsnrtowcs = _EUC_KR_mbsnrtowcs; 175 l->__wcsnrtombs = _EUC_KR_wcsnrtombs; 176 l->__mbsinit = _EUC_mbsinit; 177 178 l->runes = rl; 179 l->__mb_cur_max = 2; 180 l->__mb_sb_limit = 128; 181 return (0); 182 } 183 184 static size_t 185 _EUC_KR_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, 186 size_t n, mbstate_t * __restrict ps) 187 { 188 return (_EUC_mbrtowc_impl(pwc, s, n, ps, 0, 0, 0, 0)); 189 } 190 191 static size_t 192 _EUC_KR_mbsnrtowcs(wchar_t * __restrict dst, 193 const char ** __restrict src, 194 size_t nms, size_t len, mbstate_t * __restrict ps) 195 { 196 return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_KR_mbrtowc)); 197 } 198 199 static size_t 200 _EUC_KR_wcrtomb(char * __restrict s, wchar_t wc, 201 mbstate_t * __restrict ps) 202 { 203 return (_EUC_wcrtomb_impl(s, wc, ps, 0, 0, 0, 0)); 204 } 205 206 static size_t 207 _EUC_KR_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src, 208 size_t nwc, size_t len, mbstate_t * __restrict ps) 209 { 210 return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_KR_wcrtomb)); 211 } 212 213 /* 214 * EUC-JP uses CS0, CS1, CS2, and CS3. 215 */ 216 int 217 _EUC_JP_init(struct xlocale_ctype *l, _RuneLocale *rl) 218 { 219 l->__mbrtowc = _EUC_JP_mbrtowc; 220 l->__wcrtomb = _EUC_JP_wcrtomb; 221 l->__mbsnrtowcs = _EUC_JP_mbsnrtowcs; 222 l->__wcsnrtombs = _EUC_JP_wcsnrtombs; 223 l->__mbsinit = _EUC_mbsinit; 224 225 l->runes = rl; 226 l->__mb_cur_max = 3; 227 l->__mb_sb_limit = 196; 228 return (0); 229 } 230 231 static size_t 232 _EUC_JP_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, 233 size_t n, mbstate_t * __restrict ps) 234 { 235 return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 2, SS3, 3)); 236 } 237 238 static size_t 239 _EUC_JP_mbsnrtowcs(wchar_t * __restrict dst, 240 const char ** __restrict src, 241 size_t nms, size_t len, mbstate_t * __restrict ps) 242 { 243 return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_JP_mbrtowc)); 244 } 245 246 static size_t 247 _EUC_JP_wcrtomb(char * __restrict s, wchar_t wc, 248 mbstate_t * __restrict ps) 249 { 250 return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 2, SS3, 3)); 251 } 252 253 static size_t 254 _EUC_JP_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src, 255 size_t nwc, size_t len, mbstate_t * __restrict ps) 256 { 257 return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_JP_wcrtomb)); 258 } 259 260 /* 261 * EUC-TW uses CS0, CS1, and CS2. 262 */ 263 int 264 _EUC_TW_init(struct xlocale_ctype *l, _RuneLocale *rl) 265 { 266 l->__mbrtowc = _EUC_TW_mbrtowc; 267 l->__wcrtomb = _EUC_TW_wcrtomb; 268 l->__mbsnrtowcs = _EUC_TW_mbsnrtowcs; 269 l->__wcsnrtombs = _EUC_TW_wcsnrtombs; 270 l->__mbsinit = _EUC_mbsinit; 271 272 l->runes = rl; 273 l->__mb_cur_max = 4; 274 l->__mb_sb_limit = 256; 275 return (0); 276 } 277 278 static size_t 279 _EUC_TW_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, 280 size_t n, mbstate_t * __restrict ps) 281 { 282 return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 4, 0, 0)); 283 } 284 285 static size_t 286 _EUC_TW_mbsnrtowcs(wchar_t * __restrict dst, 287 const char ** __restrict src, 288 size_t nms, size_t len, mbstate_t * __restrict ps) 289 { 290 return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_TW_mbrtowc)); 291 } 292 293 static size_t 294 _EUC_TW_wcrtomb(char * __restrict s, wchar_t wc, 295 mbstate_t * __restrict ps) 296 { 297 return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 4, 0, 0)); 298 } 299 300 static size_t 301 _EUC_TW_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src, 302 size_t nwc, size_t len, mbstate_t * __restrict ps) 303 { 304 return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_TW_wcrtomb)); 305 } 306 307 /* 308 * Common EUC code. 309 */ 310 311 static size_t 312 _EUC_mbrtowc_impl(wchar_t * __restrict pwc, const char * __restrict s, 313 size_t n, mbstate_t * __restrict ps, 314 uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width) 315 { 316 _EucState *es; 317 int i, want; 318 wchar_t wc; 319 unsigned char ch; 320 321 es = (_EucState *)ps; 322 323 if (es->want < 0 || es->want > MB_CUR_MAX) { 324 errno = EINVAL; 325 return ((size_t)-1); 326 } 327 328 if (s == NULL) { 329 s = ""; 330 n = 1; 331 pwc = NULL; 332 } 333 334 if (n == 0) 335 /* Incomplete multibyte sequence */ 336 return ((size_t)-2); 337 338 if (es->want == 0) { 339 /* Fast path for plain ASCII (CS0) */ 340 if (((ch = (unsigned char)*s) & 0x80) == 0) { 341 if (pwc != NULL) 342 *pwc = ch; 343 return (ch != '\0' ? 1 : 0); 344 } 345 346 if (ch >= 0xa1) { 347 /* CS1 */ 348 want = 2; 349 } else if (ch == cs2) { 350 want = cs2width; 351 } else if (ch == cs3) { 352 want = cs3width; 353 } else { 354 errno = EILSEQ; 355 return ((size_t)-1); 356 } 357 358 359 es->want = want; 360 es->ch = 0; 361 } else { 362 want = es->want; 363 wc = es->ch; 364 } 365 366 for (i = 0; i < MIN(want, n); i++) { 367 wc <<= 8; 368 wc |= *s; 369 s++; 370 } 371 if (i < want) { 372 /* Incomplete multibyte sequence */ 373 es->want = want - i; 374 es->ch = wc; 375 return ((size_t)-2); 376 } 377 if (pwc != NULL) 378 *pwc = wc; 379 es->want = 0; 380 return (wc == L'\0' ? 0 : want); 381 } 382 383 static size_t 384 _EUC_wcrtomb_impl(char * __restrict s, wchar_t wc, 385 mbstate_t * __restrict ps, 386 uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width) 387 { 388 _EucState *es; 389 int i, len; 390 wchar_t nm; 391 392 es = (_EucState *)ps; 393 394 if (es->want != 0) { 395 errno = EINVAL; 396 return ((size_t)-1); 397 } 398 399 if (s == NULL) 400 /* Reset to initial shift state (no-op) */ 401 return (1); 402 403 if ((wc & ~0x7f) == 0) { 404 /* Fast path for plain ASCII (CS0) */ 405 *s = (char)wc; 406 return (1); 407 } 408 409 /* Determine the "length" */ 410 if ((unsigned)wc > 0xffffff) { 411 len = 4; 412 } else if ((unsigned)wc > 0xffff) { 413 len = 3; 414 } else if ((unsigned)wc > 0xff) { 415 len = 2; 416 } else { 417 len = 1; 418 } 419 420 if (len > MB_CUR_MAX) { 421 errno = EILSEQ; 422 return ((size_t)-1); 423 } 424 425 /* This first check excludes CS1, which is implicitly valid. */ 426 if ((wc < 0xa100) || (wc > 0xffff)) { 427 /* Check for valid CS2 or CS3 */ 428 nm = (wc >> ((len - 1) * 8)); 429 if (nm == cs2) { 430 if (len != cs2width) { 431 errno = EILSEQ; 432 return ((size_t)-1); 433 } 434 } else if (nm == cs3) { 435 if (len != cs3width) { 436 errno = EILSEQ; 437 return ((size_t)-1); 438 } 439 } else { 440 errno = EILSEQ; 441 return ((size_t)-1); 442 } 443 } 444 445 /* Stash the bytes, least significant last */ 446 for (i = len - 1; i >= 0; i--) { 447 s[i] = (wc & 0xff); 448 wc >>= 8; 449 } 450 return (len); 451 } 452