1 /* 2 * Copyright 2015 Matthew Dillon <dillon@backplane.com> (mbintowcr, wcrtombin) 3 * Copyright 2013 Garrett D'Amore <garrett@damore.org> 4 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 5 * Copyright (c) 2002-2004 Tim J. Robbins 6 * All rights reserved. 7 * 8 * Copyright (c) 2011 The FreeBSD Foundation 9 * All rights reserved. 10 * Portions of this software were developed by David Chisnall 11 * under sponsorship from the FreeBSD Foundation. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 /* 36 * WCSBIN_EOF - Indicate EOF on input buffer. 37 * 38 * WCSBIN_SURRO - Pass-through surrogate space (typically if the UTF-8 39 * has already been escaped), on bytes-to-wchars and 40 * wchars-to-bytes. Escaping of other illegal codes will 41 * still occur on input but de-escaping will not occur 42 * on output (they will remain in the surrogate space). 43 * 44 * WCSBIN_LONGCODES - Allow 4-byte >= 0x10FFFF, 5-byte and 6-byte sequences 45 * (normally illegal), otherwise escape it on input 46 * and fail on output. 47 * 48 * WCSBIN_STRICT - Allow byte-to-wide conversions to fail. 49 */ 50 51 #include <sys/param.h> 52 53 #include <errno.h> 54 #include <limits.h> 55 #include <runetype.h> 56 #include <stdlib.h> 57 #include <string.h> 58 #include <wchar.h> 59 #include "mblocal.h" 60 61 static size_t _UTF8_mbrtowc(wchar_t * __restrict, const char * __restrict, 62 size_t, mbstate_t * __restrict); 63 static int _UTF8_mbsinit(const mbstate_t *); 64 static size_t _UTF8_mbsnrtowcs(wchar_t * __restrict, 65 const char ** __restrict, size_t, size_t, 66 mbstate_t * __restrict); 67 static size_t _UTF8_wcrtomb(char * __restrict, wchar_t, 68 mbstate_t * __restrict); 69 static size_t _UTF8_wcsnrtombs(char * __restrict, const wchar_t ** __restrict, 70 size_t, size_t, mbstate_t * __restrict); 71 static size_t _UTF8_mbintowcr(wchar_t * __restrict dst, 72 const char * __restrict src, 73 size_t dlen, size_t *slen, int flags); 74 static size_t _UTF8_wcrtombin(char * __restrict dst, 75 const wchar_t * __restrict src, 76 size_t dlen, size_t *slen, int flags); 77 78 typedef struct { 79 wchar_t ch; 80 int want; 81 wchar_t lbound; 82 } _UTF8State; 83 84 int 85 _UTF8_init(struct xlocale_ctype *l, _RuneLocale *rl) 86 { 87 88 l->__mbrtowc = _UTF8_mbrtowc; 89 l->__wcrtomb = _UTF8_wcrtomb; 90 l->__mbsinit = _UTF8_mbsinit; 91 l->__mbsnrtowcs = _UTF8_mbsnrtowcs; 92 l->__wcsnrtombs = _UTF8_wcsnrtombs; 93 l->__mbintowcr = _UTF8_mbintowcr; 94 l->__wcrtombin = _UTF8_wcrtombin; 95 l->runes = rl; 96 l->__mb_cur_max = 4; 97 /* 98 * UCS-4 encoding used as the internal representation, so 99 * slots 0x0080-0x00FF are occuped and must be excluded 100 * from the single byte ctype by setting the limit. 101 */ 102 l->__mb_sb_limit = 128; 103 104 return (0); 105 } 106 107 static int 108 _UTF8_mbsinit(const mbstate_t *ps) 109 { 110 111 return (ps == NULL || ((const _UTF8State *)ps)->want == 0); 112 } 113 114 static size_t 115 _UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n, 116 mbstate_t * __restrict ps) 117 { 118 _UTF8State *us; 119 int ch, i, mask, want; 120 wchar_t lbound, wch; 121 122 us = (_UTF8State *)ps; 123 124 if (us->want < 0 || us->want > 4) { 125 errno = EINVAL; 126 return ((size_t)-1); 127 } 128 129 if (s == NULL) { 130 s = ""; 131 n = 1; 132 pwc = NULL; 133 } 134 135 if (n == 0) 136 /* Incomplete multibyte sequence */ 137 return ((size_t)-2); 138 139 if (us->want == 0) { 140 /* 141 * Determine the number of octets that make up this character 142 * from the first octet, and a mask that extracts the 143 * interesting bits of the first octet. We already know 144 * the character is at least two bytes long. 145 * 146 * We also specify a lower bound for the character code to 147 * detect redundant, non-"shortest form" encodings. For 148 * example, the sequence C0 80 is _not_ a legal representation 149 * of the null character. This enforces a 1-to-1 mapping 150 * between character codes and their multibyte representations. 151 */ 152 ch = (unsigned char)*s; 153 if ((ch & 0x80) == 0) { 154 /* Fast path for plain ASCII characters. */ 155 if (pwc != NULL) 156 *pwc = ch; 157 return (ch != '\0' ? 1 : 0); 158 } 159 if ((ch & 0xe0) == 0xc0) { 160 mask = 0x1f; 161 want = 2; 162 lbound = 0x80; 163 } else if ((ch & 0xf0) == 0xe0) { 164 mask = 0x0f; 165 want = 3; 166 lbound = 0x800; 167 } else if ((ch & 0xf8) == 0xf0) { 168 mask = 0x07; 169 want = 4; 170 lbound = 0x10000; 171 } else { 172 /* 173 * Malformed input; input is not UTF-8. 174 */ 175 errno = EILSEQ; 176 return ((size_t)-1); 177 } 178 } else { 179 want = us->want; 180 lbound = us->lbound; 181 } 182 183 /* 184 * Decode the octet sequence representing the character in chunks 185 * of 6 bits, most significant first. 186 */ 187 if (us->want == 0) 188 wch = (unsigned char)*s++ & mask; 189 else 190 wch = us->ch; 191 192 for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) { 193 if ((*s & 0xc0) != 0x80) { 194 /* 195 * Malformed input; bad characters in the middle 196 * of a character. 197 */ 198 errno = EILSEQ; 199 return ((size_t)-1); 200 } 201 wch <<= 6; 202 wch |= *s++ & 0x3f; 203 } 204 if (i < want) { 205 /* Incomplete multibyte sequence. */ 206 us->want = want - i; 207 us->lbound = lbound; 208 us->ch = wch; 209 return ((size_t)-2); 210 } 211 if (wch < lbound || wch > 0x10ffff) { 212 /* 213 * Malformed input; redundant encoding or illegal 214 * code sequence. 215 */ 216 errno = EILSEQ; 217 return ((size_t)-1); 218 } 219 if (pwc != NULL) 220 *pwc = wch; 221 us->want = 0; 222 return (wch == L'\0' ? 0 : want); 223 } 224 225 static size_t 226 _UTF8_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src, 227 size_t nms, size_t len, mbstate_t * __restrict ps) 228 { 229 _UTF8State *us; 230 const char *s; 231 size_t nchr; 232 wchar_t wc; 233 size_t nb; 234 235 us = (_UTF8State *)ps; 236 237 s = *src; 238 nchr = 0; 239 240 if (dst == NULL) { 241 /* 242 * The fast path in the loop below is not safe if an ASCII 243 * character appears as anything but the first byte of a 244 * multibyte sequence. Check now to avoid doing it in the loop. 245 */ 246 if (nms > 0 && us->want > 0 && (signed char)*s > 0) { 247 errno = EILSEQ; 248 return ((size_t)-1); 249 } 250 for (;;) { 251 if (nms > 0 && (signed char)*s > 0) 252 /* 253 * Fast path for plain ASCII characters 254 * excluding NUL. 255 */ 256 nb = 1; 257 else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps)) == 258 (size_t)-1) 259 /* Invalid sequence - mbrtowc() sets errno. */ 260 return ((size_t)-1); 261 else if (nb == 0 || nb == (size_t)-2) 262 return (nchr); 263 s += nb; 264 nms -= nb; 265 nchr++; 266 } 267 /*NOTREACHED*/ 268 } 269 270 /* 271 * The fast path in the loop below is not safe if an ASCII 272 * character appears as anything but the first byte of a 273 * multibyte sequence. Check now to avoid doing it in the loop. 274 */ 275 if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) { 276 errno = EILSEQ; 277 return ((size_t)-1); 278 } 279 while (len-- > 0) { 280 if (nms > 0 && (signed char)*s > 0) { 281 /* 282 * Fast path for plain ASCII characters 283 * excluding NUL. 284 */ 285 *dst = (wchar_t)*s; 286 nb = 1; 287 } else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps)) == 288 (size_t)-1) { 289 *src = s; 290 return ((size_t)-1); 291 } else if (nb == (size_t)-2) { 292 *src = s + nms; 293 return (nchr); 294 } else if (nb == 0) { 295 *src = NULL; 296 return (nchr); 297 } 298 s += nb; 299 nms -= nb; 300 nchr++; 301 dst++; 302 } 303 *src = s; 304 return (nchr); 305 } 306 307 static size_t 308 _UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps) 309 { 310 _UTF8State *us; 311 unsigned char lead; 312 int i, len; 313 314 us = (_UTF8State *)ps; 315 316 if (us->want != 0) { 317 errno = EINVAL; 318 return ((size_t)-1); 319 } 320 321 if (s == NULL) 322 /* Reset to initial shift state (no-op) */ 323 return (1); 324 325 /* 326 * Determine the number of octets needed to represent this character. 327 * We always output the shortest sequence possible. Also specify the 328 * first few bits of the first octet, which contains the information 329 * about the sequence length. 330 */ 331 if ((wc & ~0x7f) == 0) { 332 /* Fast path for plain ASCII characters. */ 333 *s = (char)wc; 334 return (1); 335 } else if ((wc & ~0x7ff) == 0) { 336 lead = 0xc0; 337 len = 2; 338 } else if ((wc & ~0xffff) == 0) { 339 lead = 0xe0; 340 len = 3; 341 } else if (wc <= 0x10ffff) { 342 lead = 0xf0; 343 len = 4; 344 } else { 345 errno = EILSEQ; 346 return ((size_t)-1); 347 } 348 349 /* 350 * Output the octets representing the character in chunks 351 * of 6 bits, least significant last. The first octet is 352 * a special case because it contains the sequence length 353 * information. 354 */ 355 for (i = len - 1; i > 0; i--) { 356 s[i] = (wc & 0x3f) | 0x80; 357 wc >>= 6; 358 } 359 *s = (wc & 0xff) | lead; 360 361 return (len); 362 } 363 364 static size_t 365 _UTF8_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src, 366 size_t nwc, size_t len, mbstate_t * __restrict ps) 367 { 368 _UTF8State *us; 369 char buf[MB_LEN_MAX]; 370 const wchar_t *s; 371 size_t nbytes; 372 size_t nb; 373 374 us = (_UTF8State *)ps; 375 376 if (us->want != 0) { 377 errno = EINVAL; 378 return ((size_t)-1); 379 } 380 381 s = *src; 382 nbytes = 0; 383 384 if (dst == NULL) { 385 while (nwc-- > 0) { 386 if (0 <= *s && *s < 0x80) 387 /* Fast path for plain ASCII characters. */ 388 nb = 1; 389 else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == 390 (size_t)-1) 391 /* Invalid character - wcrtomb() sets errno. */ 392 return ((size_t)-1); 393 if (*s == L'\0') 394 return (nbytes + nb - 1); 395 s++; 396 nbytes += nb; 397 } 398 return (nbytes); 399 } 400 401 while (len > 0 && nwc-- > 0) { 402 if (0 <= *s && *s < 0x80) { 403 /* Fast path for plain ASCII characters. */ 404 nb = 1; 405 *dst = *s; 406 } else if (len > (size_t)MB_CUR_MAX) { 407 /* Enough space to translate in-place. */ 408 if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) { 409 *src = s; 410 return ((size_t)-1); 411 } 412 } else { 413 /* 414 * May not be enough space; use temp. buffer. 415 */ 416 if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) { 417 *src = s; 418 return ((size_t)-1); 419 } 420 if (nb > (int)len) 421 /* MB sequence for character won't fit. */ 422 break; 423 (void) memcpy(dst, buf, nb); 424 } 425 if (*s == L'\0') { 426 *src = NULL; 427 return (nbytes + nb - 1); 428 } 429 s++; 430 dst += nb; 431 len -= nb; 432 nbytes += nb; 433 } 434 *src = s; 435 return (nbytes); 436 } 437 438 /* 439 * Clean binary to wchar buffer conversions. This is basically like a normal 440 * buffer conversion but with a sane argument API and escaping. See none.c 441 * for a more complete description. 442 */ 443 static size_t 444 _UTF8_mbintowcr(wchar_t * __restrict dst, const char * __restrict src, 445 size_t dlen, size_t *slen, int flags) 446 { 447 size_t i; 448 size_t j; 449 size_t k; 450 size_t n = *slen; 451 int ch, mask, want; 452 wchar_t lbound, wch; 453 454 for (i = j = 0; i < n; ++i) { 455 if (j == dlen) 456 break; 457 ch = (unsigned char)src[i]; 458 459 if ((ch & 0x80) == 0) { 460 /* Fast path for plain ASCII characters. */ 461 if (dst) 462 dst[j] = ch; 463 ++j; 464 continue; 465 } 466 if ((ch & 0xe0) == 0xc0) { 467 mask = 0x1f; 468 want = 2; 469 lbound = 0x80; 470 } else if ((ch & 0xf0) == 0xe0) { 471 mask = 0x0f; 472 want = 3; 473 lbound = 0x800; 474 } else if ((ch & 0xf8) == 0xf0) { 475 mask = 0x07; 476 want = 4; 477 lbound = 0x10000; 478 } else if ((ch & 0xfc) == 0xf8) { 479 /* normally illegal, handled down below */ 480 mask = 0x03; 481 want = 5; 482 lbound = 0x200000; 483 } else if ((ch & 0xfe) == 0xfc) { 484 /* normally illegal, handled down below */ 485 mask = 0x01; 486 want = 6; 487 lbound = 0x4000000; 488 } else { 489 /* 490 * Malformed input; input is not UTF-8, escape 491 * with UTF-8B. 492 */ 493 if (flags & WCSBIN_STRICT) { 494 if (i == 0) { 495 errno = EILSEQ; 496 return ((size_t)-1); 497 } 498 break; 499 } 500 if (dst) 501 dst[j] = 0xDC00 | ch; 502 ++j; 503 continue; 504 } 505 506 /* 507 * Construct wchar_t from multibyte sequence. 508 */ 509 wch = ch & mask; 510 for (k = 1; k < want; ++k) { 511 /* 512 * Stop if not enough input (don't do this early 513 * so we can detect illegal characters as they occur 514 * in the stream). 515 * 516 * If termination is requested force-escape all chars. 517 */ 518 if (i + k >= n) { 519 if (flags & WCSBIN_EOF) { 520 want = n - i; 521 goto forceesc; 522 } 523 goto breakout; 524 } 525 526 ch = src[i+k]; 527 if ((ch & 0xc0) != 0x80) { 528 /* 529 * Malformed input, bad characters in the 530 * middle of a multibyte sequence. Escape 531 * with UTF-8B. 532 */ 533 if (flags & WCSBIN_STRICT) { 534 if (i == 0) { 535 errno = EILSEQ; 536 return ((size_t)-1); 537 } 538 goto breakout; 539 } 540 if (dst) 541 dst[j] = 0xDC00 | (unsigned char)src[i]; 542 ++j; 543 goto loopup; 544 } 545 wch <<= 6; 546 wch |= ch & 0x3f; 547 } 548 549 /* 550 * Check validity of the wchar. If invalid we could escape 551 * just the first character and loop up, but it ought to be 552 * more readable if we escape all the chars in the sequence 553 * (since they are all >= 0x80 and might represent a legacy 554 * 5-byte or 6-byte code). 555 */ 556 if (wch < lbound || 557 ((flags & WCSBIN_LONGCODES) == 0 && wch > 0x10ffff)) { 558 goto forceesc; 559 } 560 561 /* 562 * Check if wch is a surrogate code (which also encloses our 563 * UTF-8B escaping range). This is normally illegal in UTF8. 564 * If it is, we need to escape each characer in the sequence. 565 * Breakout if there isn't enough output buffer space. 566 * 567 * If (flags & WCSBIN_SURRO) the caller wishes to accept 568 * surrogate codes, i.e. the input might potentially already 569 * be escaped UTF8-B or unchecked UTF-16 that was converted 570 * into UTF-8. 571 */ 572 if ((flags & WCSBIN_SURRO) == 0 && 573 wch >= 0xD800 && wch <= 0xDFFF) { 574 forceesc: 575 if (j + want > dlen) 576 break; 577 if (flags & WCSBIN_STRICT) { 578 if (i == 0) { 579 errno = EILSEQ; 580 return ((size_t)-1); 581 } 582 break; 583 } 584 for (k = 0; k < want; ++k) { 585 if (dst) { 586 dst[j] = 0xDC00 | 587 (unsigned char)src[i+k]; 588 } 589 ++j; 590 } 591 i += k - 1; 592 } else { 593 i += k - 1; 594 if (dst) 595 dst[j] = wch; 596 ++j; 597 } 598 loopup: 599 ; 600 } 601 breakout: 602 *slen = i; 603 604 return j; 605 } 606 607 static size_t 608 _UTF8_wcrtombin(char * __restrict dst, const wchar_t * __restrict src, 609 size_t dlen, size_t *slen, int flags) 610 { 611 size_t i; 612 size_t j; 613 size_t k; 614 size_t n = *slen; 615 size_t len; 616 unsigned char lead; 617 wchar_t wc; 618 619 for (i = j = 0; i < n; ++i) { 620 if (j == dlen) 621 break; 622 wc = src[i]; 623 624 if ((wc & ~0x7f) == 0) { 625 /* Fast path for plain ASCII characters. */ 626 if (dst) 627 dst[j] = (unsigned char)wc; 628 ++j; 629 continue; 630 } 631 if ((wc & ~0x7ff) == 0) { 632 lead = 0xc0; 633 len = 2; 634 } else if (wc >= 0xDC80 && wc <= 0xDCFF && 635 (flags & WCSBIN_SURRO) == 0) { 636 if (flags & WCSBIN_STRICT) { 637 /* 638 * STRICT without SURRO is an error for 639 * surrogates. 640 */ 641 if (i == 0) { 642 errno = EILSEQ; 643 return ((size_t)-1); 644 } 645 break; 646 } 647 if (dst) 648 dst[j] = (unsigned char)wc; 649 ++j; 650 continue; 651 } else if ((wc & ~0xffff) == 0) { 652 if (wc >= 0xD800 && wc <= 0xDFFF && 653 (flags & (WCSBIN_SURRO | WCSBIN_STRICT)) == 654 WCSBIN_STRICT) { 655 /* 656 * Surrogates in general are an error 657 * if STRICT is specified and SURRO is not 658 * specified. 659 */ 660 if (i == 0) { 661 errno = EILSEQ; 662 return ((size_t)-1); 663 } 664 break; 665 } 666 lead = 0xe0; 667 len = 3; 668 } else if (wc <= 0x10ffff) { 669 lead = 0xf0; 670 len = 4; 671 } else if ((flags & WCSBIN_LONGCODES) && wc < 0x200000) { 672 /* normally illegal */ 673 lead = 0xf0; 674 len = 4; 675 } else if ((flags & WCSBIN_LONGCODES) && wc < 0x4000000) { 676 /* normally illegal */ 677 lead = 0xf8; 678 len = 5; 679 } else if ((flags & WCSBIN_LONGCODES) && 680 (uint32_t)wc < 0x80000000U) { 681 /* normally illegal */ 682 lead = 0xfc; 683 len = 6; 684 } else { 685 if (i == 0) { 686 errno = EILSEQ; 687 return ((size_t)-1); 688 } 689 /* stop here, process error on next loop */ 690 break; 691 } 692 693 /* 694 * Output the octets representing the character in chunks 695 * of 6 bits, least significant last. The first octet is 696 * a special case because it contains the sequence length 697 * information. 698 */ 699 if (j + len > dlen) 700 break; 701 k = j; 702 j += len; 703 if (dst) { 704 while (--len > 0) { 705 dst[k + len] = (wc & 0x3f) | 0x80; 706 wc >>= 6; 707 } 708 dst[k] = (wc & 0xff) | lead; 709 } 710 } 711 *slen = i; 712 713 return j; 714 } 715 716 size_t 717 utf8towcr(wchar_t * __restrict dst, const char * __restrict src, 718 size_t dlen, size_t *slen, int flags) 719 { 720 return _UTF8_mbintowcr(dst, src, dlen, slen, flags); 721 } 722 723 size_t 724 wcrtoutf8(char * __restrict dst, const wchar_t * __restrict src, 725 size_t dlen, size_t *slen, int flags) 726 { 727 return _UTF8_wcrtombin(dst, src, dlen, slen, flags); 728 } 729