1 /* 2 * Copyright 2015 Matthew Dillon <dillon@backplane.com> (mbintowcr, wcrtombin) 3 * Copyright 2013 Garrett D'Amore <garrett@damore.org> 4 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 5 * Copyright (c) 2002-2004 Tim J. Robbins 6 * All rights reserved. 7 * 8 * Copyright (c) 2011 The FreeBSD Foundation 9 * All rights reserved. 10 * Portions of this software were developed by David Chisnall 11 * under sponsorship from the FreeBSD Foundation. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 /* 36 * WCSBIN_EOF - Indicate EOF on input buffer. 37 * 38 * WCSBIN_SURRO - Pass-through surrogate space (typically if the UTF-8 39 * has already been escaped), on bytes-to-wchars and 40 * wchars-to-bytes. Escaping of other illegal codes will 41 * still occur on input but de-escaping will not occur 42 * on output (they will remain in the surrogate space). 43 * 44 * WCSBIN_LONGCODES - Allow 4-byte >= 0x10FFFF, 5-byte and 6-byte sequences 45 * (normally illegal), otherwise escape it on input 46 * and fail on output. 47 * 48 * WCSBIN_STRICT - Allow byte-to-wide conversions to fail. 49 */ 50 51 #include <sys/param.h> 52 53 #include <errno.h> 54 #include <limits.h> 55 #include <runetype.h> 56 #include <stdlib.h> 57 #include <string.h> 58 #include <wchar.h> 59 #include "mblocal.h" 60 61 extern int __mb_sb_limit; 62 63 static size_t _UTF8_mbrtowc(wchar_t * __restrict, const char * __restrict, 64 size_t, mbstate_t * __restrict); 65 static int _UTF8_mbsinit(const mbstate_t *); 66 static size_t _UTF8_mbsnrtowcs(wchar_t * __restrict, 67 const char ** __restrict, size_t, size_t, 68 mbstate_t * __restrict); 69 static size_t _UTF8_wcrtomb(char * __restrict, wchar_t, 70 mbstate_t * __restrict); 71 static size_t _UTF8_wcsnrtombs(char * __restrict, const wchar_t ** __restrict, 72 size_t, size_t, mbstate_t * __restrict); 73 static size_t _UTF8_mbintowcr(wchar_t * __restrict dst, 74 const char * __restrict src, 75 size_t dlen, size_t *slen, int flags); 76 static size_t _UTF8_wcrtombin(char * __restrict dst, 77 const wchar_t * __restrict src, 78 size_t dlen, size_t *slen, int flags); 79 80 typedef struct { 81 wchar_t ch; 82 int want; 83 wchar_t lbound; 84 } _UTF8State; 85 86 int 87 _UTF8_init(struct xlocale_ctype *l, _RuneLocale *rl) 88 { 89 90 l->__mbrtowc = _UTF8_mbrtowc; 91 l->__wcrtomb = _UTF8_wcrtomb; 92 l->__mbsinit = _UTF8_mbsinit; 93 l->__mbsnrtowcs = _UTF8_mbsnrtowcs; 94 l->__wcsnrtombs = _UTF8_wcsnrtombs; 95 l->__mbintowcr = _UTF8_mbintowcr; 96 l->__wcrtombin = _UTF8_wcrtombin; 97 l->runes = rl; 98 l->__mb_cur_max = 4; 99 /* 100 * UCS-4 encoding used as the internal representation, so 101 * slots 0x0080-0x00FF are occuped and must be excluded 102 * from the single byte ctype by setting the limit. 103 */ 104 l->__mb_sb_limit = 128; 105 106 return (0); 107 } 108 109 static int 110 _UTF8_mbsinit(const mbstate_t *ps) 111 { 112 113 return (ps == NULL || ((const _UTF8State *)ps)->want == 0); 114 } 115 116 static size_t 117 _UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n, 118 mbstate_t * __restrict ps) 119 { 120 _UTF8State *us; 121 int ch, i, mask, want; 122 wchar_t lbound, wch; 123 124 us = (_UTF8State *)ps; 125 126 if (us->want < 0 || us->want > 4) { 127 errno = EINVAL; 128 return ((size_t)-1); 129 } 130 131 if (s == NULL) { 132 s = ""; 133 n = 1; 134 pwc = NULL; 135 } 136 137 if (n == 0) 138 /* Incomplete multibyte sequence */ 139 return ((size_t)-2); 140 141 if (us->want == 0) { 142 /* 143 * Determine the number of octets that make up this character 144 * from the first octet, and a mask that extracts the 145 * interesting bits of the first octet. We already know 146 * the character is at least two bytes long. 147 * 148 * We also specify a lower bound for the character code to 149 * detect redundant, non-"shortest form" encodings. For 150 * example, the sequence C0 80 is _not_ a legal representation 151 * of the null character. This enforces a 1-to-1 mapping 152 * between character codes and their multibyte representations. 153 */ 154 ch = (unsigned char)*s; 155 if ((ch & 0x80) == 0) { 156 /* Fast path for plain ASCII characters. */ 157 if (pwc != NULL) 158 *pwc = ch; 159 return (ch != '\0' ? 1 : 0); 160 } 161 if ((ch & 0xe0) == 0xc0) { 162 mask = 0x1f; 163 want = 2; 164 lbound = 0x80; 165 } else if ((ch & 0xf0) == 0xe0) { 166 mask = 0x0f; 167 want = 3; 168 lbound = 0x800; 169 } else if ((ch & 0xf8) == 0xf0) { 170 mask = 0x07; 171 want = 4; 172 lbound = 0x10000; 173 } else { 174 /* 175 * Malformed input; input is not UTF-8. 176 */ 177 errno = EILSEQ; 178 return ((size_t)-1); 179 } 180 } else { 181 want = us->want; 182 lbound = us->lbound; 183 } 184 185 /* 186 * Decode the octet sequence representing the character in chunks 187 * of 6 bits, most significant first. 188 */ 189 if (us->want == 0) 190 wch = (unsigned char)*s++ & mask; 191 else 192 wch = us->ch; 193 194 for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) { 195 if ((*s & 0xc0) != 0x80) { 196 /* 197 * Malformed input; bad characters in the middle 198 * of a character. 199 */ 200 errno = EILSEQ; 201 return ((size_t)-1); 202 } 203 wch <<= 6; 204 wch |= *s++ & 0x3f; 205 } 206 if (i < want) { 207 /* Incomplete multibyte sequence. */ 208 us->want = want - i; 209 us->lbound = lbound; 210 us->ch = wch; 211 return ((size_t)-2); 212 } 213 if (wch < lbound || wch > 0x10ffff) { 214 /* 215 * Malformed input; redundant encoding or illegal 216 * code sequence. 217 */ 218 errno = EILSEQ; 219 return ((size_t)-1); 220 } 221 if (pwc != NULL) 222 *pwc = wch; 223 us->want = 0; 224 return (wch == L'\0' ? 0 : want); 225 } 226 227 static size_t 228 _UTF8_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src, 229 size_t nms, size_t len, mbstate_t * __restrict ps) 230 { 231 _UTF8State *us; 232 const char *s; 233 size_t nchr; 234 wchar_t wc; 235 size_t nb; 236 237 us = (_UTF8State *)ps; 238 239 s = *src; 240 nchr = 0; 241 242 if (dst == NULL) { 243 /* 244 * The fast path in the loop below is not safe if an ASCII 245 * character appears as anything but the first byte of a 246 * multibyte sequence. Check now to avoid doing it in the loop. 247 */ 248 if (nms > 0 && us->want > 0 && (signed char)*s > 0) { 249 errno = EILSEQ; 250 return ((size_t)-1); 251 } 252 for (;;) { 253 if (nms > 0 && (signed char)*s > 0) 254 /* 255 * Fast path for plain ASCII characters 256 * excluding NUL. 257 */ 258 nb = 1; 259 else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps)) == 260 (size_t)-1) 261 /* Invalid sequence - mbrtowc() sets errno. */ 262 return ((size_t)-1); 263 else if (nb == 0 || nb == (size_t)-2) 264 return (nchr); 265 s += nb; 266 nms -= nb; 267 nchr++; 268 } 269 /*NOTREACHED*/ 270 } 271 272 /* 273 * The fast path in the loop below is not safe if an ASCII 274 * character appears as anything but the first byte of a 275 * multibyte sequence. Check now to avoid doing it in the loop. 276 */ 277 if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) { 278 errno = EILSEQ; 279 return ((size_t)-1); 280 } 281 while (len-- > 0) { 282 if (nms > 0 && (signed char)*s > 0) { 283 /* 284 * Fast path for plain ASCII characters 285 * excluding NUL. 286 */ 287 *dst = (wchar_t)*s; 288 nb = 1; 289 } else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps)) == 290 (size_t)-1) { 291 *src = s; 292 return ((size_t)-1); 293 } else if (nb == (size_t)-2) { 294 *src = s + nms; 295 return (nchr); 296 } else if (nb == 0) { 297 *src = NULL; 298 return (nchr); 299 } 300 s += nb; 301 nms -= nb; 302 nchr++; 303 dst++; 304 } 305 *src = s; 306 return (nchr); 307 } 308 309 static size_t 310 _UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps) 311 { 312 _UTF8State *us; 313 unsigned char lead; 314 int i, len; 315 316 us = (_UTF8State *)ps; 317 318 if (us->want != 0) { 319 errno = EINVAL; 320 return ((size_t)-1); 321 } 322 323 if (s == NULL) 324 /* Reset to initial shift state (no-op) */ 325 return (1); 326 327 /* 328 * Determine the number of octets needed to represent this character. 329 * We always output the shortest sequence possible. Also specify the 330 * first few bits of the first octet, which contains the information 331 * about the sequence length. 332 */ 333 if ((wc & ~0x7f) == 0) { 334 /* Fast path for plain ASCII characters. */ 335 *s = (char)wc; 336 return (1); 337 } else if ((wc & ~0x7ff) == 0) { 338 lead = 0xc0; 339 len = 2; 340 } else if ((wc & ~0xffff) == 0) { 341 lead = 0xe0; 342 len = 3; 343 } else if (wc <= 0x10ffff) { 344 lead = 0xf0; 345 len = 4; 346 } else { 347 errno = EILSEQ; 348 return ((size_t)-1); 349 } 350 351 /* 352 * Output the octets representing the character in chunks 353 * of 6 bits, least significant last. The first octet is 354 * a special case because it contains the sequence length 355 * information. 356 */ 357 for (i = len - 1; i > 0; i--) { 358 s[i] = (wc & 0x3f) | 0x80; 359 wc >>= 6; 360 } 361 *s = (wc & 0xff) | lead; 362 363 return (len); 364 } 365 366 static size_t 367 _UTF8_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src, 368 size_t nwc, size_t len, mbstate_t * __restrict ps) 369 { 370 _UTF8State *us; 371 char buf[MB_LEN_MAX]; 372 const wchar_t *s; 373 size_t nbytes; 374 size_t nb; 375 376 us = (_UTF8State *)ps; 377 378 if (us->want != 0) { 379 errno = EINVAL; 380 return ((size_t)-1); 381 } 382 383 s = *src; 384 nbytes = 0; 385 386 if (dst == NULL) { 387 while (nwc-- > 0) { 388 if (0 <= *s && *s < 0x80) 389 /* Fast path for plain ASCII characters. */ 390 nb = 1; 391 else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == 392 (size_t)-1) 393 /* Invalid character - wcrtomb() sets errno. */ 394 return ((size_t)-1); 395 if (*s == L'\0') 396 return (nbytes + nb - 1); 397 s++; 398 nbytes += nb; 399 } 400 return (nbytes); 401 } 402 403 while (len > 0 && nwc-- > 0) { 404 if (0 <= *s && *s < 0x80) { 405 /* Fast path for plain ASCII characters. */ 406 nb = 1; 407 *dst = *s; 408 } else if (len > (size_t)MB_CUR_MAX) { 409 /* Enough space to translate in-place. */ 410 if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) { 411 *src = s; 412 return ((size_t)-1); 413 } 414 } else { 415 /* 416 * May not be enough space; use temp. buffer. 417 */ 418 if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) { 419 *src = s; 420 return ((size_t)-1); 421 } 422 if (nb > (int)len) 423 /* MB sequence for character won't fit. */ 424 break; 425 (void) memcpy(dst, buf, nb); 426 } 427 if (*s == L'\0') { 428 *src = NULL; 429 return (nbytes + nb - 1); 430 } 431 s++; 432 dst += nb; 433 len -= nb; 434 nbytes += nb; 435 } 436 *src = s; 437 return (nbytes); 438 } 439 440 /* 441 * Clean binary to wchar buffer conversions. This is basically like a normal 442 * buffer conversion but with a sane argument API and escaping. See none.c 443 * for a more complete description. 444 */ 445 static size_t 446 _UTF8_mbintowcr(wchar_t * __restrict dst, const char * __restrict src, 447 size_t dlen, size_t *slen, int flags) 448 { 449 size_t i; 450 size_t j; 451 size_t k; 452 size_t n = *slen; 453 int ch, mask, want; 454 wchar_t lbound, wch; 455 456 for (i = j = 0; i < n; ++i) { 457 if (j == dlen) 458 break; 459 ch = (unsigned char)src[i]; 460 461 if ((ch & 0x80) == 0) { 462 /* Fast path for plain ASCII characters. */ 463 if (dst) 464 dst[j] = ch; 465 ++j; 466 continue; 467 } 468 if ((ch & 0xe0) == 0xc0) { 469 mask = 0x1f; 470 want = 2; 471 lbound = 0x80; 472 } else if ((ch & 0xf0) == 0xe0) { 473 mask = 0x0f; 474 want = 3; 475 lbound = 0x800; 476 } else if ((ch & 0xf8) == 0xf0) { 477 mask = 0x07; 478 want = 4; 479 lbound = 0x10000; 480 } else if ((ch & 0xfc) == 0xf8) { 481 /* normally illegal, handled down below */ 482 mask = 0x03; 483 want = 5; 484 lbound = 0x200000; 485 } else if ((ch & 0xfe) == 0xfc) { 486 /* normally illegal, handled down below */ 487 mask = 0x01; 488 want = 6; 489 lbound = 0x4000000; 490 } else { 491 /* 492 * Malformed input; input is not UTF-8, escape 493 * with UTF-8B. 494 */ 495 if (flags & WCSBIN_STRICT) { 496 if (i == 0) { 497 errno = EILSEQ; 498 return ((size_t)-1); 499 } 500 break; 501 } 502 if (dst) 503 dst[j] = 0xDC00 | ch; 504 ++j; 505 continue; 506 } 507 508 /* 509 * Construct wchar_t from multibyte sequence. 510 */ 511 wch = ch & mask; 512 for (k = 1; k < want; ++k) { 513 /* 514 * Stop if not enough input (don't do this early 515 * so we can detect illegal characters as they occur 516 * in the stream). 517 * 518 * If termination is requested force-escape all chars. 519 */ 520 if (i + k >= n) { 521 if (flags & WCSBIN_EOF) { 522 want = n - i; 523 goto forceesc; 524 } 525 goto breakout; 526 } 527 528 ch = src[i+k]; 529 if ((ch & 0xc0) != 0x80) { 530 /* 531 * Malformed input, bad characters in the 532 * middle of a multibyte sequence. Escape 533 * with UTF-8B. 534 */ 535 if (flags & WCSBIN_STRICT) { 536 if (i == 0) { 537 errno = EILSEQ; 538 return ((size_t)-1); 539 } 540 goto breakout; 541 } 542 if (dst) 543 dst[j] = 0xDC00 | (unsigned char)src[i]; 544 ++j; 545 goto loopup; 546 } 547 wch <<= 6; 548 wch |= ch & 0x3f; 549 } 550 551 /* 552 * Check validity of the wchar. If invalid we could escape 553 * just the first character and loop up, but it ought to be 554 * more readable if we escape all the chars in the sequence 555 * (since they are all >= 0x80 and might represent a legacy 556 * 5-byte or 6-byte code). 557 */ 558 if (wch < lbound || 559 ((flags & WCSBIN_LONGCODES) == 0 && wch > 0x10ffff)) { 560 goto forceesc; 561 } 562 563 /* 564 * Check if wch is a surrogate code (which also encloses our 565 * UTF-8B escaping range). This is normally illegal in UTF8. 566 * If it is, we need to escape each characer in the sequence. 567 * Breakout if there isn't enough output buffer space. 568 * 569 * If (flags & WCSBIN_SURRO) the caller wishes to accept 570 * surrogate codes, i.e. the input might potentially already 571 * be escaped UTF8-B or unchecked UTF-16 that was converted 572 * into UTF-8. 573 */ 574 if ((flags & WCSBIN_SURRO) == 0 && 575 wch >= 0xD800 && wch <= 0xDFFF) { 576 forceesc: 577 if (j + want > dlen) 578 break; 579 if (flags & WCSBIN_STRICT) { 580 if (i == 0) { 581 errno = EILSEQ; 582 return ((size_t)-1); 583 } 584 break; 585 } 586 for (k = 0; k < want; ++k) { 587 if (dst) { 588 dst[j] = 0xDC00 | 589 (unsigned char)src[i+k]; 590 } 591 ++j; 592 } 593 i += k - 1; 594 } else { 595 i += k - 1; 596 if (dst) 597 dst[j] = wch; 598 ++j; 599 } 600 loopup: 601 ; 602 } 603 breakout: 604 *slen = i; 605 606 return j; 607 } 608 609 static size_t 610 _UTF8_wcrtombin(char * __restrict dst, const wchar_t * __restrict src, 611 size_t dlen, size_t *slen, int flags) 612 { 613 size_t i; 614 size_t j; 615 size_t k; 616 size_t n = *slen; 617 size_t len; 618 unsigned char lead; 619 wchar_t wc; 620 621 for (i = j = 0; i < n; ++i) { 622 if (j == dlen) 623 break; 624 wc = src[i]; 625 626 if ((wc & ~0x7f) == 0) { 627 /* Fast path for plain ASCII characters. */ 628 if (dst) 629 dst[j] = (unsigned char)wc; 630 ++j; 631 continue; 632 } 633 if ((wc & ~0x7ff) == 0) { 634 lead = 0xc0; 635 len = 2; 636 } else if (wc >= 0xDC80 && wc <= 0xDCFF && 637 (flags & WCSBIN_SURRO) == 0) { 638 if (flags & WCSBIN_STRICT) { 639 /* 640 * STRICT without SURRO is an error for 641 * surrogates. 642 */ 643 if (i == 0) { 644 errno = EILSEQ; 645 return ((size_t)-1); 646 } 647 break; 648 } 649 if (dst) 650 dst[j] = (unsigned char)wc; 651 ++j; 652 continue; 653 } else if ((wc & ~0xffff) == 0) { 654 if (wc >= 0xD800 && wc <= 0xDFFF && 655 (flags & (WCSBIN_SURRO | WCSBIN_STRICT)) == 656 WCSBIN_STRICT) { 657 /* 658 * Surrogates in general are an error 659 * if STRICT is specified and SURRO is not 660 * specified. 661 */ 662 if (i == 0) { 663 errno = EILSEQ; 664 return ((size_t)-1); 665 } 666 break; 667 } 668 lead = 0xe0; 669 len = 3; 670 } else if (wc <= 0x10ffff) { 671 lead = 0xf0; 672 len = 4; 673 } else if ((flags & WCSBIN_LONGCODES) && wc < 0x200000) { 674 /* normally illegal */ 675 lead = 0xf0; 676 len = 4; 677 } else if ((flags & WCSBIN_LONGCODES) && wc < 0x4000000) { 678 /* normally illegal */ 679 lead = 0xf8; 680 len = 5; 681 } else if ((flags & WCSBIN_LONGCODES) && 682 (uint32_t)wc < 0x80000000U) { 683 /* normally illegal */ 684 lead = 0xfc; 685 len = 6; 686 } else { 687 if (i == 0) { 688 errno = EILSEQ; 689 return ((size_t)-1); 690 } 691 /* stop here, process error on next loop */ 692 break; 693 } 694 695 /* 696 * Output the octets representing the character in chunks 697 * of 6 bits, least significant last. The first octet is 698 * a special case because it contains the sequence length 699 * information. 700 */ 701 if (j + len > dlen) 702 break; 703 k = j; 704 j += len; 705 if (dst) { 706 while (--len > 0) { 707 dst[k + len] = (wc & 0x3f) | 0x80; 708 wc >>= 6; 709 } 710 dst[k] = (wc & 0xff) | lead; 711 } 712 } 713 *slen = i; 714 715 return j; 716 } 717 718 size_t 719 utf8towcr(wchar_t * __restrict dst, const char * __restrict src, 720 size_t dlen, size_t *slen, int flags) 721 { 722 return _UTF8_mbintowcr(dst, src, dlen, slen, flags); 723 } 724 725 size_t 726 wcrtoutf8(char * __restrict dst, const wchar_t * __restrict src, 727 size_t dlen, size_t *slen, int flags) 728 { 729 return _UTF8_wcrtombin(dst, src, dlen, slen, flags); 730 } 731