1 /*
2 * Copyright 2015 Matthew Dillon <dillon@backplane.com> (mbintowcr, wcrtombin)
3 * Copyright 2013 Garrett D'Amore <garrett@damore.org>
4 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
5 * Copyright (c) 2002-2004 Tim J. Robbins
6 * All rights reserved.
7 *
8 * Copyright (c) 2011 The FreeBSD Foundation
9 * All rights reserved.
10 * Portions of this software were developed by David Chisnall
11 * under sponsorship from the FreeBSD Foundation.
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in the
20 * documentation and/or other materials provided with the distribution.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 */
34
/*
 * WCSBIN_EOF -		Indicate EOF on input buffer.
 *
 * WCSBIN_SURRO -	Pass-through surrogate space (typically if the UTF-8
 *			has already been escaped), on bytes-to-wchars and
 *			wchars-to-bytes.  Escaping of other illegal codes will
 *			still occur on input but de-escaping will not occur
 *			on output (they will remain in the surrogate space).
 *
 * WCSBIN_LONGCODES -	Allow 4-byte sequences > 0x10FFFF, 5-byte and 6-byte
 *			sequences (normally illegal), otherwise escape them
 *			on input and fail on output.
 *
 * WCSBIN_STRICT -	Allow byte-to-wide conversions to fail.
 */
50
51 #include <sys/param.h>
52
53 #include <errno.h>
54 #include <limits.h>
55 #include <runetype.h>
56 #include <stdlib.h>
57 #include <string.h>
58 #include <wchar.h>
59 #include "mblocal.h"
60
/*
 * Forward declarations of the file-local conversion routines that
 * _UTF8_init() installs into the locale's method slots.
 */
static size_t	_UTF8_mbrtowc(wchar_t * __restrict, const char * __restrict,
		    size_t, mbstate_t * __restrict);
static int	_UTF8_mbsinit(const mbstate_t *);
static size_t	_UTF8_mbsnrtowcs(wchar_t * __restrict,
		    const char ** __restrict, size_t, size_t,
		    mbstate_t * __restrict);
static size_t	_UTF8_wcrtomb(char * __restrict, wchar_t,
		    mbstate_t * __restrict);
static size_t	_UTF8_wcsnrtombs(char * __restrict, const wchar_t ** __restrict,
		    size_t, size_t, mbstate_t * __restrict);
/* Buffer-oriented "binary" converters with escaping; see the WCSBIN_* flags. */
static size_t	_UTF8_mbintowcr(wchar_t * __restrict dst,
		    const char * __restrict src,
		    size_t dlen, size_t *slen, int flags);
static size_t	_UTF8_wcrtombin(char * __restrict dst,
		    const wchar_t * __restrict src,
		    size_t dlen, size_t *slen, int flags);
77
/*
 * Conversion state for a partially decoded multibyte character.  The
 * routines in this file access the caller's mbstate_t through this layout.
 */
typedef struct {
	wchar_t	ch;		/* code point bits accumulated so far */
	int	want;		/* octets still expected; 0 means initial state */
	wchar_t	lbound;		/* lowest code point legal for this length */
} _UTF8State;
83
84 int
_UTF8_init(struct xlocale_ctype * l,_RuneLocale * rl)85 _UTF8_init(struct xlocale_ctype *l, _RuneLocale *rl)
86 {
87
88 l->__mbrtowc = _UTF8_mbrtowc;
89 l->__wcrtomb = _UTF8_wcrtomb;
90 l->__mbsinit = _UTF8_mbsinit;
91 l->__mbsnrtowcs = _UTF8_mbsnrtowcs;
92 l->__wcsnrtombs = _UTF8_wcsnrtombs;
93 l->__mbintowcr = _UTF8_mbintowcr;
94 l->__wcrtombin = _UTF8_wcrtombin;
95 l->runes = rl;
96 l->__mb_cur_max = 4;
97 /*
98 * UCS-4 encoding used as the internal representation, so
99 * slots 0x0080-0x00FF are occuped and must be excluded
100 * from the single byte ctype by setting the limit.
101 */
102 l->__mb_sb_limit = 128;
103
104 return (0);
105 }
106
107 static int
_UTF8_mbsinit(const mbstate_t * ps)108 _UTF8_mbsinit(const mbstate_t *ps)
109 {
110
111 return (ps == NULL || ((const _UTF8State *)ps)->want == 0);
112 }
113
114 static size_t
_UTF8_mbrtowc(wchar_t * __restrict pwc,const char * __restrict s,size_t n,mbstate_t * __restrict ps)115 _UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n,
116 mbstate_t * __restrict ps)
117 {
118 _UTF8State *us;
119 int ch, i, mask, want;
120 wchar_t lbound, wch;
121
122 us = (_UTF8State *)ps;
123
124 if (us->want < 0 || us->want > 4) {
125 errno = EINVAL;
126 return ((size_t)-1);
127 }
128
129 if (s == NULL) {
130 s = "";
131 n = 1;
132 pwc = NULL;
133 }
134
135 if (n == 0)
136 /* Incomplete multibyte sequence */
137 return ((size_t)-2);
138
139 if (us->want == 0) {
140 /*
141 * Determine the number of octets that make up this character
142 * from the first octet, and a mask that extracts the
143 * interesting bits of the first octet. We already know
144 * the character is at least two bytes long.
145 *
146 * We also specify a lower bound for the character code to
147 * detect redundant, non-"shortest form" encodings. For
148 * example, the sequence C0 80 is _not_ a legal representation
149 * of the null character. This enforces a 1-to-1 mapping
150 * between character codes and their multibyte representations.
151 */
152 ch = (unsigned char)*s;
153 if ((ch & 0x80) == 0) {
154 /* Fast path for plain ASCII characters. */
155 if (pwc != NULL)
156 *pwc = ch;
157 return (ch != '\0' ? 1 : 0);
158 }
159 if ((ch & 0xe0) == 0xc0) {
160 mask = 0x1f;
161 want = 2;
162 lbound = 0x80;
163 } else if ((ch & 0xf0) == 0xe0) {
164 mask = 0x0f;
165 want = 3;
166 lbound = 0x800;
167 } else if ((ch & 0xf8) == 0xf0) {
168 mask = 0x07;
169 want = 4;
170 lbound = 0x10000;
171 } else {
172 /*
173 * Malformed input; input is not UTF-8.
174 */
175 errno = EILSEQ;
176 return ((size_t)-1);
177 }
178 } else {
179 want = us->want;
180 lbound = us->lbound;
181 }
182
183 /*
184 * Decode the octet sequence representing the character in chunks
185 * of 6 bits, most significant first.
186 */
187 if (us->want == 0)
188 wch = (unsigned char)*s++ & mask;
189 else
190 wch = us->ch;
191
192 for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) {
193 if ((*s & 0xc0) != 0x80) {
194 /*
195 * Malformed input; bad characters in the middle
196 * of a character.
197 */
198 errno = EILSEQ;
199 return ((size_t)-1);
200 }
201 wch <<= 6;
202 wch |= *s++ & 0x3f;
203 }
204 if (i < want) {
205 /* Incomplete multibyte sequence. */
206 us->want = want - i;
207 us->lbound = lbound;
208 us->ch = wch;
209 return ((size_t)-2);
210 }
211 if (wch < lbound || wch > 0x10ffff) {
212 /*
213 * Malformed input; redundant encoding or illegal
214 * code sequence.
215 */
216 errno = EILSEQ;
217 return ((size_t)-1);
218 }
219 if (pwc != NULL)
220 *pwc = wch;
221 us->want = 0;
222 return (wch == L'\0' ? 0 : want);
223 }
224
225 static size_t
_UTF8_mbsnrtowcs(wchar_t * __restrict dst,const char ** __restrict src,size_t nms,size_t len,mbstate_t * __restrict ps)226 _UTF8_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src,
227 size_t nms, size_t len, mbstate_t * __restrict ps)
228 {
229 _UTF8State *us;
230 const char *s;
231 size_t nchr;
232 wchar_t wc;
233 size_t nb;
234
235 us = (_UTF8State *)ps;
236
237 s = *src;
238 nchr = 0;
239
240 if (dst == NULL) {
241 /*
242 * The fast path in the loop below is not safe if an ASCII
243 * character appears as anything but the first byte of a
244 * multibyte sequence. Check now to avoid doing it in the loop.
245 */
246 if (nms > 0 && us->want > 0 && (signed char)*s > 0) {
247 errno = EILSEQ;
248 return ((size_t)-1);
249 }
250 for (;;) {
251 if (nms > 0 && (signed char)*s > 0)
252 /*
253 * Fast path for plain ASCII characters
254 * excluding NUL.
255 */
256 nb = 1;
257 else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps)) ==
258 (size_t)-1)
259 /* Invalid sequence - mbrtowc() sets errno. */
260 return ((size_t)-1);
261 else if (nb == 0 || nb == (size_t)-2)
262 return (nchr);
263 s += nb;
264 nms -= nb;
265 nchr++;
266 }
267 /*NOTREACHED*/
268 }
269
270 /*
271 * The fast path in the loop below is not safe if an ASCII
272 * character appears as anything but the first byte of a
273 * multibyte sequence. Check now to avoid doing it in the loop.
274 */
275 if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) {
276 errno = EILSEQ;
277 return ((size_t)-1);
278 }
279 while (len-- > 0) {
280 if (nms > 0 && (signed char)*s > 0) {
281 /*
282 * Fast path for plain ASCII characters
283 * excluding NUL.
284 */
285 *dst = (wchar_t)*s;
286 nb = 1;
287 } else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps)) ==
288 (size_t)-1) {
289 *src = s;
290 return ((size_t)-1);
291 } else if (nb == (size_t)-2) {
292 *src = s + nms;
293 return (nchr);
294 } else if (nb == 0) {
295 *src = NULL;
296 return (nchr);
297 }
298 s += nb;
299 nms -= nb;
300 nchr++;
301 dst++;
302 }
303 *src = s;
304 return (nchr);
305 }
306
307 static size_t
_UTF8_wcrtomb(char * __restrict s,wchar_t wc,mbstate_t * __restrict ps)308 _UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps)
309 {
310 _UTF8State *us;
311 unsigned char lead;
312 int i, len;
313
314 us = (_UTF8State *)ps;
315
316 if (us->want != 0) {
317 errno = EINVAL;
318 return ((size_t)-1);
319 }
320
321 if (s == NULL)
322 /* Reset to initial shift state (no-op) */
323 return (1);
324
325 /*
326 * Determine the number of octets needed to represent this character.
327 * We always output the shortest sequence possible. Also specify the
328 * first few bits of the first octet, which contains the information
329 * about the sequence length.
330 */
331 if ((wc & ~0x7f) == 0) {
332 /* Fast path for plain ASCII characters. */
333 *s = (char)wc;
334 return (1);
335 } else if ((wc & ~0x7ff) == 0) {
336 lead = 0xc0;
337 len = 2;
338 } else if ((wc & ~0xffff) == 0) {
339 lead = 0xe0;
340 len = 3;
341 } else if (wc <= 0x10ffff) {
342 lead = 0xf0;
343 len = 4;
344 } else {
345 errno = EILSEQ;
346 return ((size_t)-1);
347 }
348
349 /*
350 * Output the octets representing the character in chunks
351 * of 6 bits, least significant last. The first octet is
352 * a special case because it contains the sequence length
353 * information.
354 */
355 for (i = len - 1; i > 0; i--) {
356 s[i] = (wc & 0x3f) | 0x80;
357 wc >>= 6;
358 }
359 *s = (wc & 0xff) | lead;
360
361 return (len);
362 }
363
364 static size_t
_UTF8_wcsnrtombs(char * __restrict dst,const wchar_t ** __restrict src,size_t nwc,size_t len,mbstate_t * __restrict ps)365 _UTF8_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src,
366 size_t nwc, size_t len, mbstate_t * __restrict ps)
367 {
368 _UTF8State *us;
369 char buf[MB_LEN_MAX];
370 const wchar_t *s;
371 size_t nbytes;
372 size_t nb;
373
374 us = (_UTF8State *)ps;
375
376 if (us->want != 0) {
377 errno = EINVAL;
378 return ((size_t)-1);
379 }
380
381 s = *src;
382 nbytes = 0;
383
384 if (dst == NULL) {
385 while (nwc-- > 0) {
386 if (0 <= *s && *s < 0x80)
387 /* Fast path for plain ASCII characters. */
388 nb = 1;
389 else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) ==
390 (size_t)-1)
391 /* Invalid character - wcrtomb() sets errno. */
392 return ((size_t)-1);
393 if (*s == L'\0')
394 return (nbytes + nb - 1);
395 s++;
396 nbytes += nb;
397 }
398 return (nbytes);
399 }
400
401 while (len > 0 && nwc-- > 0) {
402 if (0 <= *s && *s < 0x80) {
403 /* Fast path for plain ASCII characters. */
404 nb = 1;
405 *dst = *s;
406 } else if (len > (size_t)MB_CUR_MAX) {
407 /* Enough space to translate in-place. */
408 if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) {
409 *src = s;
410 return ((size_t)-1);
411 }
412 } else {
413 /*
414 * May not be enough space; use temp. buffer.
415 */
416 if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) {
417 *src = s;
418 return ((size_t)-1);
419 }
420 if (nb > (int)len)
421 /* MB sequence for character won't fit. */
422 break;
423 (void) memcpy(dst, buf, nb);
424 }
425 if (*s == L'\0') {
426 *src = NULL;
427 return (nbytes + nb - 1);
428 }
429 s++;
430 dst += nb;
431 len -= nb;
432 nbytes += nb;
433 }
434 *src = s;
435 return (nbytes);
436 }
437
438 /*
439 * Clean binary to wchar buffer conversions. This is basically like a normal
440 * buffer conversion but with a sane argument API and escaping. See none.c
441 * for a more complete description.
442 */
443 static size_t
_UTF8_mbintowcr(wchar_t * __restrict dst,const char * __restrict src,size_t dlen,size_t * slen,int flags)444 _UTF8_mbintowcr(wchar_t * __restrict dst, const char * __restrict src,
445 size_t dlen, size_t *slen, int flags)
446 {
447 size_t i;
448 size_t j;
449 size_t k;
450 size_t n = *slen;
451 int ch, mask, want;
452 wchar_t lbound, wch;
453
454 for (i = j = 0; i < n; ++i) {
455 if (j == dlen)
456 break;
457 ch = (unsigned char)src[i];
458
459 if ((ch & 0x80) == 0) {
460 /* Fast path for plain ASCII characters. */
461 if (dst)
462 dst[j] = ch;
463 ++j;
464 continue;
465 }
466 if ((ch & 0xe0) == 0xc0) {
467 mask = 0x1f;
468 want = 2;
469 lbound = 0x80;
470 } else if ((ch & 0xf0) == 0xe0) {
471 mask = 0x0f;
472 want = 3;
473 lbound = 0x800;
474 } else if ((ch & 0xf8) == 0xf0) {
475 mask = 0x07;
476 want = 4;
477 lbound = 0x10000;
478 } else if ((ch & 0xfc) == 0xf8) {
479 /* normally illegal, handled down below */
480 mask = 0x03;
481 want = 5;
482 lbound = 0x200000;
483 } else if ((ch & 0xfe) == 0xfc) {
484 /* normally illegal, handled down below */
485 mask = 0x01;
486 want = 6;
487 lbound = 0x4000000;
488 } else {
489 /*
490 * Malformed input; input is not UTF-8, escape
491 * with UTF-8B.
492 */
493 if (flags & WCSBIN_STRICT) {
494 if (i == 0) {
495 errno = EILSEQ;
496 return ((size_t)-1);
497 }
498 break;
499 }
500 if (dst)
501 dst[j] = 0xDC00 | ch;
502 ++j;
503 continue;
504 }
505
506 /*
507 * Construct wchar_t from multibyte sequence.
508 */
509 wch = ch & mask;
510 for (k = 1; k < want; ++k) {
511 /*
512 * Stop if not enough input (don't do this early
513 * so we can detect illegal characters as they occur
514 * in the stream).
515 *
516 * If termination is requested force-escape all chars.
517 */
518 if (i + k >= n) {
519 if (flags & WCSBIN_EOF) {
520 want = n - i;
521 goto forceesc;
522 }
523 goto breakout;
524 }
525
526 ch = src[i+k];
527 if ((ch & 0xc0) != 0x80) {
528 /*
529 * Malformed input, bad characters in the
530 * middle of a multibyte sequence. Escape
531 * with UTF-8B.
532 */
533 if (flags & WCSBIN_STRICT) {
534 if (i == 0) {
535 errno = EILSEQ;
536 return ((size_t)-1);
537 }
538 goto breakout;
539 }
540 if (dst)
541 dst[j] = 0xDC00 | (unsigned char)src[i];
542 ++j;
543 goto loopup;
544 }
545 wch <<= 6;
546 wch |= ch & 0x3f;
547 }
548
549 /*
550 * Check validity of the wchar. If invalid we could escape
551 * just the first character and loop up, but it ought to be
552 * more readable if we escape all the chars in the sequence
553 * (since they are all >= 0x80 and might represent a legacy
554 * 5-byte or 6-byte code).
555 */
556 if (wch < lbound ||
557 ((flags & WCSBIN_LONGCODES) == 0 && wch > 0x10ffff)) {
558 goto forceesc;
559 }
560
561 /*
562 * Check if wch is a surrogate code (which also encloses our
563 * UTF-8B escaping range). This is normally illegal in UTF8.
564 * If it is, we need to escape each characer in the sequence.
565 * Breakout if there isn't enough output buffer space.
566 *
567 * If (flags & WCSBIN_SURRO) the caller wishes to accept
568 * surrogate codes, i.e. the input might potentially already
569 * be escaped UTF8-B or unchecked UTF-16 that was converted
570 * into UTF-8.
571 */
572 if ((flags & WCSBIN_SURRO) == 0 &&
573 wch >= 0xD800 && wch <= 0xDFFF) {
574 forceesc:
575 if (j + want > dlen)
576 break;
577 if (flags & WCSBIN_STRICT) {
578 if (i == 0) {
579 errno = EILSEQ;
580 return ((size_t)-1);
581 }
582 break;
583 }
584 for (k = 0; k < want; ++k) {
585 if (dst) {
586 dst[j] = 0xDC00 |
587 (unsigned char)src[i+k];
588 }
589 ++j;
590 }
591 i += k - 1;
592 } else {
593 i += k - 1;
594 if (dst)
595 dst[j] = wch;
596 ++j;
597 }
598 loopup:
599 ;
600 }
601 breakout:
602 *slen = i;
603
604 return j;
605 }
606
607 static size_t
_UTF8_wcrtombin(char * __restrict dst,const wchar_t * __restrict src,size_t dlen,size_t * slen,int flags)608 _UTF8_wcrtombin(char * __restrict dst, const wchar_t * __restrict src,
609 size_t dlen, size_t *slen, int flags)
610 {
611 size_t i;
612 size_t j;
613 size_t k;
614 size_t n = *slen;
615 size_t len;
616 unsigned char lead;
617 wchar_t wc;
618
619 for (i = j = 0; i < n; ++i) {
620 if (j == dlen)
621 break;
622 wc = src[i];
623
624 if ((wc & ~0x7f) == 0) {
625 /* Fast path for plain ASCII characters. */
626 if (dst)
627 dst[j] = (unsigned char)wc;
628 ++j;
629 continue;
630 }
631 if ((wc & ~0x7ff) == 0) {
632 lead = 0xc0;
633 len = 2;
634 } else if (wc >= 0xDC80 && wc <= 0xDCFF &&
635 (flags & WCSBIN_SURRO) == 0) {
636 if (flags & WCSBIN_STRICT) {
637 /*
638 * STRICT without SURRO is an error for
639 * surrogates.
640 */
641 if (i == 0) {
642 errno = EILSEQ;
643 return ((size_t)-1);
644 }
645 break;
646 }
647 if (dst)
648 dst[j] = (unsigned char)wc;
649 ++j;
650 continue;
651 } else if ((wc & ~0xffff) == 0) {
652 if (wc >= 0xD800 && wc <= 0xDFFF &&
653 (flags & (WCSBIN_SURRO | WCSBIN_STRICT)) ==
654 WCSBIN_STRICT) {
655 /*
656 * Surrogates in general are an error
657 * if STRICT is specified and SURRO is not
658 * specified.
659 */
660 if (i == 0) {
661 errno = EILSEQ;
662 return ((size_t)-1);
663 }
664 break;
665 }
666 lead = 0xe0;
667 len = 3;
668 } else if (wc <= 0x10ffff) {
669 lead = 0xf0;
670 len = 4;
671 } else if ((flags & WCSBIN_LONGCODES) && wc < 0x200000) {
672 /* normally illegal */
673 lead = 0xf0;
674 len = 4;
675 } else if ((flags & WCSBIN_LONGCODES) && wc < 0x4000000) {
676 /* normally illegal */
677 lead = 0xf8;
678 len = 5;
679 } else if ((flags & WCSBIN_LONGCODES) &&
680 (uint32_t)wc < 0x80000000U) {
681 /* normally illegal */
682 lead = 0xfc;
683 len = 6;
684 } else {
685 if (i == 0) {
686 errno = EILSEQ;
687 return ((size_t)-1);
688 }
689 /* stop here, process error on next loop */
690 break;
691 }
692
693 /*
694 * Output the octets representing the character in chunks
695 * of 6 bits, least significant last. The first octet is
696 * a special case because it contains the sequence length
697 * information.
698 */
699 if (j + len > dlen)
700 break;
701 k = j;
702 j += len;
703 if (dst) {
704 while (--len > 0) {
705 dst[k + len] = (wc & 0x3f) | 0x80;
706 wc >>= 6;
707 }
708 dst[k] = (wc & 0xff) | lead;
709 }
710 }
711 *slen = i;
712
713 return j;
714 }
715
/*
 * Externally visible entry point for the escaping UTF-8 bytes-to-wchars
 * buffer conversion; all arguments are handed unchanged to
 * _UTF8_mbintowcr() and its result is returned as is.
 */
size_t
utf8towcr(wchar_t * __restrict dst, const char * __restrict src,
    size_t dlen, size_t *slen, int flags)
{
	size_t produced;

	produced = _UTF8_mbintowcr(dst, src, dlen, slen, flags);
	return (produced);
}
722
/*
 * Externally visible entry point for the de-escaping wchars-to-UTF-8
 * buffer conversion; all arguments are handed unchanged to
 * _UTF8_wcrtombin() and its result is returned as is.
 */
size_t
wcrtoutf8(char * __restrict dst, const wchar_t * __restrict src,
    size_t dlen, size_t *slen, int flags)
{
	size_t produced;

	produced = _UTF8_wcrtombin(dst, src, dlen, slen, flags);
	return (produced);
}
729