/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright 2013 Garrett D'Amore <garrett@damore.org>
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 * Copyright (c) 2002-2004 Tim J. Robbins
 * All rights reserved.
 *
 * Copyright (c) 2011 The FreeBSD Foundation
 *
 * Portions of this software were developed by David Chisnall
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <errno.h>
#include <limits.h>
#include <runetype.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>
#include "mblocal.h"

extern int __mb_sb_limit;

/*
 * Forward declarations of the UTF-8 conversion routines defined later in
 * this file; _UTF8_init() stores their addresses in the locale's method
 * table.
 */
static size_t	_UTF8_mbrtowc(wchar_t * __restrict, const char * __restrict,
		    size_t, mbstate_t * __restrict);
static int	_UTF8_mbsinit(const mbstate_t *);
static size_t	_UTF8_mbsnrtowcs(wchar_t * __restrict,
		    const char ** __restrict, size_t, size_t,
		    mbstate_t * __restrict);
static size_t	_UTF8_wcrtomb(char * __restrict, wchar_t,
		    mbstate_t * __restrict);
static size_t	_UTF8_wcsnrtombs(char * __restrict, const wchar_t ** __restrict,
		    size_t, size_t, mbstate_t * __restrict);

/*
 * Decoder state kept inside the caller's mbstate_t between calls while a
 * multibyte sequence is only partially consumed.
 */
typedef struct {
	wchar_t	ch;		/* Bits of the character accumulated so far. */
	int	want;		/* Continuation octets still needed; 0 = idle. */
	wchar_t	lbound;		/* Smallest value legal for this sequence length. */
} _UTF8State;

64972baa37STim J. Robbins int
_UTF8_init(struct xlocale_ctype * l,_RuneLocale * rl)653c87aa1dSDavid Chisnall _UTF8_init(struct xlocale_ctype *l, _RuneLocale *rl)
66972baa37STim J. Robbins {
67972baa37STim J. Robbins 
683c87aa1dSDavid Chisnall 	l->__mbrtowc = _UTF8_mbrtowc;
693c87aa1dSDavid Chisnall 	l->__wcrtomb = _UTF8_wcrtomb;
703c87aa1dSDavid Chisnall 	l->__mbsinit = _UTF8_mbsinit;
713c87aa1dSDavid Chisnall 	l->__mbsnrtowcs = _UTF8_mbsnrtowcs;
723c87aa1dSDavid Chisnall 	l->__wcsnrtombs = _UTF8_wcsnrtombs;
733c87aa1dSDavid Chisnall 	l->runes = rl;
747b247341SBaptiste Daroussin 	l->__mb_cur_max = 4;
754932c895SAndrey A. Chernov 	/*
764932c895SAndrey A. Chernov 	 * UCS-4 encoding used as the internal representation, so
774932c895SAndrey A. Chernov 	 * slots 0x0080-0x00FF are occuped and must be excluded
784932c895SAndrey A. Chernov 	 * from the single byte ctype by setting the limit.
794932c895SAndrey A. Chernov 	 */
803c87aa1dSDavid Chisnall 	l->__mb_sb_limit = 128;
81972baa37STim J. Robbins 
82972baa37STim J. Robbins 	return (0);
83972baa37STim J. Robbins }
84972baa37STim J. Robbins 
85e94c6cb4SAlexey Zelkin static int
_UTF8_mbsinit(const mbstate_t * ps)86ca2dae42STim J. Robbins _UTF8_mbsinit(const mbstate_t *ps)
87ca2dae42STim J. Robbins {
88ca2dae42STim J. Robbins 
895e44d7ebSTim J. Robbins 	return (ps == NULL || ((const _UTF8State *)ps)->want == 0);
90ca2dae42STim J. Robbins }
91ca2dae42STim J. Robbins 
92e94c6cb4SAlexey Zelkin static size_t
_UTF8_mbrtowc(wchar_t * __restrict pwc,const char * __restrict s,size_t n,mbstate_t * __restrict ps)9302f4f60aSTim J. Robbins _UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n,
94ca2dae42STim J. Robbins     mbstate_t * __restrict ps)
95972baa37STim J. Robbins {
96ca2dae42STim J. Robbins 	_UTF8State *us;
975e44d7ebSTim J. Robbins 	int ch, i, mask, want;
9802f4f60aSTim J. Robbins 	wchar_t lbound, wch;
99972baa37STim J. Robbins 
100ca2dae42STim J. Robbins 	us = (_UTF8State *)ps;
101ca2dae42STim J. Robbins 
1025e44d7ebSTim J. Robbins 	if (us->want < 0 || us->want > 6) {
103fc813796STim J. Robbins 		errno = EINVAL;
104fc813796STim J. Robbins 		return ((size_t)-1);
105fc813796STim J. Robbins 	}
106fc813796STim J. Robbins 
107ca2dae42STim J. Robbins 	if (s == NULL) {
108ca2dae42STim J. Robbins 		s = "";
109ca2dae42STim J. Robbins 		n = 1;
110ca2dae42STim J. Robbins 		pwc = NULL;
111ca2dae42STim J. Robbins 	}
112ca2dae42STim J. Robbins 
11302f4f60aSTim J. Robbins 	if (n == 0)
11402f4f60aSTim J. Robbins 		/* Incomplete multibyte sequence */
11502f4f60aSTim J. Robbins 		return ((size_t)-2);
116972baa37STim J. Robbins 
1175e44d7ebSTim J. Robbins 	if (us->want == 0) {
118972baa37STim J. Robbins 		/*
1195e44d7ebSTim J. Robbins 		 * Determine the number of octets that make up this character
1205e44d7ebSTim J. Robbins 		 * from the first octet, and a mask that extracts the
1215e44d7ebSTim J. Robbins 		 * interesting bits of the first octet. We already know
1225e44d7ebSTim J. Robbins 		 * the character is at least two bytes long.
123972baa37STim J. Robbins 		 *
1245e44d7ebSTim J. Robbins 		 * We also specify a lower bound for the character code to
1255e44d7ebSTim J. Robbins 		 * detect redundant, non-"shortest form" encodings. For
1265e44d7ebSTim J. Robbins 		 * example, the sequence C0 80 is _not_ a legal representation
1275e44d7ebSTim J. Robbins 		 * of the null character. This enforces a 1-to-1 mapping
1285e44d7ebSTim J. Robbins 		 * between character codes and their multibyte representations.
129972baa37STim J. Robbins 		 */
13002f4f60aSTim J. Robbins 		ch = (unsigned char)*s;
131972baa37STim J. Robbins 		if ((ch & 0x80) == 0) {
1320716c0ffSPedro F. Giffuni 			/* Fast path for plain ASCII characters. */
1330716c0ffSPedro F. Giffuni 			if (pwc != NULL)
1340716c0ffSPedro F. Giffuni 				*pwc = ch;
1350716c0ffSPedro F. Giffuni 			return (ch != '\0' ? 1 : 0);
1360716c0ffSPedro F. Giffuni 		}
1370716c0ffSPedro F. Giffuni 		if ((ch & 0xe0) == 0xc0) {
138972baa37STim J. Robbins 			mask = 0x1f;
1395e44d7ebSTim J. Robbins 			want = 2;
140972baa37STim J. Robbins 			lbound = 0x80;
141972baa37STim J. Robbins 		} else if ((ch & 0xf0) == 0xe0) {
142972baa37STim J. Robbins 			mask = 0x0f;
1435e44d7ebSTim J. Robbins 			want = 3;
144972baa37STim J. Robbins 			lbound = 0x800;
145972baa37STim J. Robbins 		} else if ((ch & 0xf8) == 0xf0) {
146972baa37STim J. Robbins 			mask = 0x07;
1475e44d7ebSTim J. Robbins 			want = 4;
148972baa37STim J. Robbins 			lbound = 0x10000;
149972baa37STim J. Robbins 		} else {
150972baa37STim J. Robbins 			/*
151972baa37STim J. Robbins 			 * Malformed input; input is not UTF-8.
152972baa37STim J. Robbins 			 */
15302f4f60aSTim J. Robbins 			errno = EILSEQ;
15402f4f60aSTim J. Robbins 			return ((size_t)-1);
155972baa37STim J. Robbins 		}
1565e44d7ebSTim J. Robbins 	} else {
1575e44d7ebSTim J. Robbins 		want = us->want;
1585e44d7ebSTim J. Robbins 		lbound = us->lbound;
1595e44d7ebSTim J. Robbins 	}
160972baa37STim J. Robbins 
161972baa37STim J. Robbins 	/*
162972baa37STim J. Robbins 	 * Decode the octet sequence representing the character in chunks
163972baa37STim J. Robbins 	 * of 6 bits, most significant first.
164972baa37STim J. Robbins 	 */
1655e44d7ebSTim J. Robbins 	if (us->want == 0)
16602f4f60aSTim J. Robbins 		wch = (unsigned char)*s++ & mask;
1675e44d7ebSTim J. Robbins 	else
1685e44d7ebSTim J. Robbins 		wch = us->ch;
1697b247341SBaptiste Daroussin 
1705e44d7ebSTim J. Robbins 	for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) {
17102f4f60aSTim J. Robbins 		if ((*s & 0xc0) != 0x80) {
172972baa37STim J. Robbins 			/*
173972baa37STim J. Robbins 			 * Malformed input; bad characters in the middle
174972baa37STim J. Robbins 			 * of a character.
175972baa37STim J. Robbins 			 */
17602f4f60aSTim J. Robbins 			errno = EILSEQ;
17702f4f60aSTim J. Robbins 			return ((size_t)-1);
178972baa37STim J. Robbins 		}
179972baa37STim J. Robbins 		wch <<= 6;
18002f4f60aSTim J. Robbins 		wch |= *s++ & 0x3f;
181972baa37STim J. Robbins 	}
1825e44d7ebSTim J. Robbins 	if (i < want) {
1835e44d7ebSTim J. Robbins 		/* Incomplete multibyte sequence. */
1845e44d7ebSTim J. Robbins 		us->want = want - i;
1855e44d7ebSTim J. Robbins 		us->lbound = lbound;
1865e44d7ebSTim J. Robbins 		us->ch = wch;
1875e44d7ebSTim J. Robbins 		return ((size_t)-2);
1885e44d7ebSTim J. Robbins 	}
18902f4f60aSTim J. Robbins 	if (wch < lbound) {
190972baa37STim J. Robbins 		/*
191972baa37STim J. Robbins 		 * Malformed input; redundant encoding.
192972baa37STim J. Robbins 		 */
19302f4f60aSTim J. Robbins 		errno = EILSEQ;
19402f4f60aSTim J. Robbins 		return ((size_t)-1);
19502f4f60aSTim J. Robbins 	}
19657c69b14SEd Schouten 	if ((wch >= 0xd800 && wch <= 0xdfff) || wch > 0x10ffff) {
19781eb7d7eSBaptiste Daroussin 		/*
19881eb7d7eSBaptiste Daroussin 		 * Malformed input; invalid code points.
19981eb7d7eSBaptiste Daroussin 		 */
20081eb7d7eSBaptiste Daroussin 		errno = EILSEQ;
20181eb7d7eSBaptiste Daroussin 		return ((size_t)-1);
20281eb7d7eSBaptiste Daroussin 	}
20302f4f60aSTim J. Robbins 	if (pwc != NULL)
20402f4f60aSTim J. Robbins 		*pwc = wch;
2055e44d7ebSTim J. Robbins 	us->want = 0;
2065e44d7ebSTim J. Robbins 	return (wch == L'\0' ? 0 : want);
207972baa37STim J. Robbins }
208972baa37STim J. Robbins 
209e94c6cb4SAlexey Zelkin static size_t
_UTF8_mbsnrtowcs(wchar_t * __restrict dst,const char ** __restrict src,size_t nms,size_t len,mbstate_t * __restrict ps)210ea9a9a37STim J. Robbins _UTF8_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src,
211ea9a9a37STim J. Robbins     size_t nms, size_t len, mbstate_t * __restrict ps)
212ea9a9a37STim J. Robbins {
213ea9a9a37STim J. Robbins 	_UTF8State *us;
214ea9a9a37STim J. Robbins 	const char *s;
215ea9a9a37STim J. Robbins 	size_t nchr;
216ea9a9a37STim J. Robbins 	wchar_t wc;
217ea9a9a37STim J. Robbins 	size_t nb;
218ea9a9a37STim J. Robbins 
219ea9a9a37STim J. Robbins 	us = (_UTF8State *)ps;
220ea9a9a37STim J. Robbins 
221ea9a9a37STim J. Robbins 	s = *src;
222ea9a9a37STim J. Robbins 	nchr = 0;
223ea9a9a37STim J. Robbins 
224ea9a9a37STim J. Robbins 	if (dst == NULL) {
225ea9a9a37STim J. Robbins 		/*
226ea9a9a37STim J. Robbins 		 * The fast path in the loop below is not safe if an ASCII
227ea9a9a37STim J. Robbins 		 * character appears as anything but the first byte of a
228ea9a9a37STim J. Robbins 		 * multibyte sequence. Check now to avoid doing it in the loop.
229ea9a9a37STim J. Robbins 		 */
230ea9a9a37STim J. Robbins 		if (nms > 0 && us->want > 0 && (signed char)*s > 0) {
231ea9a9a37STim J. Robbins 			errno = EILSEQ;
232ea9a9a37STim J. Robbins 			return ((size_t)-1);
233ea9a9a37STim J. Robbins 		}
234ea9a9a37STim J. Robbins 		for (;;) {
235ea9a9a37STim J. Robbins 			if (nms > 0 && (signed char)*s > 0)
236ea9a9a37STim J. Robbins 				/*
237ea9a9a37STim J. Robbins 				 * Fast path for plain ASCII characters
238ea9a9a37STim J. Robbins 				 * excluding NUL.
239ea9a9a37STim J. Robbins 				 */
240ea9a9a37STim J. Robbins 				nb = 1;
241ea9a9a37STim J. Robbins 			else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps)) ==
242ea9a9a37STim J. Robbins 			    (size_t)-1)
243ea9a9a37STim J. Robbins 				/* Invalid sequence - mbrtowc() sets errno. */
244ea9a9a37STim J. Robbins 				return ((size_t)-1);
245ea9a9a37STim J. Robbins 			else if (nb == 0 || nb == (size_t)-2)
246ea9a9a37STim J. Robbins 				return (nchr);
247ea9a9a37STim J. Robbins 			s += nb;
248ea9a9a37STim J. Robbins 			nms -= nb;
249ea9a9a37STim J. Robbins 			nchr++;
250ea9a9a37STim J. Robbins 		}
251ea9a9a37STim J. Robbins 		/*NOTREACHED*/
252ea9a9a37STim J. Robbins 	}
253ea9a9a37STim J. Robbins 
254ea9a9a37STim J. Robbins 	/*
255ea9a9a37STim J. Robbins 	 * The fast path in the loop below is not safe if an ASCII
256ea9a9a37STim J. Robbins 	 * character appears as anything but the first byte of a
257ea9a9a37STim J. Robbins 	 * multibyte sequence. Check now to avoid doing it in the loop.
258ea9a9a37STim J. Robbins 	 */
259ea9a9a37STim J. Robbins 	if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) {
260ea9a9a37STim J. Robbins 		errno = EILSEQ;
261ea9a9a37STim J. Robbins 		return ((size_t)-1);
262ea9a9a37STim J. Robbins 	}
263ea9a9a37STim J. Robbins 	while (len-- > 0) {
264ea9a9a37STim J. Robbins 		if (nms > 0 && (signed char)*s > 0) {
265ea9a9a37STim J. Robbins 			/*
266ea9a9a37STim J. Robbins 			 * Fast path for plain ASCII characters
267ea9a9a37STim J. Robbins 			 * excluding NUL.
268ea9a9a37STim J. Robbins 			 */
269ea9a9a37STim J. Robbins 			*dst = (wchar_t)*s;
270ea9a9a37STim J. Robbins 			nb = 1;
271ea9a9a37STim J. Robbins 		} else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps)) ==
272ea9a9a37STim J. Robbins 		    (size_t)-1) {
273ea9a9a37STim J. Robbins 			*src = s;
274ea9a9a37STim J. Robbins 			return ((size_t)-1);
275ea9a9a37STim J. Robbins 		} else if (nb == (size_t)-2) {
276ea9a9a37STim J. Robbins 			*src = s + nms;
277ea9a9a37STim J. Robbins 			return (nchr);
278ea9a9a37STim J. Robbins 		} else if (nb == 0) {
279ea9a9a37STim J. Robbins 			*src = NULL;
280ea9a9a37STim J. Robbins 			return (nchr);
281ea9a9a37STim J. Robbins 		}
282ea9a9a37STim J. Robbins 		s += nb;
283ea9a9a37STim J. Robbins 		nms -= nb;
284ea9a9a37STim J. Robbins 		nchr++;
285ea9a9a37STim J. Robbins 		dst++;
286ea9a9a37STim J. Robbins 	}
287ea9a9a37STim J. Robbins 	*src = s;
288ea9a9a37STim J. Robbins 	return (nchr);
289ea9a9a37STim J. Robbins }
290ea9a9a37STim J. Robbins 
291e94c6cb4SAlexey Zelkin static size_t
_UTF8_wcrtomb(char * __restrict s,wchar_t wc,mbstate_t * __restrict ps)292fc813796STim J. Robbins _UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps)
293972baa37STim J. Robbins {
294fc813796STim J. Robbins 	_UTF8State *us;
295972baa37STim J. Robbins 	unsigned char lead;
296972baa37STim J. Robbins 	int i, len;
297972baa37STim J. Robbins 
298fc813796STim J. Robbins 	us = (_UTF8State *)ps;
299fc813796STim J. Robbins 
3005e44d7ebSTim J. Robbins 	if (us->want != 0) {
301fc813796STim J. Robbins 		errno = EINVAL;
302fc813796STim J. Robbins 		return ((size_t)-1);
303fc813796STim J. Robbins 	}
304fc813796STim J. Robbins 
30502f4f60aSTim J. Robbins 	if (s == NULL)
30602f4f60aSTim J. Robbins 		/* Reset to initial shift state (no-op) */
30702f4f60aSTim J. Robbins 		return (1);
30802f4f60aSTim J. Robbins 
309972baa37STim J. Robbins 	/*
310972baa37STim J. Robbins 	 * Determine the number of octets needed to represent this character.
311972baa37STim J. Robbins 	 * We always output the shortest sequence possible. Also specify the
312972baa37STim J. Robbins 	 * first few bits of the first octet, which contains the information
313972baa37STim J. Robbins 	 * about the sequence length.
314972baa37STim J. Robbins 	 */
31502f4f60aSTim J. Robbins 	if ((wc & ~0x7f) == 0) {
3160716c0ffSPedro F. Giffuni 		/* Fast path for plain ASCII characters. */
3170716c0ffSPedro F. Giffuni 		*s = (char)wc;
3180716c0ffSPedro F. Giffuni 		return (1);
31902f4f60aSTim J. Robbins 	} else if ((wc & ~0x7ff) == 0) {
320972baa37STim J. Robbins 		lead = 0xc0;
321972baa37STim J. Robbins 		len = 2;
32202f4f60aSTim J. Robbins 	} else if ((wc & ~0xffff) == 0) {
32357c69b14SEd Schouten 		if (wc >= 0xd800 && wc <= 0xdfff) {
32457c69b14SEd Schouten 			errno = EILSEQ;
32557c69b14SEd Schouten 			return ((size_t)-1);
32657c69b14SEd Schouten 		}
327972baa37STim J. Robbins 		lead = 0xe0;
328972baa37STim J. Robbins 		len = 3;
3298bb93485SBaptiste Daroussin 	} else if (wc >= 0 && wc <= 0x10ffff) {
330972baa37STim J. Robbins 		lead = 0xf0;
331972baa37STim J. Robbins 		len = 4;
332972baa37STim J. Robbins 	} else {
33302f4f60aSTim J. Robbins 		errno = EILSEQ;
33402f4f60aSTim J. Robbins 		return ((size_t)-1);
335972baa37STim J. Robbins 	}
336972baa37STim J. Robbins 
337972baa37STim J. Robbins 	/*
338972baa37STim J. Robbins 	 * Output the octets representing the character in chunks
339972baa37STim J. Robbins 	 * of 6 bits, least significant last. The first octet is
340972baa37STim J. Robbins 	 * a special case because it contains the sequence length
341972baa37STim J. Robbins 	 * information.
342972baa37STim J. Robbins 	 */
343972baa37STim J. Robbins 	for (i = len - 1; i > 0; i--) {
34402f4f60aSTim J. Robbins 		s[i] = (wc & 0x3f) | 0x80;
34502f4f60aSTim J. Robbins 		wc >>= 6;
346972baa37STim J. Robbins 	}
34702f4f60aSTim J. Robbins 	*s = (wc & 0xff) | lead;
348972baa37STim J. Robbins 
349972baa37STim J. Robbins 	return (len);
350972baa37STim J. Robbins }
351ea9a9a37STim J. Robbins 
352e94c6cb4SAlexey Zelkin static size_t
_UTF8_wcsnrtombs(char * __restrict dst,const wchar_t ** __restrict src,size_t nwc,size_t len,mbstate_t * __restrict ps)353ea9a9a37STim J. Robbins _UTF8_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src,
354ea9a9a37STim J. Robbins     size_t nwc, size_t len, mbstate_t * __restrict ps)
355ea9a9a37STim J. Robbins {
356ea9a9a37STim J. Robbins 	_UTF8State *us;
357ea9a9a37STim J. Robbins 	char buf[MB_LEN_MAX];
358ea9a9a37STim J. Robbins 	const wchar_t *s;
359ea9a9a37STim J. Robbins 	size_t nbytes;
360ea9a9a37STim J. Robbins 	size_t nb;
361ea9a9a37STim J. Robbins 
362ea9a9a37STim J. Robbins 	us = (_UTF8State *)ps;
363ea9a9a37STim J. Robbins 
364ea9a9a37STim J. Robbins 	if (us->want != 0) {
365ea9a9a37STim J. Robbins 		errno = EINVAL;
366ea9a9a37STim J. Robbins 		return ((size_t)-1);
367ea9a9a37STim J. Robbins 	}
368ea9a9a37STim J. Robbins 
369ea9a9a37STim J. Robbins 	s = *src;
370ea9a9a37STim J. Robbins 	nbytes = 0;
371ea9a9a37STim J. Robbins 
372ea9a9a37STim J. Robbins 	if (dst == NULL) {
373ea9a9a37STim J. Robbins 		while (nwc-- > 0) {
374ea9a9a37STim J. Robbins 			if (0 <= *s && *s < 0x80)
375ea9a9a37STim J. Robbins 				/* Fast path for plain ASCII characters. */
376ea9a9a37STim J. Robbins 				nb = 1;
377ea9a9a37STim J. Robbins 			else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) ==
378ea9a9a37STim J. Robbins 			    (size_t)-1)
379ea9a9a37STim J. Robbins 				/* Invalid character - wcrtomb() sets errno. */
380ea9a9a37STim J. Robbins 				return ((size_t)-1);
381ea9a9a37STim J. Robbins 			if (*s == L'\0')
382ea9a9a37STim J. Robbins 				return (nbytes + nb - 1);
383ea9a9a37STim J. Robbins 			s++;
384ea9a9a37STim J. Robbins 			nbytes += nb;
385ea9a9a37STim J. Robbins 		}
386ea9a9a37STim J. Robbins 		return (nbytes);
387ea9a9a37STim J. Robbins 	}
388ea9a9a37STim J. Robbins 
389ea9a9a37STim J. Robbins 	while (len > 0 && nwc-- > 0) {
390ea9a9a37STim J. Robbins 		if (0 <= *s && *s < 0x80) {
391ea9a9a37STim J. Robbins 			/* Fast path for plain ASCII characters. */
392ea9a9a37STim J. Robbins 			nb = 1;
393ea9a9a37STim J. Robbins 			*dst = *s;
394ea9a9a37STim J. Robbins 		} else if (len > (size_t)MB_CUR_MAX) {
395ea9a9a37STim J. Robbins 			/* Enough space to translate in-place. */
396610b5a1fSStefan Farfeleder 			if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) {
397ea9a9a37STim J. Robbins 				*src = s;
398ea9a9a37STim J. Robbins 				return ((size_t)-1);
399ea9a9a37STim J. Robbins 			}
400ea9a9a37STim J. Robbins 		} else {
401ea9a9a37STim J. Robbins 			/*
402ea9a9a37STim J. Robbins 			 * May not be enough space; use temp. buffer.
403ea9a9a37STim J. Robbins 			 */
404610b5a1fSStefan Farfeleder 			if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) {
405ea9a9a37STim J. Robbins 				*src = s;
406ea9a9a37STim J. Robbins 				return ((size_t)-1);
407ea9a9a37STim J. Robbins 			}
408ea9a9a37STim J. Robbins 			if (nb > (int)len)
409ea9a9a37STim J. Robbins 				/* MB sequence for character won't fit. */
410ea9a9a37STim J. Robbins 				break;
411ea9a9a37STim J. Robbins 			memcpy(dst, buf, nb);
412ea9a9a37STim J. Robbins 		}
413ea9a9a37STim J. Robbins 		if (*s == L'\0') {
414ea9a9a37STim J. Robbins 			*src = NULL;
415ea9a9a37STim J. Robbins 			return (nbytes + nb - 1);
416ea9a9a37STim J. Robbins 		}
417ea9a9a37STim J. Robbins 		s++;
418ea9a9a37STim J. Robbins 		dst += nb;
419ea9a9a37STim J. Robbins 		len -= nb;
420ea9a9a37STim J. Robbins 		nbytes += nb;
421ea9a9a37STim J. Robbins 	}
422ea9a9a37STim J. Robbins 	*src = s;
423ea9a9a37STim J. Robbins 	return (nbytes);
424ea9a9a37STim J. Robbins }
425