xref: /openbsd/lib/libc/citrus/citrus_utf8.c (revision d415bd75)
1 /*	$OpenBSD: citrus_utf8.c,v 1.18 2016/09/07 17:15:06 schwarze Exp $ */
2 
3 /*-
4  * Copyright (c) 2002-2004 Tim J. Robbins
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/types.h>
30 
31 #include <errno.h>
32 #include <string.h>
33 #include <wchar.h>
34 
35 #include "citrus_ctype.h"
36 
/*
 * Conversion state of the UTF-8 codec.  The functions in this file reach
 * it by casting their mbstate_t pointer to this type.
 */
struct _utf8_state {
	wchar_t	ch;		/* bits decoded so far of a partial character */
	int	want;		/* bytes still missing; 0 if nothing is pending */
	wchar_t	lbound;		/* smallest value the finished character may have */
};
42 
43 size_t
44 _citrus_utf8_ctype_mbrtowc(wchar_t * __restrict pwc,
45     const char * __restrict s, size_t n, mbstate_t * __restrict ps)
46 {
47 	struct _utf8_state *us;
48 	int ch, i, mask, want;
49 	wchar_t lbound, wch;
50 
51 	us = (struct _utf8_state *)ps;
52 
53 	if (us->want < 0 || us->want > _CITRUS_UTF8_MB_CUR_MAX) {
54 		errno = EINVAL;
55 		return -1;
56 	}
57 
58 	if (s == NULL) {
59 		s = "";
60 		n = 1;
61 		pwc = NULL;
62 	}
63 
64 	if (n == 0)
65 		return -2;
66 
67 	if (us->want == 0 && ((ch = (unsigned char)*s) & ~0x7f) == 0) {
68 		/* Fast path for plain ASCII characters. */
69 		if (pwc != NULL)
70 			*pwc = ch;
71 		return ch != '\0' ? 1 : 0;
72 	}
73 
74 	if (us->want == 0) {
75 		/*
76 		 * Determine the number of bytes that make up this character
77 		 * from the first byte, and a mask that extracts the
78 		 * interesting bits of the first byte.  We already know
79 		 * the character is at least two bytes long.
80 		 *
81 		 * We also specify a lower bound for the character code to
82 		 * detect redundant, non-"shortest form" encodings. For
83 		 * example, the sequence C0 80 is _not_ a legal representation
84 		 * of the null character. This enforces a 1-to-1 mapping
85 		 * between character codes and their multibyte representations.
86 		 */
87 		ch = (unsigned char)*s;
88 		if ((ch & 0x80) == 0) {
89 			mask = 0x7f;
90 			want = 1;
91 			lbound = 0;
92 		} else if ((ch & 0xe0) == 0xc0) {
93 			mask = 0x1f;
94 			want = 2;
95 			lbound = 0x80;
96 		} else if ((ch & 0xf0) == 0xe0) {
97 			mask = 0x0f;
98 			want = 3;
99 			lbound = 0x800;
100 		} else if ((ch & 0xf8) == 0xf0) {
101 			mask = 0x07;
102 			want = 4;
103 			lbound = 0x10000;
104 		} else {
105 			/*
106 			 * Malformed input; input is not UTF-8.
107 			 * See RFC 3629.
108 			 */
109 			errno = EILSEQ;
110 			return -1;
111 		}
112 	} else {
113 		want = us->want;
114 		lbound = us->lbound;
115 	}
116 
117 	/*
118 	 * Decode the byte sequence representing the character in chunks
119 	 * of 6 bits, most significant first.
120 	 */
121 	if (us->want == 0)
122 		wch = (unsigned char)*s++ & mask;
123 	else
124 		wch = us->ch;
125 	for (i = (us->want == 0) ? 1 : 0; i < want && (size_t)i < n; i++) {
126 		if ((*s & 0xc0) != 0x80) {
127 			/*
128 			 * Malformed input; bad byte in the middle
129 			 * of a character.
130 			 */
131 			errno = EILSEQ;
132 			return -1;
133 		}
134 		wch <<= 6;
135 		wch |= *s++ & 0x3f;
136 	}
137 	if (i < want) {
138 		/* Incomplete multibyte sequence. */
139 		us->want = want - i;
140 		us->lbound = lbound;
141 		us->ch = wch;
142 		return -2;
143 	}
144 	if (wch < lbound) {
145 		/*
146 		 * Malformed input; redundant encoding.
147 		 */
148 		errno = EILSEQ;
149 		return -1;
150 	}
151 	if (wch >= 0xd800 && wch <= 0xdfff) {
152 		/*
153 		 * Malformed input; invalid code points.
154 		 */
155 		errno = EILSEQ;
156 		return -1;
157 	}
158 	if (wch > 0x10ffff) {
159 		/*
160 		 * Malformed input; invalid code points.
161 		 */
162 		errno = EILSEQ;
163 		return -1;
164 	}
165 	if (pwc != NULL)
166 		*pwc = wch;
167 	us->want = 0;
168 	return wch == L'\0' ? 0 : want;
169 }
170 
171 int
172 _citrus_utf8_ctype_mbsinit(const mbstate_t * __restrict ps)
173 {
174 	return ((const struct _utf8_state *)ps)->want == 0;
175 }
176 
177 size_t
178 _citrus_utf8_ctype_mbsnrtowcs(wchar_t * __restrict dst,
179     const char ** __restrict src, size_t nmc, size_t len,
180     mbstate_t * __restrict ps)
181 {
182 	struct _utf8_state *us;
183 	size_t i, o, r;
184 
185 	us = (struct _utf8_state *)ps;
186 
187 	if (dst == NULL) {
188 		/*
189 		 * The fast path in the loop below is not safe if an ASCII
190 		 * character appears as anything but the first byte of a
191 		 * multibyte sequence. Check now to avoid doing it in the loop.
192 		 */
193 		if (nmc > 0 && us->want > 0 && (unsigned char)(*src)[0] < 0x80) {
194 			errno = EILSEQ;
195 			return -1;
196 		}
197 		for (i = o = 0; i < nmc; i += r, o++) {
198 			if ((unsigned char)(*src)[i] < 0x80) {
199 				/* Fast path for plain ASCII characters. */
200 				if ((*src)[i] == '\0')
201 					return o;
202 				r = 1;
203 			} else {
204 				r = _citrus_utf8_ctype_mbrtowc(NULL, *src + i,
205 				    nmc - i, ps);
206 				if (r == (size_t)-1)
207 					return r;
208 				if (r == (size_t)-2)
209 					return o;
210 				if (r == 0)
211 					return o;
212 			}
213 		}
214 		return o;
215 	}
216 
217 	/*
218 	 * The fast path in the loop below is not safe if an ASCII
219 	 * character appears as anything but the first byte of a
220 	 * multibyte sequence. Check now to avoid doing it in the loop.
221 	 */
222 	if (len > 0 && nmc > 0 && us->want > 0 &&
223 	    (unsigned char)(*src)[0] < 0x80) {
224 		errno = EILSEQ;
225 		return -1;
226 	}
227 	for (i = o = 0; i < nmc && o < len; i += r, o++) {
228 		if ((unsigned char)(*src)[i] < 0x80) {
229 			/* Fast path for plain ASCII characters. */
230 			dst[o] = (wchar_t)(unsigned char)(*src)[i];
231 			if ((*src)[i] == '\0') {
232 				*src = NULL;
233 				return o;
234 			}
235 			r = 1;
236 		} else {
237 			r = _citrus_utf8_ctype_mbrtowc(dst + o, *src + i,
238 			    nmc - i, ps);
239 			if (r == (size_t)-1) {
240 				*src += i;
241 				return r;
242 			}
243 			if (r == (size_t)-2) {
244 				*src += nmc;
245 				return o;
246 			}
247 			if (r == 0) {
248 				*src = NULL;
249 				return o;
250 			}
251 		}
252 	}
253 	*src += i;
254 	return o;
255 }
256 
257 size_t
258 _citrus_utf8_ctype_wcrtomb(char * __restrict s, wchar_t wc,
259     mbstate_t * __restrict ps)
260 {
261 	struct _utf8_state *us;
262 	unsigned char lead;
263 	int i, len;
264 
265 	us = (struct _utf8_state *)ps;
266 
267 	if (us->want != 0) {
268 		errno = EINVAL;
269 		return -1;
270 	}
271 
272 	if (s == NULL)
273 		return 1;
274 
275 	if (wc < 0 || (wc > 0xd7ff && wc < 0xe000) || wc > 0x10ffff) {
276 		errno = EILSEQ;
277 		return -1;
278 	}
279 
280 	/*
281 	 * Determine the number of bytes needed to represent this character.
282 	 * We always output the shortest sequence possible. Also specify the
283 	 * first few bits of the first byte, which contains the information
284 	 * about the sequence length.
285 	 */
286 	if (wc <= 0x7f) {
287 		/* Fast path for plain ASCII characters. */
288 		*s = (char)wc;
289 		return 1;
290 	} else if (wc <= 0x7ff) {
291 		lead = 0xc0;
292 		len = 2;
293 	} else if (wc <= 0xffff) {
294 		lead = 0xe0;
295 		len = 3;
296 	} else {
297 		lead = 0xf0;
298 		len = 4;
299 	}
300 
301 	/*
302 	 * Output the bytes representing the character in chunks
303 	 * of 6 bits, least significant last. The first byte is
304 	 * a special case because it contains the sequence length
305 	 * information.
306 	 */
307 	for (i = len - 1; i > 0; i--) {
308 		s[i] = (wc & 0x3f) | 0x80;
309 		wc >>= 6;
310 	}
311 	*s = (wc & 0xff) | lead;
312 
313 	return len;
314 }
315 
316 size_t
317 _citrus_utf8_ctype_wcsnrtombs(char * __restrict dst,
318     const wchar_t ** __restrict src, size_t nwc, size_t len,
319     mbstate_t * __restrict ps)
320 {
321 	struct _utf8_state *us;
322 	char buf[_CITRUS_UTF8_MB_CUR_MAX];
323 	size_t i, o, r;
324 
325 	us = (struct _utf8_state *)ps;
326 
327 	if (us->want != 0) {
328 		errno = EINVAL;
329 		return -1;
330 	}
331 
332 	if (dst == NULL) {
333 		for (i = o = 0; i < nwc; i++, o += r) {
334 			wchar_t wc = (*src)[i];
335 			if (wc >= 0 && wc < 0x80) {
336 				/* Fast path for plain ASCII characters. */
337 				if (wc == 0)
338 					return o;
339 				r = 1;
340 			} else {
341 				r = _citrus_utf8_ctype_wcrtomb(buf, wc, ps);
342 				if (r == (size_t)-1)
343 					return r;
344 			}
345 		}
346 		return o;
347 	}
348 
349 	for (i = o = 0; i < nwc && o < len; i++, o += r) {
350 		wchar_t wc = (*src)[i];
351 		if (wc >= 0 && wc < 0x80) {
352 			/* Fast path for plain ASCII characters. */
353 			dst[o] = (wchar_t)wc;
354 			if (wc == 0) {
355 				*src = NULL;
356 				return o;
357 			}
358 			r = 1;
359 		} else if (len - o >= _CITRUS_UTF8_MB_CUR_MAX) {
360 			/* Enough space to translate in-place. */
361 			r = _citrus_utf8_ctype_wcrtomb(dst + o, wc, ps);
362 			if (r == (size_t)-1) {
363 				*src += i;
364 				return r;
365 			}
366 		} else {
367 			/* May not be enough space; use temp buffer. */
368 			r = _citrus_utf8_ctype_wcrtomb(buf, wc, ps);
369 			if (r == (size_t)-1) {
370 				*src += i;
371 				return r;
372 			}
373 			if (r > len - o)
374 				break;
375 			memcpy(dst + o, buf, r);
376 		}
377 	}
378 	*src += i;
379 	return o;
380 }
381