xref: /openbsd/lib/libc/citrus/citrus_utf8.c (revision 3d8817e4)
1 /*	$OpenBSD: citrus_utf8.c,v 1.4 2011/04/21 00:16:06 yasuoka Exp $ */
2 
3 /*-
4  * Copyright (c) 2002-2004 Tim J. Robbins
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 #include <sys/errno.h>
31 #include <sys/param.h>
32 #include <sys/types.h>
33 #include <sys/limits.h>
34 
35 #include <errno.h>
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <stddef.h>
39 #include <string.h>
40 #include <wchar.h>
41 
42 #include "citrus_ctype.h"
43 #include "citrus_utf8.h"
44 
/*
 * NOTE(review): project macro whose definition is not visible in this file
 * (presumably provided by "citrus_ctype.h").  It looks like it declares the
 * _citrus_utf8_ctype_* operations implemented below -- confirm against the
 * header before relying on that.
 */
45 _CITRUS_CTYPE_DEF_OPS(utf8);
46 
/*
 * Conversion state carried between calls through the opaque `pspriv'
 * pointer.  A `want' of zero is the initial state (see the mbsinit
 * operation below); the other two fields are only meaningful while a
 * multibyte character is partially decoded.
 */
47 struct _utf8_state {
48 	wchar_t	ch;		/* bits of the character decoded so far */
49 	int	want;		/* continuation octets still expected; 0 = initial */
50 	wchar_t	lbound;		/* smallest code point legal for this sequence length */
51 };
52 
53 size_t
54 /*ARGSUSED*/
55 _citrus_utf8_ctype_mbrtowc(wchar_t * __restrict pwc,
56 			   const char * __restrict s, size_t n,
57 			   void * __restrict pspriv)
58 {
59 	struct _utf8_state *us;
60 	int ch, i, mask, want;
61 	wchar_t lbound, wch;
62 
63 	us = (struct _utf8_state *)pspriv;
64 
65 	if (us->want < 0 || us->want > _CITRUS_UTF8_MB_CUR_MAX) {
66 		errno = EINVAL;
67 		return ((size_t)-1);
68 	}
69 
70 	if (s == NULL) {
71 		s = "";
72 		n = 1;
73 		pwc = NULL;
74 	}
75 
76 	if (n == 0) {
77 		/* Incomplete multibyte sequence */
78 		return ((size_t)-2);
79 	}
80 
81 	if (us->want == 0 && ((ch = (unsigned char)*s) & ~0x7f) == 0) {
82 		/* Fast path for plain ASCII characters. */
83 		if (pwc != NULL)
84 			*pwc = ch;
85 		return (ch != '\0' ? 1 : 0);
86 	}
87 
88 	if (us->want == 0) {
89 		/*
90 		 * Determine the number of octets that make up this character
91 		 * from the first octet, and a mask that extracts the
92 		 * interesting bits of the first octet. We already know
93 		 * the character is at least two bytes long.
94 		 *
95 		 * We also specify a lower bound for the character code to
96 		 * detect redundant, non-"shortest form" encodings. For
97 		 * example, the sequence C0 80 is _not_ a legal representation
98 		 * of the null character. This enforces a 1-to-1 mapping
99 		 * between character codes and their multibyte representations.
100 		 */
101 		ch = (unsigned char)*s;
102 		if ((ch & 0x80) == 0) {
103 			mask = 0x7f;
104 			want = 1;
105 			lbound = 0;
106 		} else if ((ch & 0xe0) == 0xc0) {
107 			mask = 0x1f;
108 			want = 2;
109 			lbound = 0x80;
110 		} else if ((ch & 0xf0) == 0xe0) {
111 			mask = 0x0f;
112 			want = 3;
113 			lbound = 0x800;
114 		} else if ((ch & 0xf8) == 0xf0) {
115 			mask = 0x07;
116 			want = 4;
117 			lbound = 0x10000;
118 		} else {
119 			/*
120 			 * Malformed input; input is not UTF-8.
121 			 * See RFC 3629.
122 			 */
123 			errno = EILSEQ;
124 			return ((size_t)-1);
125 		}
126 	} else {
127 		want = us->want;
128 		lbound = us->lbound;
129 	}
130 
131 	/*
132 	 * Decode the octet sequence representing the character in chunks
133 	 * of 6 bits, most significant first.
134 	 */
135 	if (us->want == 0)
136 		wch = (unsigned char)*s++ & mask;
137 	else
138 		wch = us->ch;
139 	for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) {
140 		if ((*s & 0xc0) != 0x80) {
141 			/*
142 			 * Malformed input; bad characters in the middle
143 			 * of a character.
144 			 */
145 			errno = EILSEQ;
146 			return ((size_t)-1);
147 		}
148 		wch <<= 6;
149 		wch |= *s++ & 0x3f;
150 	}
151 	if (i < want) {
152 		/* Incomplete multibyte sequence. */
153 		us->want = want - i;
154 		us->lbound = lbound;
155 		us->ch = wch;
156 		return ((size_t)-2);
157 	}
158 	if (wch < lbound) {
159 		/*
160 		 * Malformed input; redundant encoding.
161 		 */
162 		errno = EILSEQ;
163 		return ((size_t)-1);
164 	}
165 	if ((wch >= 0xd800 && wch <= 0xdfff) ||
166 	    wch == 0xfffe || wch == 0xffff) {
167 		/*
168 		 * Malformed input; invalid code points.
169 		 */
170 		errno = EILSEQ;
171 		return ((size_t)-1);
172 	}
173 	if (pwc != NULL)
174 		*pwc = wch;
175 	us->want = 0;
176 	return (wch == L'\0' ? 0 : want);
177 }
178 
179 int
180 /*ARGSUSED*/
181 _citrus_utf8_ctype_mbsinit(const void * __restrict pspriv)
182 {
183 	return (pspriv == NULL ||
184 	    ((const struct _utf8_state *)pspriv)->want == 0);
185 }
186 
187 size_t
188 /*ARGSUSED*/
189 _citrus_utf8_ctype_mbsrtowcs(wchar_t * __restrict pwcs,
190 			     const char ** __restrict s, size_t n,
191 			     void * __restrict pspriv)
192 {
193 	struct _utf8_state *us;
194 	const char *src;
195 	size_t nchr;
196 	wchar_t wc;
197 	size_t nb;
198 
199 	us = (struct _utf8_state *)pspriv;
200 	src = *s;
201 	nchr = 0;
202 
203 	if (pwcs == NULL) {
204 		/*
205 		 * The fast path in the loop below is not safe if an ASCII
206 		 * character appears as anything but the first byte of a
207 		 * multibyte sequence. Check now to avoid doing it in the loop.
208 		 */
209 		if (us->want > 0 && (signed char)*src > 0) {
210 			errno = EILSEQ;
211 			return ((size_t)-1);
212 		}
213 		for (;;) {
214 			if ((signed char)*src > 0) {
215 				/*
216 				 * Fast path for plain ASCII characters
217 				 * excluding NUL.
218 				 */
219 				nb = 1;
220 			} else {
221 				nb = _citrus_utf8_ctype_mbrtowc(&wc, src,
222 				    _CITRUS_UTF8_MB_CUR_MAX, us);
223 				if (nb == (size_t)-1) {
224 					/* Invalid sequence. */
225 					return (nb);
226 				}
227 				if (nb == 0 || nb == (size_t)-2) {
228 					return (nchr);
229 				}
230 			}
231 
232 			src += nb;
233 			nchr++;
234 		}
235 		/*NOTREACHED*/
236 	}
237 
238 	/*
239 	 * The fast path in the loop below is not safe if an ASCII
240 	 * character appears as anything but the first byte of a
241 	 * multibyte sequence. Check now to avoid doing it in the loop.
242 	 */
243 	if (n > 0 && us->want > 0 && (signed char)*src > 0) {
244 		errno = EILSEQ;
245 		return ((size_t)-1);
246 	}
247 	while (n-- > 0) {
248 		if ((signed char)*src > 0) {
249 			/*
250 			 * Fast path for plain ASCII characters
251 			 * excluding NUL.
252 			 */
253 			*pwcs = (wchar_t)*src;
254 			nb = 1;
255 		} else {
256 			nb = _citrus_utf8_ctype_mbrtowc(pwcs, src,
257 			    _CITRUS_UTF8_MB_CUR_MAX, us);
258 			if (nb == (size_t)-1) {
259 				*s = src;
260 				return (nb);
261 			}
262 			if (nb == (size_t)-2) {
263 				*s = src;
264 				return (nchr);
265 			}
266 			if (nb == 0) {
267 				*s = NULL;
268 				return (nchr);
269 			}
270 		}
271 		src += nb;
272 		nchr++;
273 		pwcs++;
274 	}
275 	*s = src;
276 	return (nchr);
277 }
278 
279 size_t
280 /*ARGSUSED*/
281 _citrus_utf8_ctype_wcrtomb(char * __restrict s,
282 			   wchar_t wc, void * __restrict pspriv)
283 {
284 	struct _utf8_state *us;
285 	unsigned char lead;
286 	int i, len;
287 
288 	us = (struct _utf8_state *)pspriv;
289 
290 	if (us->want != 0) {
291 		errno = EINVAL;
292 		return ((size_t)-1);
293 	}
294 
295 	if (s == NULL) {
296 		/* Reset to initial shift state (no-op) */
297 		return (1);
298 	}
299 
300 	if ((wc & ~0x7f) == 0) {
301 		/* Fast path for plain ASCII characters. */
302 		*s = (char)wc;
303 		return (1);
304 	}
305 
306 	/*
307 	 * Determine the number of octets needed to represent this character.
308 	 * We always output the shortest sequence possible. Also specify the
309 	 * first few bits of the first octet, which contains the information
310 	 * about the sequence length.
311 	 */
312 	if ((wc & ~0x7f) == 0) {
313 		lead = 0;
314 		len = 1;
315 	} else if ((wc & ~0x7ff) == 0) {
316 		lead = 0xc0;
317 		len = 2;
318 	} else if ((wc & ~0xffff) == 0) {
319 		lead = 0xe0;
320 		len = 3;
321 	} else if ((wc & ~0x1fffff) == 0) {
322 		lead = 0xf0;
323 		len = 4;
324 	} else {
325 		errno = EILSEQ;
326 		return ((size_t)-1);
327 	}
328 
329 	/*
330 	 * Output the octets representing the character in chunks
331 	 * of 6 bits, least significant last. The first octet is
332 	 * a special case because it contains the sequence length
333 	 * information.
334 	 */
335 	for (i = len - 1; i > 0; i--) {
336 		s[i] = (wc & 0x3f) | 0x80;
337 		wc >>= 6;
338 	}
339 	*s = (wc & 0xff) | lead;
340 
341 	return (len);
342 }
343 
344 size_t
345 /*ARGSUSED*/
346 _citrus_utf8_ctype_wcsrtombs(char * __restrict s,
347 			     const wchar_t ** __restrict pwcs, size_t n,
348 			     void * __restrict pspriv)
349 {
350 	struct _utf8_state *us;
351 	char buf[_CITRUS_UTF8_MB_CUR_MAX];
352 	const wchar_t *src;
353 	size_t nbytes;
354 	size_t nb;
355 
356 	us = (struct _utf8_state *)pspriv;
357 
358 	if (us->want != 0) {
359 		errno = EINVAL;
360 		return ((size_t)-1);
361 	}
362 
363 	src = *pwcs;
364 	nbytes = 0;
365 
366 	if (s == NULL) {
367 		for (;;) {
368 			if (0 <= *src && *src < 0x80)
369 				/* Fast path for plain ASCII characters. */
370 				nb = 1;
371 			else {
372 				nb = _citrus_utf8_ctype_wcrtomb(buf, *src, us);
373 				if (nb == (size_t)-1) {
374 					/* Invalid character */
375 					return (nb);
376 				}
377 			}
378 			if (*src == L'\0') {
379 				return (nbytes + nb - 1);
380 			}
381 			src++;
382 			nbytes += nb;
383 		}
384 		/*NOTREACHED*/
385 	}
386 
387 	while (n > 0) {
388 		if (0 <= *src && *src < 0x80) {
389 			/* Fast path for plain ASCII characters. */
390 			nb = 1;
391 			*s = *src;
392 		} else if (n > (size_t)_CITRUS_UTF8_MB_CUR_MAX) {
393 			/* Enough space to translate in-place. */
394 			nb = _citrus_utf8_ctype_wcrtomb(s, *src, us);
395 			if (nb == (size_t)-1) {
396 				*pwcs = src;
397 				return (nb);
398 			}
399 		} else {
400 			/*
401 			 * May not be enough space; use temp. buffer.
402 			 */
403 			nb = _citrus_utf8_ctype_wcrtomb(buf, *src, us);
404 			if (nb == (size_t)-1) {
405 				*pwcs = src;
406 				return (nb);
407 			}
408 			if (nb > n)
409 				/* MB sequence for character won't fit. */
410 				break;
411 			memcpy(s, buf, nb);
412 		}
413 		if (*src == L'\0') {
414 			*pwcs = NULL;
415 			return (nbytes + nb - 1);
416 		}
417 		src++;
418 		s += nb;
419 		n -= nb;
420 		nbytes += nb;
421 	}
422 	*pwcs = src;
423 	return (nbytes);
424 }
425