xref: /freebsd/lib/libc/locale/euc.c (revision a0ee8cc6)
1 /*-
2  * Copyright 2013 Garrett D'Amore <garrett@damore.org>
3  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
4  * Copyright (c) 2002-2004 Tim J. Robbins. All rights reserved.
5  * Copyright (c) 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * This code is derived from software contributed to Berkeley by
9  * Paul Borman at Krystal Technologies.
10  *
11  * Copyright (c) 2011 The FreeBSD Foundation
12  * All rights reserved.
13  * Portions of this software were developed by David Chisnall
14  * under sponsorship from the FreeBSD Foundation.
15  *
16  * Redistribution and use in source and binary forms, with or without
17  * modification, are permitted provided that the following conditions
18  * are met:
19  * 1. Redistributions of source code must retain the above copyright
20  *    notice, this list of conditions and the following disclaimer.
21  * 2. Redistributions in binary form must reproduce the above copyright
22  *    notice, this list of conditions and the following disclaimer in the
23  *    documentation and/or other materials provided with the distribution.
24  * 3. Neither the name of the University nor the names of its contributors
25  *    may be used to endorse or promote products derived from this software
26  *    without specific prior written permission.
27  *
28  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
29  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
32  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38  * SUCH DAMAGE.
39  */
40 
41 #if defined(LIBC_SCCS) && !defined(lint)
42 static char sccsid[] = "@(#)euc.c	8.1 (Berkeley) 6/4/93";
43 #endif /* LIBC_SCCS and not lint */
44 #include <sys/param.h>
45 __FBSDID("$FreeBSD$");
46 
47 #include <errno.h>
48 #include <limits.h>
49 #include <runetype.h>
50 #include <stdlib.h>
51 #include <string.h>
52 #include <wchar.h>
53 #include "mblocal.h"
54 
/* NOTE(review): not referenced by the code visible in this file. */
extern int __mb_sb_limit;

/*
 * Shared converters used by every EUC variant.  The four trailing uint8_t
 * arguments are, in order: the CS2 lead byte, the CS2 sequence width, the
 * CS3 lead byte and the CS3 sequence width.
 */
static size_t	_EUC_mbrtowc_impl(wchar_t * __restrict, const char * __restrict,
    size_t, mbstate_t * __restrict, uint8_t, uint8_t, uint8_t, uint8_t);
static size_t	_EUC_wcrtomb_impl(char * __restrict, wchar_t,
    mbstate_t * __restrict, uint8_t, uint8_t, uint8_t, uint8_t);

/* Per-variant multibyte -> wide character converters. */
static size_t	_EUC_CN_mbrtowc(wchar_t * __restrict, const char * __restrict,
		    size_t, mbstate_t * __restrict);
static size_t	_EUC_JP_mbrtowc(wchar_t * __restrict, const char * __restrict,
		    size_t, mbstate_t * __restrict);
static size_t	_EUC_KR_mbrtowc(wchar_t * __restrict, const char * __restrict,
		    size_t, mbstate_t * __restrict);
static size_t	_EUC_TW_mbrtowc(wchar_t * __restrict, const char * __restrict,
		    size_t, mbstate_t * __restrict);

/* Per-variant wide -> multibyte character converters. */
static size_t	_EUC_CN_wcrtomb(char * __restrict, wchar_t,
		    mbstate_t * __restrict);
static size_t	_EUC_JP_wcrtomb(char * __restrict, wchar_t,
		    mbstate_t * __restrict);
static size_t	_EUC_KR_wcrtomb(char * __restrict, wchar_t,
		    mbstate_t * __restrict);
static size_t	_EUC_TW_wcrtomb(char * __restrict, wchar_t,
		    mbstate_t * __restrict);

/* Per-variant multibyte -> wide string converters. */
static size_t	_EUC_CN_mbsnrtowcs(wchar_t * __restrict,
		    const char ** __restrict, size_t, size_t,
		    mbstate_t * __restrict);
static size_t	_EUC_JP_mbsnrtowcs(wchar_t * __restrict,
		    const char ** __restrict, size_t, size_t,
		    mbstate_t * __restrict);
static size_t	_EUC_KR_mbsnrtowcs(wchar_t * __restrict,
		    const char ** __restrict, size_t, size_t,
		    mbstate_t * __restrict);
static size_t	_EUC_TW_mbsnrtowcs(wchar_t * __restrict,
		    const char ** __restrict, size_t, size_t,
		    mbstate_t * __restrict);

/* Per-variant wide -> multibyte string converters. */
static size_t	_EUC_CN_wcsnrtombs(char * __restrict,
		    const wchar_t ** __restrict, size_t, size_t,
		    mbstate_t * __restrict);
static size_t	_EUC_JP_wcsnrtombs(char * __restrict,
		    const wchar_t ** __restrict, size_t, size_t,
		    mbstate_t * __restrict);
static size_t	_EUC_KR_wcsnrtombs(char * __restrict,
		    const wchar_t ** __restrict, size_t, size_t,
		    mbstate_t * __restrict);
static size_t	_EUC_TW_wcsnrtombs(char * __restrict,
		    const wchar_t ** __restrict, size_t, size_t,
		    mbstate_t * __restrict);

/* Initial-state test shared by all variants. */
static int	_EUC_mbsinit(const mbstate_t *);
107 
/*
 * Conversion state.  The converters in this file obtain it by casting the
 * caller's mbstate_t pointer to this type.
 */
typedef struct {
	wchar_t	ch;	/* bytes of the partial character gathered so far */
	int	set;	/* not referenced by the code visible in this file */
	int	want;	/* bytes still needed to finish the character; 0 when idle */
} _EucState;
113 
114 static int
115 _EUC_mbsinit(const mbstate_t *ps)
116 {
117 
118 	return (ps == NULL || ((const _EucState *)ps)->want == 0);
119 }
120 
121 /*
122  * EUC-CN uses CS0, CS1 and CS2 (4 bytes).
123  */
124 int
125 _EUC_CN_init(struct xlocale_ctype *l, _RuneLocale *rl)
126 {
127 	l->__mbrtowc = _EUC_CN_mbrtowc;
128 	l->__wcrtomb = _EUC_CN_wcrtomb;
129 	l->__mbsnrtowcs = _EUC_CN_mbsnrtowcs;
130 	l->__wcsnrtombs = _EUC_CN_wcsnrtombs;
131 	l->__mbsinit = _EUC_mbsinit;
132 
133 	l->runes = rl;
134 	l->__mb_cur_max = 4;
135 	l->__mb_sb_limit = 256;
136 	return (0);
137 }
138 
139 static size_t
140 _EUC_CN_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s,
141     size_t n, mbstate_t * __restrict ps)
142 {
143 	return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 4, 0, 0));
144 }
145 
146 static size_t
147 _EUC_CN_mbsnrtowcs(wchar_t * __restrict dst,
148     const char ** __restrict src,
149     size_t nms, size_t len, mbstate_t * __restrict ps)
150 {
151 	return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_CN_mbrtowc));
152 }
153 
154 static size_t
155 _EUC_CN_wcrtomb(char * __restrict s, wchar_t wc,
156     mbstate_t * __restrict ps)
157 {
158 	return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 4, 0, 0));
159 }
160 
161 static size_t
162 _EUC_CN_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src,
163 	size_t nwc, size_t len, mbstate_t * __restrict ps)
164 {
165 	return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_CN_wcrtomb));
166 }
167 
168 /*
169  * EUC-KR uses only CS0 and CS1.
170  */
171 int
172 _EUC_KR_init(struct xlocale_ctype *l, _RuneLocale *rl)
173 {
174 	l->__mbrtowc = _EUC_KR_mbrtowc;
175 	l->__wcrtomb = _EUC_KR_wcrtomb;
176 	l->__mbsnrtowcs = _EUC_KR_mbsnrtowcs;
177 	l->__wcsnrtombs = _EUC_KR_wcsnrtombs;
178 	l->__mbsinit = _EUC_mbsinit;
179 
180 	l->runes = rl;
181 	l->__mb_cur_max = 2;
182 	l->__mb_sb_limit = 128;
183 	return (0);
184 }
185 
/*
 * EUC-KR decoder: neither code set 2 nor code set 3 is used, so both lead
 * bytes and widths are passed as 0.
 */
static size_t
_EUC_KR_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s,
    size_t n, mbstate_t * __restrict ps)
{
	const uint8_t cs2_lead = 0, cs2_len = 0;
	const uint8_t cs3_lead = 0, cs3_len = 0;

	return (_EUC_mbrtowc_impl(pwc, s, n, ps,
	    cs2_lead, cs2_len, cs3_lead, cs3_len));
}
192 
193 static size_t
194 _EUC_KR_mbsnrtowcs(wchar_t * __restrict dst,
195     const char ** __restrict src,
196     size_t nms, size_t len, mbstate_t * __restrict ps)
197 {
198 	return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_KR_mbrtowc));
199 }
200 
/*
 * EUC-KR encoder: neither code set 2 nor code set 3 is used, so both lead
 * bytes and widths are passed as 0.
 */
static size_t
_EUC_KR_wcrtomb(char * __restrict s, wchar_t wc,
	mbstate_t * __restrict ps)
{
	const uint8_t cs2_lead = 0, cs2_len = 0;
	const uint8_t cs3_lead = 0, cs3_len = 0;

	return (_EUC_wcrtomb_impl(s, wc, ps,
	    cs2_lead, cs2_len, cs3_lead, cs3_len));
}
207 
208 static size_t
209 _EUC_KR_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src,
210 	size_t nwc, size_t len, mbstate_t * __restrict ps)
211 {
212 	return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_KR_wcrtomb));
213 }
214 
215 /*
216  * EUC-JP uses CS0, CS1, CS2, and CS3.
217  */
218 int
219 _EUC_JP_init(struct xlocale_ctype *l, _RuneLocale *rl)
220 {
221 	l->__mbrtowc = _EUC_JP_mbrtowc;
222 	l->__wcrtomb = _EUC_JP_wcrtomb;
223 	l->__mbsnrtowcs = _EUC_JP_mbsnrtowcs;
224 	l->__wcsnrtombs = _EUC_JP_wcsnrtombs;
225 	l->__mbsinit = _EUC_mbsinit;
226 
227 	l->runes = rl;
228 	l->__mb_cur_max = 3;
229 	l->__mb_sb_limit = 196;
230 	return (0);
231 }
232 
233 static size_t
234 _EUC_JP_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s,
235     size_t n, mbstate_t * __restrict ps)
236 {
237 	return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 2, SS3, 3));
238 }
239 
240 static size_t
241 _EUC_JP_mbsnrtowcs(wchar_t * __restrict dst,
242     const char ** __restrict src,
243     size_t nms, size_t len, mbstate_t * __restrict ps)
244 {
245 	return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_JP_mbrtowc));
246 }
247 
248 static size_t
249 _EUC_JP_wcrtomb(char * __restrict s, wchar_t wc,
250     mbstate_t * __restrict ps)
251 {
252 	return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 2, SS3, 3));
253 }
254 
255 static size_t
256 _EUC_JP_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src,
257 	size_t nwc, size_t len, mbstate_t * __restrict ps)
258 {
259 	return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_JP_wcrtomb));
260 }
261 
262 /*
263  * EUC-TW uses CS0, CS1, and CS2.
264  */
265 int
266 _EUC_TW_init(struct xlocale_ctype *l, _RuneLocale *rl)
267 {
268 	l->__mbrtowc = _EUC_TW_mbrtowc;
269 	l->__wcrtomb = _EUC_TW_wcrtomb;
270 	l->__mbsnrtowcs = _EUC_TW_mbsnrtowcs;
271 	l->__wcsnrtombs = _EUC_TW_wcsnrtombs;
272 	l->__mbsinit = _EUC_mbsinit;
273 
274 	l->runes = rl;
275 	l->__mb_cur_max = 4;
276 	l->__mb_sb_limit = 256;
277 	return (0);
278 }
279 
280 static size_t
281 _EUC_TW_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s,
282 	size_t n, mbstate_t * __restrict ps)
283 {
284 	return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 4, 0, 0));
285 }
286 
287 static size_t
288 _EUC_TW_mbsnrtowcs(wchar_t * __restrict dst,
289 	const char ** __restrict src,
290 	size_t nms, size_t len, mbstate_t * __restrict ps)
291 {
292 	return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_TW_mbrtowc));
293 }
294 
295 static size_t
296 _EUC_TW_wcrtomb(char * __restrict s, wchar_t wc,
297 	mbstate_t * __restrict ps)
298 {
299 	return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 4, 0, 0));
300 }
301 
302 static size_t
303 _EUC_TW_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src,
304 	size_t nwc, size_t len, mbstate_t * __restrict ps)
305 {
306 	return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_TW_wcrtomb));
307 }
308 
309 /*
310  * Common EUC code.
311  */
312 
313 static size_t
314 _EUC_mbrtowc_impl(wchar_t * __restrict pwc, const char * __restrict s,
315 	size_t n, mbstate_t * __restrict ps,
316 	uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width)
317 {
318 	_EucState *es;
319 	int i, want;
320 	wchar_t wc = 0;
321 	unsigned char ch, chs;
322 
323 	es = (_EucState *)ps;
324 
325 	if (es->want < 0 || es->want > MB_CUR_MAX) {
326 		errno = EINVAL;
327 		return ((size_t)-1);
328 	}
329 
330 	if (s == NULL) {
331 		s = "";
332 		n = 1;
333 		pwc = NULL;
334 	}
335 
336 	if (n == 0)
337 		/* Incomplete multibyte sequence */
338 		return ((size_t)-2);
339 
340 	if (es->want == 0) {
341 		/* Fast path for plain ASCII (CS0) */
342 		if (((ch = (unsigned char)*s) & 0x80) == 0) {
343 			if (pwc != NULL)
344 				*pwc = ch;
345 			return (ch != '\0' ? 1 : 0);
346 		}
347 
348 		if (ch >= 0xa1) {
349 			/* CS1 */
350 			want = 2;
351 		} else if (ch == cs2) {
352 			want = cs2width;
353 		} else if (ch == cs3) {
354 			want = cs3width;
355 		} else {
356 			errno = EILSEQ;
357 			return ((size_t)-1);
358 		}
359 
360 
361 		es->want = want;
362 		es->ch = 0;
363 	} else {
364 		want = es->want;
365 		wc = es->ch;
366 	}
367 
368 	for (i = 0; i < MIN(want, n); i++) {
369 		wc <<= 8;
370 		chs = *s;
371 		wc |= chs;
372 		s++;
373 	}
374 	if (i < want) {
375 		/* Incomplete multibyte sequence */
376 		es->want = want - i;
377 		es->ch = wc;
378 		errno = EILSEQ;
379 		return ((size_t)-2);
380 	}
381 	if (pwc != NULL)
382 		*pwc = wc;
383 	es->want = 0;
384 	return (wc == L'\0' ? 0 : want);
385 }
386 
387 static size_t
388 _EUC_wcrtomb_impl(char * __restrict s, wchar_t wc,
389     mbstate_t * __restrict ps,
390     uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width)
391 {
392 	_EucState *es;
393 	int i, len;
394 	wchar_t nm;
395 
396 	es = (_EucState *)ps;
397 
398 	if (es->want != 0) {
399 		errno = EINVAL;
400 		return ((size_t)-1);
401 	}
402 
403 	if (s == NULL)
404 		/* Reset to initial shift state (no-op) */
405 		return (1);
406 
407 	if ((wc & ~0x7f) == 0) {
408 		/* Fast path for plain ASCII (CS0) */
409 		*s = (char)wc;
410 		return (1);
411 	}
412 
413 	/* Determine the "length" */
414 	if ((unsigned)wc > 0xffffff) {
415 		len = 4;
416 	} else if ((unsigned)wc > 0xffff) {
417 		len = 3;
418 	} else if ((unsigned)wc > 0xff) {
419 		len = 2;
420 	} else {
421 		len = 1;
422 	}
423 
424 	if (len > MB_CUR_MAX) {
425 		errno = EILSEQ;
426 		return ((size_t)-1);
427 	}
428 
429 	/* This first check excludes CS1, which is implicitly valid. */
430 	if ((wc < 0xa100) || (wc > 0xffff)) {
431 		/* Check for valid CS2 or CS3 */
432 		nm = (wc >> ((len - 1) * 8));
433 		if (nm == cs2) {
434 			if (len != cs2width) {
435 				errno = EILSEQ;
436 				return ((size_t)-1);
437 			}
438 		} else if (nm == cs3) {
439 			if (len != cs3width) {
440 				errno = EILSEQ;
441 				return ((size_t)-1);
442 			}
443 		} else {
444 			errno = EILSEQ;
445 			return ((size_t)-1);
446 		}
447 	}
448 
449 	/* Stash the bytes, least significant last */
450 	for (i = len - 1; i >= 0; i--) {
451 		s[i] = (wc & 0xff);
452 		wc >>= 8;
453 	}
454 	return (len);
455 }
456