xref: /dragonfly/lib/libc/locale/euc.c (revision 73610d44)
1 /*
2  * Copyright 2013 Garrett D'Amore <garrett@damore.org>
3  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
4  * Copyright (c) 2002-2004 Tim J. Robbins. All rights reserved.
5  * Copyright (c) 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * This code is derived from software contributed to Berkeley by
9  * Paul Borman at Krystal Technologies.
10  *
11  * Copyright (c) 2011 The FreeBSD Foundation
12  * All rights reserved.
13  * Portions of this software were developed by David Chisnall
14  * under sponsorship from the FreeBSD Foundation.
15  *
16  * Redistribution and use in source and binary forms, with or without
17  * modification, are permitted provided that the following conditions
18  * are met:
19  * 1. Redistributions of source code must retain the above copyright
20  *    notice, this list of conditions and the following disclaimer.
21  * 2. Redistributions in binary form must reproduce the above copyright
22  *    notice, this list of conditions and the following disclaimer in the
23  *    documentation and/or other materials provided with the distribution.
24  * 3. Neither the name of the University nor the names of its contributors
25  *    may be used to endorse or promote products derived from this software
26  *    without specific prior written permission.
27  *
28  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
29  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
32  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38  * SUCH DAMAGE.
39  *
40  * @(#)euc.c	8.1 (Berkeley) 6/4/93
41  */
42 
43 #include <sys/param.h>
44 
45 #include <errno.h>
46 #include <limits.h>
47 #include <runetype.h>
48 #include <stdlib.h>
49 #include <string.h>
50 #include <wchar.h>
51 #include "mblocal.h"
52 
/*
 * The functions in this file only assign the per-locale l->__mb_sb_limit
 * member; this global is declared here but not otherwise referenced by the
 * code visible in this file.
 */
extern int __mb_sb_limit;

/*
 * Encoding-independent workers.  The trailing arguments give the lead byte
 * and sequence width for code set 2 and code set 3.
 */
static size_t	_EUC_mbrtowc_impl(wchar_t * __restrict pwc,
		    const char * __restrict s, size_t n,
		    mbstate_t * __restrict ps,
		    uint8_t cs2, uint8_t cs2width,
		    uint8_t cs3, uint8_t cs3width);
static size_t	_EUC_wcrtomb_impl(char * __restrict s, wchar_t wc,
		    mbstate_t * __restrict ps,
		    uint8_t cs2, uint8_t cs2width,
		    uint8_t cs3, uint8_t cs3width);

/* State query shared by every EUC variant. */
static int	_EUC_mbsinit(const mbstate_t *ps);

/* EUC-CN */
static size_t	_EUC_CN_mbrtowc(wchar_t * __restrict pwc,
		    const char * __restrict s, size_t n,
		    mbstate_t * __restrict ps);
static size_t	_EUC_CN_wcrtomb(char * __restrict s, wchar_t wc,
		    mbstate_t * __restrict ps);
static size_t	_EUC_CN_mbsnrtowcs(wchar_t * __restrict dst,
		    const char ** __restrict src, size_t nms, size_t len,
		    mbstate_t * __restrict ps);
static size_t	_EUC_CN_wcsnrtombs(char * __restrict dst,
		    const wchar_t ** __restrict src, size_t nwc, size_t len,
		    mbstate_t * __restrict ps);

/* EUC-JP */
static size_t	_EUC_JP_mbrtowc(wchar_t * __restrict pwc,
		    const char * __restrict s, size_t n,
		    mbstate_t * __restrict ps);
static size_t	_EUC_JP_wcrtomb(char * __restrict s, wchar_t wc,
		    mbstate_t * __restrict ps);
static size_t	_EUC_JP_mbsnrtowcs(wchar_t * __restrict dst,
		    const char ** __restrict src, size_t nms, size_t len,
		    mbstate_t * __restrict ps);
static size_t	_EUC_JP_wcsnrtombs(char * __restrict dst,
		    const wchar_t ** __restrict src, size_t nwc, size_t len,
		    mbstate_t * __restrict ps);

/* EUC-KR */
static size_t	_EUC_KR_mbrtowc(wchar_t * __restrict pwc,
		    const char * __restrict s, size_t n,
		    mbstate_t * __restrict ps);
static size_t	_EUC_KR_wcrtomb(char * __restrict s, wchar_t wc,
		    mbstate_t * __restrict ps);
static size_t	_EUC_KR_mbsnrtowcs(wchar_t * __restrict dst,
		    const char ** __restrict src, size_t nms, size_t len,
		    mbstate_t * __restrict ps);
static size_t	_EUC_KR_wcsnrtombs(char * __restrict dst,
		    const wchar_t ** __restrict src, size_t nwc, size_t len,
		    mbstate_t * __restrict ps);

/* EUC-TW */
static size_t	_EUC_TW_mbrtowc(wchar_t * __restrict pwc,
		    const char * __restrict s, size_t n,
		    mbstate_t * __restrict ps);
static size_t	_EUC_TW_wcrtomb(char * __restrict s, wchar_t wc,
		    mbstate_t * __restrict ps);
static size_t	_EUC_TW_mbsnrtowcs(wchar_t * __restrict dst,
		    const char ** __restrict src, size_t nms, size_t len,
		    mbstate_t * __restrict ps);
static size_t	_EUC_TW_wcsnrtombs(char * __restrict dst,
		    const wchar_t ** __restrict src, size_t nwc, size_t len,
		    mbstate_t * __restrict ps);
105 
/*
 * Conversion state kept inside the caller's mbstate_t between calls.
 */
typedef struct {
	wchar_t	ch;	/* bytes of a partial character gathered so far */
	int	set;	/* not referenced by the code visible in this file */
	int	want;	/* bytes still missing; 0 means initial state */
} _EucState;

/*
 * Report whether ps describes the initial conversion state: true (1) for a
 * NULL pointer or when no bytes of a character are outstanding, 0 otherwise.
 */
static int
_EUC_mbsinit(const mbstate_t *ps)
{
	const _EucState *es = (const _EucState *)ps;

	if (es == NULL)
		return (1);
	return (es->want == 0);
}
118 
119 /*
120  * EUC-CN uses CS0, CS1 and CS2 (4 bytes).
121  */
122 int
123 _EUC_CN_init(struct xlocale_ctype *l, _RuneLocale *rl)
124 {
125 	l->__mbrtowc = _EUC_CN_mbrtowc;
126 	l->__wcrtomb = _EUC_CN_wcrtomb;
127 	l->__mbsnrtowcs = _EUC_CN_mbsnrtowcs;
128 	l->__wcsnrtombs = _EUC_CN_wcsnrtombs;
129 	l->__mbsinit = _EUC_mbsinit;
130 
131 	l->runes = rl;
132 	l->__mb_cur_max = 4;
133 	l->__mb_sb_limit = 256;
134 	return (0);
135 }
136 
137 static size_t
138 _EUC_CN_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s,
139     size_t n, mbstate_t * __restrict ps)
140 {
141 	return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 4, 0, 0));
142 }
143 
144 static size_t
145 _EUC_CN_mbsnrtowcs(wchar_t * __restrict dst,
146     const char ** __restrict src,
147     size_t nms, size_t len, mbstate_t * __restrict ps)
148 {
149 	return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_CN_mbrtowc));
150 }
151 
152 static size_t
153 _EUC_CN_wcrtomb(char * __restrict s, wchar_t wc,
154     mbstate_t * __restrict ps)
155 {
156 	return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 4, 0, 0));
157 }
158 
159 static size_t
160 _EUC_CN_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src,
161 	size_t nwc, size_t len, mbstate_t * __restrict ps)
162 {
163 	return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_CN_wcrtomb));
164 }
165 
166 /*
167  * EUC-KR uses only CS0 and CS1.
168  */
169 int
170 _EUC_KR_init(struct xlocale_ctype *l, _RuneLocale *rl)
171 {
172 	l->__mbrtowc = _EUC_KR_mbrtowc;
173 	l->__wcrtomb = _EUC_KR_wcrtomb;
174 	l->__mbsnrtowcs = _EUC_KR_mbsnrtowcs;
175 	l->__wcsnrtombs = _EUC_KR_wcsnrtombs;
176 	l->__mbsinit = _EUC_mbsinit;
177 
178 	l->runes = rl;
179 	l->__mb_cur_max = 2;
180 	l->__mb_sb_limit = 128;
181 	return (0);
182 }
183 
/*
 * EUC-KR mbrtowc(): run the shared EUC decoder with both the code set 2
 * and code set 3 slots passed as 0.
 */
static size_t
_EUC_KR_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s,
    size_t n, mbstate_t * __restrict ps)
{
	size_t rv;

	rv = _EUC_mbrtowc_impl(pwc, s, n, ps, 0, 0, 0, 0);
	return (rv);
}
190 
191 static size_t
192 _EUC_KR_mbsnrtowcs(wchar_t * __restrict dst,
193     const char ** __restrict src,
194     size_t nms, size_t len, mbstate_t * __restrict ps)
195 {
196 	return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_KR_mbrtowc));
197 }
198 
/*
 * EUC-KR wcrtomb(): run the shared EUC encoder with both the code set 2
 * and code set 3 slots passed as 0.
 */
static size_t
_EUC_KR_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps)
{
	size_t rv;

	rv = _EUC_wcrtomb_impl(s, wc, ps, 0, 0, 0, 0);
	return (rv);
}
205 
206 static size_t
207 _EUC_KR_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src,
208 	size_t nwc, size_t len, mbstate_t * __restrict ps)
209 {
210 	return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_KR_wcrtomb));
211 }
212 
213 /*
214  * EUC-JP uses CS0, CS1, CS2, and CS3.
215  */
216 int
217 _EUC_JP_init(struct xlocale_ctype *l, _RuneLocale *rl)
218 {
219 	l->__mbrtowc = _EUC_JP_mbrtowc;
220 	l->__wcrtomb = _EUC_JP_wcrtomb;
221 	l->__mbsnrtowcs = _EUC_JP_mbsnrtowcs;
222 	l->__wcsnrtombs = _EUC_JP_wcsnrtombs;
223 	l->__mbsinit = _EUC_mbsinit;
224 
225 	l->runes = rl;
226 	l->__mb_cur_max = 3;
227 	l->__mb_sb_limit = 196;
228 	return (0);
229 }
230 
231 static size_t
232 _EUC_JP_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s,
233     size_t n, mbstate_t * __restrict ps)
234 {
235 	return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 2, SS3, 3));
236 }
237 
238 static size_t
239 _EUC_JP_mbsnrtowcs(wchar_t * __restrict dst,
240     const char ** __restrict src,
241     size_t nms, size_t len, mbstate_t * __restrict ps)
242 {
243 	return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_JP_mbrtowc));
244 }
245 
246 static size_t
247 _EUC_JP_wcrtomb(char * __restrict s, wchar_t wc,
248     mbstate_t * __restrict ps)
249 {
250 	return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 2, SS3, 3));
251 }
252 
253 static size_t
254 _EUC_JP_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src,
255 	size_t nwc, size_t len, mbstate_t * __restrict ps)
256 {
257 	return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_JP_wcrtomb));
258 }
259 
260 /*
261  * EUC-TW uses CS0, CS1, and CS2.
262  */
263 int
264 _EUC_TW_init(struct xlocale_ctype *l, _RuneLocale *rl)
265 {
266 	l->__mbrtowc = _EUC_TW_mbrtowc;
267 	l->__wcrtomb = _EUC_TW_wcrtomb;
268 	l->__mbsnrtowcs = _EUC_TW_mbsnrtowcs;
269 	l->__wcsnrtombs = _EUC_TW_wcsnrtombs;
270 	l->__mbsinit = _EUC_mbsinit;
271 
272 	l->runes = rl;
273 	l->__mb_cur_max = 4;
274 	l->__mb_sb_limit = 256;
275 	return (0);
276 }
277 
278 static size_t
279 _EUC_TW_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s,
280 	size_t n, mbstate_t * __restrict ps)
281 {
282 	return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 4, 0, 0));
283 }
284 
285 static size_t
286 _EUC_TW_mbsnrtowcs(wchar_t * __restrict dst,
287 	const char ** __restrict src,
288 	size_t nms, size_t len, mbstate_t * __restrict ps)
289 {
290 	return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_TW_mbrtowc));
291 }
292 
293 static size_t
294 _EUC_TW_wcrtomb(char * __restrict s, wchar_t wc,
295 	mbstate_t * __restrict ps)
296 {
297 	return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 4, 0, 0));
298 }
299 
300 static size_t
301 _EUC_TW_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src,
302 	size_t nwc, size_t len, mbstate_t * __restrict ps)
303 {
304 	return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_TW_wcrtomb));
305 }
306 
307 /*
308  * Common EUC code.
309  */
310 
311 static size_t
312 _EUC_mbrtowc_impl(wchar_t * __restrict pwc, const char * __restrict s,
313 	size_t n, mbstate_t * __restrict ps,
314 	uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width)
315 {
316 	_EucState *es;
317 	int i, want;
318 	wchar_t wc;
319 	unsigned char ch;
320 
321 	es = (_EucState *)ps;
322 
323 	if (es->want < 0 || es->want > MB_CUR_MAX) {
324 		errno = EINVAL;
325 		return ((size_t)-1);
326 	}
327 
328 	if (s == NULL) {
329 		s = "";
330 		n = 1;
331 		pwc = NULL;
332 	}
333 
334 	if (n == 0)
335 		/* Incomplete multibyte sequence */
336 		return ((size_t)-2);
337 
338 	if (es->want == 0) {
339 		/* Fast path for plain ASCII (CS0) */
340 		if (((ch = (unsigned char)*s) & 0x80) == 0) {
341 			if (pwc != NULL)
342 				*pwc = ch;
343 			return (ch != '\0' ? 1 : 0);
344 		}
345 
346 		if (ch >= 0xa1) {
347 			/* CS1 */
348 			want = 2;
349 		} else if (ch == cs2) {
350 			want = cs2width;
351 		} else if (ch == cs3) {
352 			want = cs3width;
353 		} else {
354 			errno = EILSEQ;
355 			return ((size_t)-1);
356 		}
357 
358 
359 		es->want = want;
360 		es->ch = 0;
361 	} else {
362 		want = es->want;
363 		wc = es->ch;
364 	}
365 
366 	for (i = 0; i < MIN(want, n); i++) {
367 		wc <<= 8;
368 		wc |= *s;
369 		s++;
370 	}
371 	if (i < want) {
372 		/* Incomplete multibyte sequence */
373 		es->want = want - i;
374 		es->ch = wc;
375 		return ((size_t)-2);
376 	}
377 	if (pwc != NULL)
378 		*pwc = wc;
379 	es->want = 0;
380 	return (wc == L'\0' ? 0 : want);
381 }
382 
383 static size_t
384 _EUC_wcrtomb_impl(char * __restrict s, wchar_t wc,
385     mbstate_t * __restrict ps,
386     uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width)
387 {
388 	_EucState *es;
389 	int i, len;
390 	wchar_t nm;
391 
392 	es = (_EucState *)ps;
393 
394 	if (es->want != 0) {
395 		errno = EINVAL;
396 		return ((size_t)-1);
397 	}
398 
399 	if (s == NULL)
400 		/* Reset to initial shift state (no-op) */
401 		return (1);
402 
403 	if ((wc & ~0x7f) == 0) {
404 		/* Fast path for plain ASCII (CS0) */
405 		*s = (char)wc;
406 		return (1);
407 	}
408 
409 	/* Determine the "length" */
410 	if ((unsigned)wc > 0xffffff) {
411 		len = 4;
412 	} else if ((unsigned)wc > 0xffff) {
413 		len = 3;
414 	} else if ((unsigned)wc > 0xff) {
415 		len = 2;
416 	} else {
417 		len = 1;
418 	}
419 
420 	if (len > MB_CUR_MAX) {
421 		errno = EILSEQ;
422 		return ((size_t)-1);
423 	}
424 
425 	/* This first check excludes CS1, which is implicitly valid. */
426 	if ((wc < 0xa100) || (wc > 0xffff)) {
427 		/* Check for valid CS2 or CS3 */
428 		nm = (wc >> ((len - 1) * 8));
429 		if (nm == cs2) {
430 			if (len != cs2width) {
431 				errno = EILSEQ;
432 				return ((size_t)-1);
433 			}
434 		} else if (nm == cs3) {
435 			if (len != cs3width) {
436 				errno = EILSEQ;
437 				return ((size_t)-1);
438 			}
439 		} else {
440 			errno = EILSEQ;
441 			return ((size_t)-1);
442 		}
443 	}
444 
445 	/* Stash the bytes, least significant last */
446 	for (i = len - 1; i >= 0; i--) {
447 		s[i] = (wc & 0xff);
448 		wc >>= 8;
449 	}
450 	return (len);
451 }
452