xref: /dragonfly/lib/libc/locale/euc.c (revision 2b7dbe20)
1 /*
2  * Copyright 2013 Garrett D'Amore <garrett@damore.org>
3  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
4  * Copyright (c) 2002-2004 Tim J. Robbins. All rights reserved.
5  * Copyright (c) 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * This code is derived from software contributed to Berkeley by
9  * Paul Borman at Krystal Technologies.
10  *
11  * Copyright (c) 2011 The FreeBSD Foundation
12  * All rights reserved.
13  * Portions of this software were developed by David Chisnall
14  * under sponsorship from the FreeBSD Foundation.
15  *
16  * Redistribution and use in source and binary forms, with or without
17  * modification, are permitted provided that the following conditions
18  * are met:
19  * 1. Redistributions of source code must retain the above copyright
20  *    notice, this list of conditions and the following disclaimer.
21  * 2. Redistributions in binary form must reproduce the above copyright
22  *    notice, this list of conditions and the following disclaimer in the
23  *    documentation and/or other materials provided with the distribution.
24  * 3. Neither the name of the University nor the names of its contributors
25  *    may be used to endorse or promote products derived from this software
26  *    without specific prior written permission.
27  *
28  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
29  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
32  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38  * SUCH DAMAGE.
39  *
40  * @(#)euc.c	8.1 (Berkeley) 6/4/93
41  */
42 
43 #include <sys/param.h>
44 
45 #include <errno.h>
46 #include <limits.h>
47 #include <runetype.h>
48 #include <stdlib.h>
49 #include <string.h>
50 #include <wchar.h>
51 #include "mblocal.h"
52 
/*
 * Shared conversion engines.  The trailing arguments give the CS2 and CS3
 * lead bytes together with the sequence width each one announces.
 */
static size_t	_EUC_mbrtowc_impl(wchar_t * __restrict pwc,
		    const char * __restrict s, size_t n,
		    mbstate_t * __restrict ps,
		    uint8_t cs2, uint8_t cs2width,
		    uint8_t cs3, uint8_t cs3width);
static size_t	_EUC_wcrtomb_impl(char * __restrict s, wchar_t wc,
		    mbstate_t * __restrict ps,
		    uint8_t cs2, uint8_t cs2width,
		    uint8_t cs3, uint8_t cs3width);

/* Per-locale multibyte -> wide, single character. */
static size_t	_EUC_CN_mbrtowc(wchar_t * __restrict pwc,
		    const char * __restrict s, size_t n,
		    mbstate_t * __restrict ps);
static size_t	_EUC_JP_mbrtowc(wchar_t * __restrict pwc,
		    const char * __restrict s, size_t n,
		    mbstate_t * __restrict ps);
static size_t	_EUC_KR_mbrtowc(wchar_t * __restrict pwc,
		    const char * __restrict s, size_t n,
		    mbstate_t * __restrict ps);
static size_t	_EUC_TW_mbrtowc(wchar_t * __restrict pwc,
		    const char * __restrict s, size_t n,
		    mbstate_t * __restrict ps);

/* Per-locale wide -> multibyte, single character. */
static size_t	_EUC_CN_wcrtomb(char * __restrict s, wchar_t wc,
		    mbstate_t * __restrict ps);
static size_t	_EUC_JP_wcrtomb(char * __restrict s, wchar_t wc,
		    mbstate_t * __restrict ps);
static size_t	_EUC_KR_wcrtomb(char * __restrict s, wchar_t wc,
		    mbstate_t * __restrict ps);
static size_t	_EUC_TW_wcrtomb(char * __restrict s, wchar_t wc,
		    mbstate_t * __restrict ps);

/* Per-locale multibyte -> wide, bounded string. */
static size_t	_EUC_CN_mbsnrtowcs(wchar_t * __restrict dst,
		    const char ** __restrict src, size_t nms, size_t len,
		    mbstate_t * __restrict ps);
static size_t	_EUC_JP_mbsnrtowcs(wchar_t * __restrict dst,
		    const char ** __restrict src, size_t nms, size_t len,
		    mbstate_t * __restrict ps);
static size_t	_EUC_KR_mbsnrtowcs(wchar_t * __restrict dst,
		    const char ** __restrict src, size_t nms, size_t len,
		    mbstate_t * __restrict ps);
static size_t	_EUC_TW_mbsnrtowcs(wchar_t * __restrict dst,
		    const char ** __restrict src, size_t nms, size_t len,
		    mbstate_t * __restrict ps);

/* Per-locale wide -> multibyte, bounded string. */
static size_t	_EUC_CN_wcsnrtombs(char * __restrict dst,
		    const wchar_t ** __restrict src, size_t nwc, size_t len,
		    mbstate_t * __restrict ps);
static size_t	_EUC_JP_wcsnrtombs(char * __restrict dst,
		    const wchar_t ** __restrict src, size_t nwc, size_t len,
		    mbstate_t * __restrict ps);
static size_t	_EUC_KR_wcsnrtombs(char * __restrict dst,
		    const wchar_t ** __restrict src, size_t nwc, size_t len,
		    mbstate_t * __restrict ps);
static size_t	_EUC_TW_wcsnrtombs(char * __restrict dst,
		    const wchar_t ** __restrict src, size_t nwc, size_t len,
		    mbstate_t * __restrict ps);

/* Conversion-state probe shared by all four locales. */
static int	_EUC_mbsinit(const mbstate_t *ps);
103 
/*
 * Conversion state kept inside the caller's mbstate_t between calls.
 */
typedef struct {
	wchar_t	ch;	/* bytes of a partially decoded character so far */
	int	set;	/* not referenced by the code in this file section */
	int	want;	/* bytes still missing; 0 means initial state */
} _EucState;
109 
110 static int
111 _EUC_mbsinit(const mbstate_t *ps)
112 {
113 
114 	return (ps == NULL || ((const _EucState *)ps)->want == 0);
115 }
116 
117 /*
118  * EUC-CN uses CS0, CS1 and CS2 (4 bytes).
119  */
120 int
121 _EUC_CN_init(struct xlocale_ctype *l, _RuneLocale *rl)
122 {
123 	l->__mbrtowc = _EUC_CN_mbrtowc;
124 	l->__wcrtomb = _EUC_CN_wcrtomb;
125 	l->__mbsnrtowcs = _EUC_CN_mbsnrtowcs;
126 	l->__wcsnrtombs = _EUC_CN_wcsnrtombs;
127 	l->__mbsinit = _EUC_mbsinit;
128 
129 	l->runes = rl;
130 	l->__mb_cur_max = 4;
131 	l->__mb_sb_limit = 256;
132 	return (0);
133 }
134 
135 static size_t
136 _EUC_CN_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s,
137     size_t n, mbstate_t * __restrict ps)
138 {
139 	return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 4, 0, 0));
140 }
141 
142 static size_t
143 _EUC_CN_mbsnrtowcs(wchar_t * __restrict dst,
144     const char ** __restrict src,
145     size_t nms, size_t len, mbstate_t * __restrict ps)
146 {
147 	return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_CN_mbrtowc));
148 }
149 
150 static size_t
151 _EUC_CN_wcrtomb(char * __restrict s, wchar_t wc,
152     mbstate_t * __restrict ps)
153 {
154 	return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 4, 0, 0));
155 }
156 
157 static size_t
158 _EUC_CN_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src,
159 	size_t nwc, size_t len, mbstate_t * __restrict ps)
160 {
161 	return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_CN_wcrtomb));
162 }
163 
164 /*
165  * EUC-KR uses only CS0 and CS1.
166  */
167 int
168 _EUC_KR_init(struct xlocale_ctype *l, _RuneLocale *rl)
169 {
170 	l->__mbrtowc = _EUC_KR_mbrtowc;
171 	l->__wcrtomb = _EUC_KR_wcrtomb;
172 	l->__mbsnrtowcs = _EUC_KR_mbsnrtowcs;
173 	l->__wcsnrtombs = _EUC_KR_wcsnrtombs;
174 	l->__mbsinit = _EUC_mbsinit;
175 
176 	l->runes = rl;
177 	l->__mb_cur_max = 2;
178 	l->__mb_sb_limit = 128;
179 	return (0);
180 }
181 
/*
 * EUC-KR mbrtowc: hands off to the shared decoder with 0 for both the CS2
 * and CS3 lead bytes and widths.
 */
static size_t
_EUC_KR_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s,
    size_t n, mbstate_t * __restrict ps)
{
	const uint8_t cs2 = 0, cs2width = 0;
	const uint8_t cs3 = 0, cs3width = 0;

	return (_EUC_mbrtowc_impl(pwc, s, n, ps, cs2, cs2width, cs3, cs3width));
}
188 
189 static size_t
190 _EUC_KR_mbsnrtowcs(wchar_t * __restrict dst,
191     const char ** __restrict src,
192     size_t nms, size_t len, mbstate_t * __restrict ps)
193 {
194 	return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_KR_mbrtowc));
195 }
196 
/*
 * EUC-KR wcrtomb: hands off to the shared encoder with 0 for both the CS2
 * and CS3 lead bytes and widths.
 */
static size_t
_EUC_KR_wcrtomb(char * __restrict s, wchar_t wc,
	mbstate_t * __restrict ps)
{
	const uint8_t cs2 = 0, cs2width = 0;
	const uint8_t cs3 = 0, cs3width = 0;

	return (_EUC_wcrtomb_impl(s, wc, ps, cs2, cs2width, cs3, cs3width));
}
203 
204 static size_t
205 _EUC_KR_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src,
206 	size_t nwc, size_t len, mbstate_t * __restrict ps)
207 {
208 	return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_KR_wcrtomb));
209 }
210 
211 /*
212  * EUC-JP uses CS0, CS1, CS2, and CS3.
213  */
214 int
215 _EUC_JP_init(struct xlocale_ctype *l, _RuneLocale *rl)
216 {
217 	l->__mbrtowc = _EUC_JP_mbrtowc;
218 	l->__wcrtomb = _EUC_JP_wcrtomb;
219 	l->__mbsnrtowcs = _EUC_JP_mbsnrtowcs;
220 	l->__wcsnrtombs = _EUC_JP_wcsnrtombs;
221 	l->__mbsinit = _EUC_mbsinit;
222 
223 	l->runes = rl;
224 	l->__mb_cur_max = 3;
225 	l->__mb_sb_limit = 196;
226 	return (0);
227 }
228 
229 static size_t
230 _EUC_JP_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s,
231     size_t n, mbstate_t * __restrict ps)
232 {
233 	return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 2, SS3, 3));
234 }
235 
236 static size_t
237 _EUC_JP_mbsnrtowcs(wchar_t * __restrict dst,
238     const char ** __restrict src,
239     size_t nms, size_t len, mbstate_t * __restrict ps)
240 {
241 	return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_JP_mbrtowc));
242 }
243 
244 static size_t
245 _EUC_JP_wcrtomb(char * __restrict s, wchar_t wc,
246     mbstate_t * __restrict ps)
247 {
248 	return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 2, SS3, 3));
249 }
250 
251 static size_t
252 _EUC_JP_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src,
253 	size_t nwc, size_t len, mbstate_t * __restrict ps)
254 {
255 	return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_JP_wcrtomb));
256 }
257 
258 /*
259  * EUC-TW uses CS0, CS1, and CS2.
260  */
261 int
262 _EUC_TW_init(struct xlocale_ctype *l, _RuneLocale *rl)
263 {
264 	l->__mbrtowc = _EUC_TW_mbrtowc;
265 	l->__wcrtomb = _EUC_TW_wcrtomb;
266 	l->__mbsnrtowcs = _EUC_TW_mbsnrtowcs;
267 	l->__wcsnrtombs = _EUC_TW_wcsnrtombs;
268 	l->__mbsinit = _EUC_mbsinit;
269 
270 	l->runes = rl;
271 	l->__mb_cur_max = 4;
272 	l->__mb_sb_limit = 256;
273 	return (0);
274 }
275 
276 static size_t
277 _EUC_TW_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s,
278 	size_t n, mbstate_t * __restrict ps)
279 {
280 	return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 4, 0, 0));
281 }
282 
283 static size_t
284 _EUC_TW_mbsnrtowcs(wchar_t * __restrict dst,
285 	const char ** __restrict src,
286 	size_t nms, size_t len, mbstate_t * __restrict ps)
287 {
288 	return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_TW_mbrtowc));
289 }
290 
291 static size_t
292 _EUC_TW_wcrtomb(char * __restrict s, wchar_t wc,
293 	mbstate_t * __restrict ps)
294 {
295 	return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 4, 0, 0));
296 }
297 
298 static size_t
299 _EUC_TW_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src,
300 	size_t nwc, size_t len, mbstate_t * __restrict ps)
301 {
302 	return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_TW_wcrtomb));
303 }
304 
305 /*
306  * Common EUC code.
307  */
308 
309 static size_t
310 _EUC_mbrtowc_impl(wchar_t * __restrict pwc, const char * __restrict s,
311 	size_t n, mbstate_t * __restrict ps,
312 	uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width)
313 {
314 	_EucState *es;
315 	int i, want;
316 	wchar_t wc = 0;
317 	unsigned char ch, chs;
318 
319 	es = (_EucState *)ps;
320 
321 	if (es->want < 0 || es->want > MB_CUR_MAX) {
322 		errno = EINVAL;
323 		return ((size_t)-1);
324 	}
325 
326 	if (s == NULL) {
327 		s = "";
328 		n = 1;
329 		pwc = NULL;
330 	}
331 
332 	if (n == 0)
333 		/* Incomplete multibyte sequence */
334 		return ((size_t)-2);
335 
336 	if (es->want == 0) {
337 		/* Fast path for plain ASCII (CS0) */
338 		if (((ch = (unsigned char)*s) & 0x80) == 0) {
339 			if (pwc != NULL)
340 				*pwc = ch;
341 			return (ch != '\0' ? 1 : 0);
342 		}
343 
344 		if (ch >= 0xa1) {
345 			/* CS1 */
346 			want = 2;
347 		} else if (ch == cs2) {
348 			want = cs2width;
349 		} else if (ch == cs3) {
350 			want = cs3width;
351 		} else {
352 			errno = EILSEQ;
353 			return ((size_t)-1);
354 		}
355 
356 
357 		es->want = want;
358 		es->ch = 0;
359 	} else {
360 		want = es->want;
361 		wc = es->ch;
362 	}
363 
364 	for (i = 0; i < MIN(want, n); i++) {
365 		wc <<= 8;
366 		chs = *s;
367 		wc |= chs;
368 		s++;
369 	}
370 	if (i < want) {
371 		/* Incomplete multibyte sequence */
372 		es->want = want - i;
373 		es->ch = wc;
374 		errno = EILSEQ;
375 		return ((size_t)-2);
376 	}
377 	if (pwc != NULL)
378 		*pwc = wc;
379 	es->want = 0;
380 	return (wc == L'\0' ? 0 : want);
381 }
382 
383 static size_t
384 _EUC_wcrtomb_impl(char * __restrict s, wchar_t wc,
385     mbstate_t * __restrict ps,
386     uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width)
387 {
388 	_EucState *es;
389 	int i, len;
390 	wchar_t nm;
391 
392 	es = (_EucState *)ps;
393 
394 	if (es->want != 0) {
395 		errno = EINVAL;
396 		return ((size_t)-1);
397 	}
398 
399 	if (s == NULL)
400 		/* Reset to initial shift state (no-op) */
401 		return (1);
402 
403 	if ((wc & ~0x7f) == 0) {
404 		/* Fast path for plain ASCII (CS0) */
405 		*s = (char)wc;
406 		return (1);
407 	}
408 
409 	/* Determine the "length" */
410 	if ((unsigned)wc > 0xffffff) {
411 		len = 4;
412 	} else if ((unsigned)wc > 0xffff) {
413 		len = 3;
414 	} else if ((unsigned)wc > 0xff) {
415 		len = 2;
416 	} else {
417 		len = 1;
418 	}
419 
420 	if (len > MB_CUR_MAX) {
421 		errno = EILSEQ;
422 		return ((size_t)-1);
423 	}
424 
425 	/* This first check excludes CS1, which is implicitly valid. */
426 	if ((wc < 0xa100) || (wc > 0xffff)) {
427 		/* Check for valid CS2 or CS3 */
428 		nm = (wc >> ((len - 1) * 8));
429 		if (nm == cs2) {
430 			if (len != cs2width) {
431 				errno = EILSEQ;
432 				return ((size_t)-1);
433 			}
434 		} else if (nm == cs3) {
435 			if (len != cs3width) {
436 				errno = EILSEQ;
437 				return ((size_t)-1);
438 			}
439 		} else {
440 			errno = EILSEQ;
441 			return ((size_t)-1);
442 		}
443 	}
444 
445 	/* Stash the bytes, least significant last */
446 	for (i = len - 1; i >= 0; i--) {
447 		s[i] = (wc & 0xff);
448 		wc >>= 8;
449 	}
450 	return (len);
451 }
452