xref: /freebsd/sys/libkern/iconv_ucs.c (revision 3494f7c0)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2003, 2005 Ryuichiro Imura
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/param.h>
30 #include <sys/kernel.h>
31 #include <sys/systm.h>
32 #include <sys/malloc.h>
33 #include <sys/iconv.h>
34 
35 #include "iconv_converter_if.h"
36 
37 /*
38  * "UCS" converter
39  */
40 
/*
 * Bits kept in iconv_ucs.convtype.  The FROM_ and TO_ bits are copied from
 * the unicode_family[] entry whose name matches the source / destination
 * charset; the remaining two are derived in iconv_ucs_open().
 */
#define	KICONV_UCS_COMBINE	(1 << 0)	/* a second cspair (cspf) was given */
#define	KICONV_UCS_FROM_UTF8	(1 << 1)	/* source is UTF-8 */
#define	KICONV_UCS_TO_UTF8	(1 << 2)	/* destination is UTF-8 */
#define	KICONV_UCS_FROM_LE	(1 << 3)	/* source 16-bit units are little-endian */
#define	KICONV_UCS_TO_LE	(1 << 4)	/* destination 16-bit units are little-endian */
#define	KICONV_UCS_FROM_UTF16	(1 << 5)	/* source may carry surrogate pairs */
#define	KICONV_UCS_TO_UTF16	(1 << 6)	/* destination may carry surrogate pairs */
#define	KICONV_UCS_UCS4		(1 << 7)	/* ENCODING_UNICODE equals ENCODING_UTF16 */

/* Charset names this converter compares against and opens helpers with. */
#define	ENCODING_UTF16	"UTF-16BE"
#define	ENCODING_UTF8	"UTF-8"
52 
53 static struct {
54 	const char *name;
55 	int from_flag, to_flag;
56 } unicode_family[] = {
57 	{ "UTF-8",	KICONV_UCS_FROM_UTF8,	KICONV_UCS_TO_UTF8 },
58 	{ "UCS-2LE",	KICONV_UCS_FROM_LE,	KICONV_UCS_TO_LE },
59 	{ "UTF-16BE",	KICONV_UCS_FROM_UTF16,	KICONV_UCS_TO_UTF16 },
60 	{ "UTF-16LE",	KICONV_UCS_FROM_UTF16|KICONV_UCS_FROM_LE,
61 	    KICONV_UCS_TO_UTF16|KICONV_UCS_TO_LE },
62 	{ NULL,		0,	0 }
63 };
64 
/* File-local helpers, defined after the converter methods. */
static uint32_t	utf8_to_ucs4(const char *src, size_t *utf8width,
		    size_t srclen);
static u_char	*ucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width,
		    size_t dstlen);
static uint32_t	encode_surrogate(uint32_t code);
static uint32_t	decode_surrogate(const u_char *ucs);
69 
/*
 * Record that iconv_ucs depends on libiconv, but only when the build
 * provides MODULE_DEPEND.  The three 2s are presumably the minimum,
 * preferred and maximum libiconv versions -- confirm against the
 * MODULE_DEPEND documentation.
 */
#ifdef MODULE_DEPEND
MODULE_DEPEND(iconv_ucs, libiconv, 2, 2, 2);
#endif
73 
74 /*
75  * UCS converter instance
76  */
77 struct iconv_ucs {
78 	KOBJ_FIELDS;
79 	int			convtype;
80 	struct iconv_cspair *	d_csp;
81 	struct iconv_cspair *	d_cspf;
82 	void *			f_ctp;
83 	void *			t_ctp;
84 	void *			ctype;
85 };
86 
87 static int
88 iconv_ucs_open(struct iconv_converter_class *dcp,
89 	struct iconv_cspair *csp, struct iconv_cspair *cspf, void **dpp)
90 {
91 	struct iconv_ucs *dp;
92 	int i;
93 	const char *from, *to;
94 
95 	dp = (struct iconv_ucs *)kobj_create((struct kobj_class*)dcp, M_ICONV, M_WAITOK);
96 	to = csp->cp_to;
97 	from = cspf ? cspf->cp_from : csp->cp_from;
98 
99 	dp->convtype = 0;
100 
101 	if (cspf)
102 		dp->convtype |= KICONV_UCS_COMBINE;
103 	for (i = 0; unicode_family[i].name; i++) {
104 		if (strcasecmp(from, unicode_family[i].name) == 0)
105 			dp->convtype |= unicode_family[i].from_flag;
106 		if (strcasecmp(to, unicode_family[i].name) == 0)
107 			dp->convtype |= unicode_family[i].to_flag;
108 	}
109 	if (strcmp(ENCODING_UNICODE, ENCODING_UTF16) == 0)
110 		dp->convtype |= KICONV_UCS_UCS4;
111 	else
112 		dp->convtype &= ~KICONV_UCS_UCS4;
113 
114 	dp->f_ctp = dp->t_ctp = NULL;
115 	if (dp->convtype & KICONV_UCS_COMBINE) {
116 		if ((dp->convtype & KICONV_UCS_FROM_UTF8) == 0 &&
117 		    (dp->convtype & KICONV_UCS_FROM_LE) == 0) {
118 			iconv_open(ENCODING_UNICODE, from, &dp->f_ctp);
119 		}
120 		if ((dp->convtype & KICONV_UCS_TO_UTF8) == 0 &&
121 		    (dp->convtype & KICONV_UCS_TO_LE) == 0) {
122 			iconv_open(to, ENCODING_UNICODE, &dp->t_ctp);
123 		}
124 	}
125 
126 	dp->ctype = NULL;
127 	if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_TO_UTF8))
128 		iconv_open(KICONV_WCTYPE_NAME, ENCODING_UTF8, &dp->ctype);
129 
130 	dp->d_csp = csp;
131 	if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE)) {
132 		if (cspf) {
133 			dp->d_cspf = cspf;
134 			cspf->cp_refcount++;
135 		} else
136 			csp->cp_refcount++;
137 	}
138 	if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
139 		csp->cp_refcount++;
140 	*dpp = (void*)dp;
141 	return 0;
142 }
143 
144 static int
145 iconv_ucs_close(void *data)
146 {
147 	struct iconv_ucs *dp = data;
148 
149 	if (dp->f_ctp)
150 		iconv_close(dp->f_ctp);
151 	if (dp->t_ctp)
152 		iconv_close(dp->t_ctp);
153 	if (dp->ctype)
154 		iconv_close(dp->ctype);
155 	if (dp->d_cspf)
156 		dp->d_cspf->cp_refcount--;
157 	else if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE))
158 		dp->d_csp->cp_refcount--;
159 	if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
160 		dp->d_csp->cp_refcount--;
161 	kobj_delete((struct kobj*)data, M_ICONV);
162 	return 0;
163 }
164 
165 static int
166 iconv_ucs_conv(void *d2p, const char **inbuf,
167 	size_t *inbytesleft, char **outbuf, size_t *outbytesleft,
168 	int convchar, int casetype)
169 {
170 	struct iconv_ucs *dp = (struct iconv_ucs*)d2p;
171 	int ret = 0, i;
172 	size_t in, on, ir, or, inlen, outlen, ucslen;
173 	const char *src, *p;
174 	char *dst;
175 	u_char ucs[4], *q;
176 	uint32_t code;
177 
178 	if (inbuf == NULL || *inbuf == NULL || outbuf == NULL || *outbuf == NULL)
179 		return 0;
180 	ir = in = *inbytesleft;
181 	or = on = *outbytesleft;
182 	src = *inbuf;
183 	dst = *outbuf;
184 
185 	while (ir > 0 && or > 0) {
186 		/*
187 		 * The first half of conversion.
188 		 * (convert any code into ENCODING_UNICODE)
189 		 */
190 		code = 0;
191 		p = src;
192 		if (dp->convtype & KICONV_UCS_FROM_UTF8) {
193 			/* convert UTF-8 to ENCODING_UNICODE */
194 			inlen = 0;
195 			code = utf8_to_ucs4(p, &inlen, ir);
196 			if (code == 0) {
197 				ret = -1;
198 				break;
199 			}
200 
201 			if (casetype == KICONV_FROM_LOWER && dp->ctype) {
202 				code = towlower(code, dp->ctype);
203 			} else if (casetype == KICONV_FROM_UPPER && dp->ctype) {
204 				code = towupper(code, dp->ctype);
205 			}
206 
207 			if ((code >= 0xd800 && code < 0xe000) || code >= 0x110000 ) {
208 				/* reserved for utf-16 surrogate pair */
209 				/* invalid unicode */
210 				ret = -1;
211 				break;
212 			}
213 
214 			if (inlen == 4) {
215 				if (dp->convtype & KICONV_UCS_UCS4) {
216 					ucslen = 4;
217 					code = encode_surrogate(code);
218 				} else {
219 					/* can't handle with ucs-2 */
220 					ret = -1;
221 					break;
222 				}
223 			} else {
224 				ucslen = 2;
225 			}
226 
227 			/* save UCS-4 into ucs[] */
228 			for (q = ucs, i = ucslen - 1 ; i >= 0 ; i--)
229 				*q++ = (code >> (i << 3)) & 0xff;
230 
231 		} else if (dp->convtype & KICONV_UCS_COMBINE && dp->f_ctp) {
232 			/* convert local code to ENCODING_UNICODE */
233 			ucslen = 4;
234 			inlen = ir;
235 			q = ucs;
236 			ret = iconv_convchr_case(dp->f_ctp, &p, &inlen, (char **)&q,
237 			    &ucslen, casetype & (KICONV_FROM_LOWER | KICONV_FROM_UPPER));
238 			if (ret)
239 				break;
240 			inlen = ir - inlen;
241 			ucslen = 4 - ucslen;
242 
243 		} else {
244 			/* src code is a proper subset of ENCODING_UNICODE */
245 			q = ucs;
246 			if (dp->convtype & KICONV_UCS_FROM_LE) {
247 				*q = *(p + 1);
248 				*(q + 1) = *p;
249 				p += 2;
250 			} else {
251 				*q = *p++;
252 				*(q + 1) = *p++;
253 			}
254 			if ((*q & 0xfc) == 0xd8) {
255 				if (dp->convtype & KICONV_UCS_UCS4 &&
256 				    dp->convtype & KICONV_UCS_FROM_UTF16) {
257 					inlen = ucslen = 4;
258 				} else {
259 					/* invalid unicode */
260 					ret = -1;
261 					break;
262 				}
263 			} else {
264 				inlen = ucslen = 2;
265 			}
266 			if (ir < inlen) {
267 				ret = -1;
268 				break;
269 			}
270 			if (ucslen == 4) {
271 				q += 2;
272 				if (dp->convtype & KICONV_UCS_FROM_LE) {
273 					*q = *(p + 1);
274 					*(q + 1) = *p;
275 				} else {
276 					*q = *p++;
277 					*(q + 1) = *p;
278 				}
279 				if ((*q & 0xfc) != 0xdc) {
280 					/* invalid unicode */
281 					ret = -1;
282 					break;
283 				}
284 			}
285 		}
286 
287 		/*
288 		 * The second half of conversion.
289 		 * (convert ENCODING_UNICODE into any code)
290 		 */
291 		p = ucs;
292 		if (dp->convtype & KICONV_UCS_TO_UTF8) {
293 			q = (u_char *)dst;
294 			if (ucslen == 4 && dp->convtype & KICONV_UCS_UCS4) {
295 				/* decode surrogate pair */
296 				code = decode_surrogate(p);
297 			} else {
298 				code = (ucs[0] << 8) | ucs[1];
299 			}
300 
301 			if (casetype == KICONV_LOWER && dp->ctype) {
302 				code = towlower(code, dp->ctype);
303 			} else if (casetype == KICONV_UPPER && dp->ctype) {
304 				code = towupper(code, dp->ctype);
305 			}
306 
307 			outlen = 0;
308 			if (ucs4_to_utf8(code, q, &outlen, or) == NULL) {
309 				ret = -1;
310 				break;
311 			}
312 
313 			src += inlen;
314 			ir -= inlen;
315 			dst += outlen;
316 			or -= outlen;
317 
318 		} else if (dp->convtype & KICONV_UCS_COMBINE && dp->t_ctp) {
319 			ret = iconv_convchr_case(dp->t_ctp, &p, &ucslen, &dst,
320 			    &or, casetype & (KICONV_LOWER | KICONV_UPPER));
321 			if (ret)
322 				break;
323 
324 			src += inlen;
325 			ir -= inlen;
326 
327 		} else {
328 			/* dst code is a proper subset of ENCODING_UNICODE */
329 			if (or < ucslen) {
330 				ret = -1;
331 				break;
332 			}
333 			src += inlen;
334 			ir -= inlen;
335 			or -= ucslen;
336 			if (dp->convtype & KICONV_UCS_TO_LE) {
337 				*dst++ = *(p + 1);
338 				*dst++ = *p;
339 				p += 2;
340 			} else {
341 				*dst++ = *p++;
342 				*dst++ = *p++;
343 			}
344 			if (ucslen == 4) {
345 				if ((dp->convtype & KICONV_UCS_UCS4) == 0 ||
346 				    (dp->convtype & KICONV_UCS_TO_UTF16) == 0) {
347 					ret = -1;
348 					break;
349 				}
350 				if (dp->convtype & KICONV_UCS_TO_LE) {
351 					*dst++ = *(p + 1);
352 					*dst++ = *p;
353 				} else {
354 					*dst++ = *p++;
355 					*dst++ = *p;
356 				}
357 			}
358 		}
359 
360 		if (convchar == 1)
361 			break;
362 	}
363 
364 	*inbuf += in - ir;
365 	*outbuf += on - or;
366 	*inbytesleft -= in - ir;
367 	*outbytesleft -= on - or;
368 	return (ret);
369 }
370 
371 static int
372 iconv_ucs_init(struct iconv_converter_class *dcp)
373 {
374 	int error;
375 
376 	error = iconv_add(ENCODING_UNICODE, ENCODING_UNICODE, ENCODING_UTF8);
377 	if (error)
378 		return (error);
379 	error = iconv_add(ENCODING_UNICODE, ENCODING_UTF8, ENCODING_UNICODE);
380 	if (error)
381 		return (error);
382 	return (0);
383 }
384 
/*
 * Class teardown hook.  It does nothing and always reports success (0).
 */
static int
iconv_ucs_done(struct iconv_converter_class *dcp)
{
	(void)dcp;	/* unused */

	return (0);
}
390 
391 static const char *
392 iconv_ucs_name(struct iconv_converter_class *dcp)
393 {
394 	return (ENCODING_UNICODE);
395 }
396 
397 static kobj_method_t iconv_ucs_methods[] = {
398 	KOBJMETHOD(iconv_converter_open,	iconv_ucs_open),
399 	KOBJMETHOD(iconv_converter_close,	iconv_ucs_close),
400 	KOBJMETHOD(iconv_converter_conv,	iconv_ucs_conv),
401 	KOBJMETHOD(iconv_converter_init,	iconv_ucs_init),
402 	KOBJMETHOD(iconv_converter_done,	iconv_ucs_done),
403 	KOBJMETHOD(iconv_converter_name,	iconv_ucs_name),
404 	{0, 0}
405 };
406 
407 KICONV_CONVERTER(ucs, sizeof(struct iconv_ucs));
408 
/*
 * Decode one UTF-8 sequence of 1 to 4 bytes.
 *
 * src       - first byte of the sequence.
 * utf8width - receives the sequence length in bytes; written on success
 *             only.
 * srclen    - number of bytes readable at src.
 *
 * Returns the decoded code point, or 0 on failure: empty or truncated
 * input, an invalid lead or continuation byte, or an overlong encoding.
 * Because 0 is the failure value, an encoded U+0000 is also reported as a
 * failure.  Surrogates and values above U+10FFFF are not rejected here;
 * the caller checks for them.
 */
static uint32_t
utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen)
{
	/* smallest code point that legitimately needs w bytes, indexed by w */
	static const uint32_t min_code[] = { 0, 0, 0x80, 0x800, 0x10000 };
	const unsigned char *s = (const unsigned char *)src;
	size_t i, w = 0;
	uint32_t ucs4 = 0;

	if (srclen == 0)
		return (0);

	/*
	 * get leading 1 byte from utf-8
	 */
	if ((s[0] & 0x80) == 0) {
		/*
		 * leading 1 bit is "0"
		 *  utf-8: 0xxxxxxx
		 *  ucs-4: 00000000 00000000 00000000 0xxxxxxx
		 */
		w = 1;
		/* get trailing 7 bits */
		ucs4 = s[0] & 0x7f;
	} else if ((s[0] & 0xe0) == 0xc0) {
		/*
		 * leading 3 bits are "110"
		 *  utf-8: 110xxxxx 10yyyyyy
		 *  ucs-4: 00000000 00000000 00000xxx xxyyyyyy
		 */
		w = 2;
		/* get trailing 5 bits */
		ucs4 = s[0] & 0x1f;
	} else if ((s[0] & 0xf0) == 0xe0) {
		/*
		 * leading 4 bits are "1110"
		 *  utf-8: 1110xxxx 10yyyyyy 10zzzzzz
		 *  ucs-4: 00000000 00000000 xxxxyyyy yyzzzzzz
		 */
		w = 3;
		/* get trailing 4 bits */
		ucs4 = s[0] & 0x0f;
	} else if ((s[0] & 0xf8) == 0xf0) {
		/*
		 * leading 5 bits are "11110"
		 *  utf-8: 11110www 10xxxxxx 10yyyyyy 10zzzzzz
		 *  ucs-4: 00000000 000wwwxx xxxxyyyy yyzzzzzz
		 */
		w = 4;
		/* get trailing 3 bits */
		ucs4 = s[0] & 0x07;
	} else {
		/* out of utf-16 range or having illegal bits */
		return (0);
	}

	if (srclen < w)
		return (0);

	/*
	 * get left parts from utf-8
	 */
	for (i = 1 ; i < w ; i++) {
		if ((s[i] & 0xc0) != 0x80) {
			/* invalid: leading 2 bits are not "10" */
			return (0);
		}
		/* concatenate trailing 6 bits into ucs4 */
		ucs4 <<= 6;
		ucs4 |= s[i] & 0x3f;
	}

	/*
	 * Reject overlong forms: a value that fits in fewer bytes must not
	 * be accepted in a longer encoding (e.g. 0xc1 0xbf for U+007F).
	 */
	if (ucs4 < min_code[w])
		return (0);

	*utf8width = w;
	return (ucs4);
}
478 
/*
 * Encode one code point as UTF-8.
 *
 * ucs4      - code point to encode.
 * dst       - output buffer.
 * utf8width - receives the number of bytes written; written on success
 *             only.
 * dstlen    - space available at dst, in bytes.
 *
 * Returns dst (as u_char *) on success, or NULL when ucs4 is above
 * U+10FFFF or the encoding does not fit in dstlen bytes; nothing is
 * written in that case.  Values in the surrogate range 0xd800-0xdfff are
 * not rejected here.
 */
static u_char *
ucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width, size_t dstlen)
{
	u_char lead, *p;
	size_t i, w;

	/*
	 * determine utf-8 width and leading bits
	 */
	if (ucs4 < 0x80) {
		w = 1;
		lead = 0;	/* "0" */
	} else if (ucs4 < 0x800) {
		w = 2;
		lead = 0xc0;	/* "11" */
	} else if (ucs4 < 0x10000) {
		w = 3;
		lead = 0xe0;	/* "111" */
	} else if (ucs4 <= 0x10ffff) {
		w = 4;
		lead = 0xf0;	/* "1111" */
	} else {
		/* beyond the last Unicode code point */
		return (NULL);
	}

	if (dstlen < w)
		return (NULL);

	/*
	 * construct utf-8, filling continuation bytes from the end
	 */
	p = (u_char *)dst;
	for (i = w - 1 ; i >= 1 ; i--) {
		/* get trailing 6 bits and put it with leading bit as "1" */
		*(p + i) = (ucs4 & 0x3f) | 0x80;
		ucs4 >>= 6;
	}
	*p = ucs4 | lead;

	*utf8width = w;

	return (p);
}
522 
/*
 * Pack a code point into a UTF-16 surrogate pair held in one 32-bit value:
 * the high surrogate in the upper 16 bits and the low surrogate in the
 * lower 16 bits.  The caller is expected to pass a value of 0x10000 or
 * more; no range check is done here.
 */
static uint32_t
encode_surrogate(uint32_t code)
{
	uint32_t offset, high_bits, low_bits;

	offset = code - 0x10000;
	/* upper 10 bits of the offset land in bits 16..25 */
	high_bits = (offset << 6) & 0x3ff0000;
	/* lower 10 bits of the offset stay in bits 0..9 */
	low_bits = offset & 0x3ff;

	return (high_bits | low_bits | 0xd800dc00);
}
529 
/*
 * Rebuild a code point from a big-endian surrogate pair stored in ucs[0..3]
 * (high surrogate in ucs[0..1], low surrogate in ucs[2..3]).  Only the low
 * 10 bits of each 16-bit unit are used; the surrogate tag bits are not
 * verified here.
 */
static uint32_t
decode_surrogate(const u_char *ucs)
{
	uint32_t high10, low10;

	high10 = ((uint32_t)(ucs[0] & 0x3) << 8) | ucs[1];
	low10 = ((uint32_t)(ucs[2] & 0x3) << 8) | ucs[3];

	return (((high10 << 10) | low10) + 0x10000);
}
536