1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 
27 
28 /*
29  * Unicode encoding conversion functions among UTF-8, UTF-16, and UTF-32.
30  * (PSARC/2005/446, PSARC/2007/038, PSARC/2007/517)
31  * Man pages: uconv_u16tou32(9F), uconv_u16tou8(9F), uconv_u32tou16(9F),
32  * uconv_u32tou8(9F), uconv_u8tou16(9F), and uconv_u8tou32(9F). See also
33  * the section 3C man pages.
34  * Interface stability: Committed
35  */
36 
37 #include <sys/types.h>
38 #ifdef	_KERNEL
39 #include <sys/param.h>
40 #include <sys/sysmacros.h>
41 #include <sys/debug.h>
42 #include <sys/kmem.h>
43 #include <sys/sunddi.h>
44 #else
45 #include <sys/u8_textprep.h>
46 #endif	/* _KERNEL */
47 #include <sys/byteorder.h>
48 #include <sys/errno.h>
49 
50 
/*
 * UTF-16 surrogate ranges: a high (leading) surrogate lies in
 * [UCONV_U16_HI_MIN, UCONV_U16_HI_MAX] and a low (trailing) surrogate in
 * [UCONV_U16_LO_MIN, UCONV_U16_LO_MAX].
 *
 * Despite its name, UCONV_U16_BIT_SHIFT is not a shift count: it is the
 * value 2^10 that the converters multiply or divide by in order to move
 * the ten payload bits of the high surrogate. UCONV_U16_BIT_MASK keeps the
 * resulting 20-bit offset, and UCONV_U16_START (U+10000) is the first code
 * point outside of the BMP.
 */
#define	UCONV_U16_HI_MIN	(0xd800U)
#define	UCONV_U16_HI_MAX	(0xdbffU)
#define	UCONV_U16_LO_MIN	(0xdc00U)
#define	UCONV_U16_LO_MAX	(0xdfffU)
#define	UCONV_U16_BIT_SHIFT	(0x0400U)
#define	UCONV_U16_BIT_MASK	(0x0fffffU)
#define	UCONV_U16_START		(0x010000U)

/* The maximum values of the Unicode coding space and of ASCII. */
#define	UCONV_UNICODE_MAX	(0x10ffffU)
#define	UCONV_ASCII_MAX		(0x7fU)

/*
 * Masks covering both byte-order bits of the input side and of the output
 * side of the flag argument. The individual UCONV_IN_* and UCONV_OUT_*
 * flag bits are not defined in this file; they are expected to come from
 * an included header.
 */
#define	UCONV_IN_ENDIAN_MASKS	(UCONV_IN_BIG_ENDIAN | UCONV_IN_LITTLE_ENDIAN)
#define	UCONV_OUT_ENDIAN_MASKS	(UCONV_OUT_BIG_ENDIAN | UCONV_OUT_LITTLE_ENDIAN)

/*
 * Native ("NAT") and reversed ("REV") byte-order flags for the machine
 * this file is compiled for, selected by _ZFS_BIG_ENDIAN.
 */
#ifdef	_ZFS_BIG_ENDIAN
#define	UCONV_IN_NAT_ENDIAN	UCONV_IN_BIG_ENDIAN
#define	UCONV_IN_REV_ENDIAN	UCONV_IN_LITTLE_ENDIAN
#define	UCONV_OUT_NAT_ENDIAN	UCONV_OUT_BIG_ENDIAN
#define	UCONV_OUT_REV_ENDIAN	UCONV_OUT_LITTLE_ENDIAN
#else
#define	UCONV_IN_NAT_ENDIAN	UCONV_IN_LITTLE_ENDIAN
#define	UCONV_IN_REV_ENDIAN	UCONV_IN_BIG_ENDIAN
#define	UCONV_OUT_NAT_ENDIAN	UCONV_OUT_LITTLE_ENDIAN
#define	UCONV_OUT_REV_ENDIAN	UCONV_OUT_BIG_ENDIAN
#endif	/* _ZFS_BIG_ENDIAN */

/*
 * The Byte Order Mark (BOM) character, U+FEFF, as it reads in the native
 * byte order (UCONV_BOM_NORMAL) and as it reads when a 16-bit unit
 * (UCONV_BOM_SWAPPED) or a 32-bit unit (UCONV_BOM_SWAPPED_32) holding it
 * is interpreted in the opposite byte order.
 */
#define	UCONV_BOM_NORMAL	(0xfeffU)
#define	UCONV_BOM_SWAPPED	(0xfffeU)
#define	UCONV_BOM_SWAPPED_32	(0xfffe0000U)

/*
 * UTF-32 boundaries based on UTF-8 character byte lengths: the largest
 * character value that fits in a UTF-8 character of one, two, three, and
 * four bytes respectively.
 */
#define	UCONV_U8_ONE_BYTE	(0x7fU)
#define	UCONV_U8_TWO_BYTES	(0x7ffU)
#define	UCONV_U8_THREE_BYTES	(0xffffU)
#define	UCONV_U8_FOUR_BYTES	(0x10ffffU)

/*
 * The common minimum and maximum values of the second and later bytes of
 * a multibyte UTF-8 character ("10xx xxxx").
 */
#define	UCONV_U8_BYTE_MIN	(0x80U)
#define	UCONV_U8_BYTE_MAX	(0xbfU)

/*
 * The following "6" and "0x3f" come from the "10xx xxxx" bit representation
 * of those bytes: each one carries six payload bits.
 */
#define	UCONV_U8_BIT_SHIFT	6
#define	UCONV_U8_BIT_MASK	0x3f
105 
/*
 * The following vector gives the number of bytes that still have to follow
 * the first byte of a UTF-8 character. The index is the value of that first
 * byte.
 *
 * The converters in this file only consult the vector for bytes above
 * UCONV_ASCII_MAX and treat an entry of 0 there as an illegal first byte.
 * That covers 80..BF, C0, C1, and F5..FF.
 */
static const uchar_t remaining_bytes_tbl[0x100] = {
/*	00..BF: single-byte (ASCII) values and bytes that cannot come first */
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,

/*	C0  C1  C2  C3  C4  C5  C6  C7  C8  C9  CA  CB  CC  CD  CE  CF */
	0,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,

/*	D0  D1  D2  D3  D4  D5  D6  D7  D8  D9  DA  DB  DC  DD  DE  DF */
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,

/*	E0  E1  E2  E3  E4  E5  E6  E7  E8  E9  EA  EB  EC  ED  EE  EF */
	2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,

/*	F0  F1  F2  F3  F4  F5  F6  F7  F8  F9  FA  FB  FC  FD  FE  FF */
	3,  3,  3,  3,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
};
136 
/*
 * The following is a vector of bit masks that extract the payload bits
 * from the first byte of a UTF-8 character. The index is the number of
 * remaining bytes of the character, as given by remaining_bytes_tbl[]
 * above. That vector never yields more than 3, so entries 4 and 5 are not
 * reached from this file.
 *
 * Kernel builds define the vector with external linkage; otherwise it is
 * private to this file.
 */
#ifdef	_KERNEL
const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
#else
static const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
#endif	/* _KERNEL */
147 
/*
 * The following two vectors provide the valid minimum and maximum values
 * for the second byte of a multibyte UTF-8 character, for stricter illegal
 * sequence checking. The index must be the value of the first byte of the
 * UTF-8 character.
 *
 * Only the first bytes C2..F4 have non-zero entries. Most of them allow
 * the common 80..BF range; the exceptions are:
 *
 *	E0: minimum A0 (E0 80..9F would be an overlong three-byte form)
 *	F0: minimum 90 (F0 80..8F would be an overlong four-byte form)
 *	ED: maximum 9F (ED A0..BF would encode U+D800..U+DFFF)
 *	F4: maximum 8F (F4 90..BF would exceed U+10FFFF)
 */
static const uchar_t valid_min_2nd_byte[0x100] = {
/*	00..BF: never the first byte of a multibyte character */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,

/*	C0    C1    C2    C3    C4    C5    C6    C7 */
	0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

/*	C8    C9    CA    CB    CC    CD    CE    CF */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

/*	D0    D1    D2    D3    D4    D5    D6    D7 */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

/*	D8    D9    DA    DB    DC    DD    DE    DF */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

/*	E0    E1    E2    E3    E4    E5    E6    E7 */
	0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

/*	E8    E9    EA    EB    EC    ED    EE    EF */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

/*	F0    F1    F2    F3    F4    F5    F6    F7 */
	0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,

/*	F8    F9    FA    FB    FC    FD    FE    FF */
	0,    0,    0,    0,    0,    0,    0,    0
};
203 
/*
 * Valid maximum value of the second byte of a multibyte UTF-8 character,
 * indexed by the value of the first byte. The entries are BF except for
 * ED (9F, which keeps U+D800..U+DFFF out) and F4 (8F, which keeps values
 * above U+10FFFF out); first bytes that are never valid have 0.
 */
static const uchar_t valid_max_2nd_byte[0x100] = {
/*	00..BF: never the first byte of a multibyte character */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,

/*	C0    C1    C2    C3    C4    C5    C6    C7 */
	0,    0,    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,

/*	C8    C9    CA    CB    CC    CD    CE    CF */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,

/*	D0    D1    D2    D3    D4    D5    D6    D7 */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,

/*	D8    D9    DA    DB    DC    DD    DE    DF */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,

/*	E0    E1    E2    E3    E4    E5    E6    E7 */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,

/*	E8    E9    EA    EB    EC    ED    EE    EF */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,

/*	F0    F1    F2    F3    F4    F5    F6    F7 */
	0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0,    0,    0,

/*	F8    F9    FA    FB    FC    FD    FE    FF */
	0,    0,    0,    0,    0,    0,    0,    0
};
253 
254 
255 static int
256 check_endian(int flag, int *in, int *out)
257 {
258 	*in = flag & UCONV_IN_ENDIAN_MASKS;
259 
260 	/* You cannot have both. */
261 	if (*in == UCONV_IN_ENDIAN_MASKS)
262 		return (EBADF);
263 
264 	if (*in == 0)
265 		*in = UCONV_IN_NAT_ENDIAN;
266 
267 	*out = flag & UCONV_OUT_ENDIAN_MASKS;
268 
269 	/* You cannot have both. */
270 	if (*out == UCONV_OUT_ENDIAN_MASKS)
271 		return (EBADF);
272 
273 	if (*out == 0)
274 		*out = UCONV_OUT_NAT_ENDIAN;
275 
276 	return (0);
277 }
278 
279 static boolean_t
280 check_bom16(const uint16_t *u16s, size_t u16l, int *in)
281 {
282 	if (u16l > 0) {
283 		if (*u16s == UCONV_BOM_NORMAL) {
284 			*in = UCONV_IN_NAT_ENDIAN;
285 			return (B_TRUE);
286 		}
287 		if (*u16s == UCONV_BOM_SWAPPED) {
288 			*in = UCONV_IN_REV_ENDIAN;
289 			return (B_TRUE);
290 		}
291 	}
292 
293 	return (B_FALSE);
294 }
295 
296 static boolean_t
297 check_bom32(const uint32_t *u32s, size_t u32l, int *in)
298 {
299 	if (u32l > 0) {
300 		if (*u32s == UCONV_BOM_NORMAL) {
301 			*in = UCONV_IN_NAT_ENDIAN;
302 			return (B_TRUE);
303 		}
304 		if (*u32s == UCONV_BOM_SWAPPED_32) {
305 			*in = UCONV_IN_REV_ENDIAN;
306 			return (B_TRUE);
307 		}
308 	}
309 
310 	return (B_FALSE);
311 }
312 
313 int
314 uconv_u16tou32(const uint16_t *u16s, size_t *utf16len,
315     uint32_t *u32s, size_t *utf32len, int flag)
316 {
317 	int inendian;
318 	int outendian;
319 	size_t u16l;
320 	size_t u32l;
321 	uint32_t hi;
322 	uint32_t lo;
323 	boolean_t do_not_ignore_null;
324 
325 	/*
326 	 * Do preliminary validity checks on parameters and collect info on
327 	 * endians.
328 	 */
329 	if (u16s == NULL || utf16len == NULL)
330 		return (EILSEQ);
331 
332 	if (u32s == NULL || utf32len == NULL)
333 		return (E2BIG);
334 
335 	if (check_endian(flag, &inendian, &outendian) != 0)
336 		return (EBADF);
337 
338 	/*
339 	 * Initialize input and output parameter buffer indices and
340 	 * temporary variables.
341 	 */
342 	u16l = u32l = 0;
343 	hi = 0;
344 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
345 
346 	/*
347 	 * Check on the BOM at the beginning of the input buffer if required
348 	 * and if there is indeed one, process it.
349 	 */
350 	if ((flag & UCONV_IN_ACCEPT_BOM) &&
351 	    check_bom16(u16s, *utf16len, &inendian))
352 		u16l++;
353 
354 	/*
355 	 * Reset inendian and outendian so that after this point, those can be
356 	 * used as condition values.
357 	 */
358 	inendian &= UCONV_IN_NAT_ENDIAN;
359 	outendian &= UCONV_OUT_NAT_ENDIAN;
360 
361 	/*
362 	 * If there is something in the input buffer and if necessary and
363 	 * requested, save the BOM at the output buffer.
364 	 */
365 	if (*utf16len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
366 		u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
367 		    UCONV_BOM_SWAPPED_32;
368 
369 	/*
370 	 * Do conversion; if encounter a surrogate pair, assemble high and
371 	 * low pair values to form a UTF-32 character. If a half of a pair
372 	 * exists alone, then, either it is an illegal (EILSEQ) or
373 	 * invalid (EINVAL) value.
374 	 */
375 	for (; u16l < *utf16len; u16l++) {
376 		if (u16s[u16l] == 0 && do_not_ignore_null)
377 			break;
378 
379 		lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
380 
381 		if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
382 			if (hi)
383 				return (EILSEQ);
384 			hi = lo;
385 			continue;
386 		} else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
387 			if (! hi)
388 				return (EILSEQ);
389 			lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
390 			    lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
391 			    + UCONV_U16_START;
392 			hi = 0;
393 		} else if (hi) {
394 			return (EILSEQ);
395 		}
396 
397 		if (u32l >= *utf32len)
398 			return (E2BIG);
399 
400 		u32s[u32l++] = (outendian) ? lo : BSWAP_32(lo);
401 	}
402 
403 	/*
404 	 * If high half didn't see low half, then, it's most likely the input
405 	 * parameter is incomplete.
406 	 */
407 	if (hi)
408 		return (EINVAL);
409 
410 	/*
411 	 * Save the number of consumed and saved characters. They do not
412 	 * include terminating NULL character (U+0000) at the end of
413 	 * the input buffer (even when UCONV_IGNORE_NULL isn't specified and
414 	 * the input buffer length is big enough to include the terminating
415 	 * NULL character).
416 	 */
417 	*utf16len = u16l;
418 	*utf32len = u32l;
419 
420 	return (0);
421 }
422 
423 int
424 uconv_u16tou8(const uint16_t *u16s, size_t *utf16len,
425     uchar_t *u8s, size_t *utf8len, int flag)
426 {
427 	int inendian;
428 	int outendian;
429 	size_t u16l;
430 	size_t u8l;
431 	uint32_t hi;
432 	uint32_t lo;
433 	boolean_t do_not_ignore_null;
434 
435 	if (u16s == NULL || utf16len == NULL)
436 		return (EILSEQ);
437 
438 	if (u8s == NULL || utf8len == NULL)
439 		return (E2BIG);
440 
441 	if (check_endian(flag, &inendian, &outendian) != 0)
442 		return (EBADF);
443 
444 	u16l = u8l = 0;
445 	hi = 0;
446 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
447 
448 	if ((flag & UCONV_IN_ACCEPT_BOM) &&
449 	    check_bom16(u16s, *utf16len, &inendian))
450 		u16l++;
451 
452 	inendian &= UCONV_IN_NAT_ENDIAN;
453 
454 	for (; u16l < *utf16len; u16l++) {
455 		if (u16s[u16l] == 0 && do_not_ignore_null)
456 			break;
457 
458 		lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
459 
460 		if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
461 			if (hi)
462 				return (EILSEQ);
463 			hi = lo;
464 			continue;
465 		} else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
466 			if (! hi)
467 				return (EILSEQ);
468 			lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
469 			    lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
470 			    + UCONV_U16_START;
471 			hi = 0;
472 		} else if (hi) {
473 			return (EILSEQ);
474 		}
475 
476 		/*
477 		 * Now we convert a UTF-32 character into a UTF-8 character.
478 		 * Unicode coding space is between U+0000 and U+10FFFF;
479 		 * anything bigger is an illegal character.
480 		 */
481 		if (lo <= UCONV_U8_ONE_BYTE) {
482 			if (u8l >= *utf8len)
483 				return (E2BIG);
484 			u8s[u8l++] = (uchar_t)lo;
485 		} else if (lo <= UCONV_U8_TWO_BYTES) {
486 			if ((u8l + 1) >= *utf8len)
487 				return (E2BIG);
488 			u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
489 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x003f));
490 		} else if (lo <= UCONV_U8_THREE_BYTES) {
491 			if ((u8l + 2) >= *utf8len)
492 				return (E2BIG);
493 			u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
494 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
495 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x0003f));
496 		} else if (lo <= UCONV_U8_FOUR_BYTES) {
497 			if ((u8l + 3) >= *utf8len)
498 				return (E2BIG);
499 			u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
500 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
501 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
502 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x000003f));
503 		} else {
504 			return (EILSEQ);
505 		}
506 	}
507 
508 	if (hi)
509 		return (EINVAL);
510 
511 	*utf16len = u16l;
512 	*utf8len = u8l;
513 
514 	return (0);
515 }
516 
517 int
518 uconv_u32tou16(const uint32_t *u32s, size_t *utf32len,
519     uint16_t *u16s, size_t *utf16len, int flag)
520 {
521 	int inendian;
522 	int outendian;
523 	size_t u16l;
524 	size_t u32l;
525 	uint32_t hi;
526 	uint32_t lo;
527 	boolean_t do_not_ignore_null;
528 
529 	if (u32s == NULL || utf32len == NULL)
530 		return (EILSEQ);
531 
532 	if (u16s == NULL || utf16len == NULL)
533 		return (E2BIG);
534 
535 	if (check_endian(flag, &inendian, &outendian) != 0)
536 		return (EBADF);
537 
538 	u16l = u32l = 0;
539 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
540 
541 	if ((flag & UCONV_IN_ACCEPT_BOM) &&
542 	    check_bom32(u32s, *utf32len, &inendian))
543 		u32l++;
544 
545 	inendian &= UCONV_IN_NAT_ENDIAN;
546 	outendian &= UCONV_OUT_NAT_ENDIAN;
547 
548 	if (*utf32len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
549 		u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
550 		    UCONV_BOM_SWAPPED;
551 
552 	for (; u32l < *utf32len; u32l++) {
553 		if (u32s[u32l] == 0 && do_not_ignore_null)
554 			break;
555 
556 		hi = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
557 
558 		/*
559 		 * Anything bigger than the Unicode coding space, i.e.,
560 		 * Unicode scalar value bigger than U+10FFFF, is an illegal
561 		 * character.
562 		 */
563 		if (hi > UCONV_UNICODE_MAX)
564 			return (EILSEQ);
565 
566 		/*
567 		 * Anything bigger than U+FFFF must be converted into
568 		 * a surrogate pair in UTF-16.
569 		 */
570 		if (hi >= UCONV_U16_START) {
571 			lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
572 			    UCONV_U16_LO_MIN;
573 			hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
574 			    UCONV_U16_HI_MIN;
575 
576 			if ((u16l + 1) >= *utf16len)
577 				return (E2BIG);
578 
579 			if (outendian) {
580 				u16s[u16l++] = (uint16_t)hi;
581 				u16s[u16l++] = (uint16_t)lo;
582 			} else {
583 				u16s[u16l++] = BSWAP_16(((uint16_t)hi));
584 				u16s[u16l++] = BSWAP_16(((uint16_t)lo));
585 			}
586 		} else {
587 			if (u16l >= *utf16len)
588 				return (E2BIG);
589 			u16s[u16l++] = (outendian) ? (uint16_t)hi :
590 			    BSWAP_16(((uint16_t)hi));
591 		}
592 	}
593 
594 	*utf16len = u16l;
595 	*utf32len = u32l;
596 
597 	return (0);
598 }
599 
600 int
601 uconv_u32tou8(const uint32_t *u32s, size_t *utf32len,
602     uchar_t *u8s, size_t *utf8len, int flag)
603 {
604 	int inendian;
605 	int outendian;
606 	size_t u32l;
607 	size_t u8l;
608 	uint32_t lo;
609 	boolean_t do_not_ignore_null;
610 
611 	if (u32s == NULL || utf32len == NULL)
612 		return (EILSEQ);
613 
614 	if (u8s == NULL || utf8len == NULL)
615 		return (E2BIG);
616 
617 	if (check_endian(flag, &inendian, &outendian) != 0)
618 		return (EBADF);
619 
620 	u32l = u8l = 0;
621 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
622 
623 	if ((flag & UCONV_IN_ACCEPT_BOM) &&
624 	    check_bom32(u32s, *utf32len, &inendian))
625 		u32l++;
626 
627 	inendian &= UCONV_IN_NAT_ENDIAN;
628 
629 	for (; u32l < *utf32len; u32l++) {
630 		if (u32s[u32l] == 0 && do_not_ignore_null)
631 			break;
632 
633 		lo = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
634 
635 		if (lo <= UCONV_U8_ONE_BYTE) {
636 			if (u8l >= *utf8len)
637 				return (E2BIG);
638 			u8s[u8l++] = (uchar_t)lo;
639 		} else if (lo <= UCONV_U8_TWO_BYTES) {
640 			if ((u8l + 1) >= *utf8len)
641 				return (E2BIG);
642 			u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
643 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x003f));
644 		} else if (lo <= UCONV_U8_THREE_BYTES) {
645 			if ((u8l + 2) >= *utf8len)
646 				return (E2BIG);
647 			u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
648 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
649 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x0003f));
650 		} else if (lo <= UCONV_U8_FOUR_BYTES) {
651 			if ((u8l + 3) >= *utf8len)
652 				return (E2BIG);
653 			u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
654 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
655 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
656 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x000003f));
657 		} else {
658 			return (EILSEQ);
659 		}
660 	}
661 
662 	*utf32len = u32l;
663 	*utf8len = u8l;
664 
665 	return (0);
666 }
667 
668 int
669 uconv_u8tou16(const uchar_t *u8s, size_t *utf8len,
670     uint16_t *u16s, size_t *utf16len, int flag)
671 {
672 	int inendian;
673 	int outendian;
674 	size_t u16l;
675 	size_t u8l;
676 	uint32_t hi;
677 	uint32_t lo;
678 	int remaining_bytes;
679 	int first_b;
680 	boolean_t do_not_ignore_null;
681 
682 	if (u8s == NULL || utf8len == NULL)
683 		return (EILSEQ);
684 
685 	if (u16s == NULL || utf16len == NULL)
686 		return (E2BIG);
687 
688 	if (check_endian(flag, &inendian, &outendian) != 0)
689 		return (EBADF);
690 
691 	u16l = u8l = 0;
692 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
693 
694 	outendian &= UCONV_OUT_NAT_ENDIAN;
695 
696 	if (*utf8len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
697 		u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
698 		    UCONV_BOM_SWAPPED;
699 
700 	for (; u8l < *utf8len; ) {
701 		if (u8s[u8l] == 0 && do_not_ignore_null)
702 			break;
703 
704 		/*
705 		 * Collect a UTF-8 character and convert it to a UTF-32
706 		 * character. In doing so, we screen out illegally formed
707 		 * UTF-8 characters and treat such as illegal characters.
708 		 * The algorithm at below also screens out anything bigger
709 		 * than the U+10FFFF.
710 		 *
711 		 * See Unicode 3.1 UTF-8 Corrigendum and Unicode 3.2 for
712 		 * more details on the illegal values of UTF-8 character
713 		 * bytes.
714 		 */
715 		hi = (uint32_t)u8s[u8l++];
716 
717 		if (hi > UCONV_ASCII_MAX) {
718 			if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
719 				return (EILSEQ);
720 
721 			first_b = hi;
722 			hi = hi & u8_masks_tbl[remaining_bytes];
723 
724 			for (; remaining_bytes > 0; remaining_bytes--) {
725 				/*
726 				 * If we have no more bytes, the current
727 				 * UTF-8 character is incomplete.
728 				 */
729 				if (u8l >= *utf8len)
730 					return (EINVAL);
731 
732 				lo = (uint32_t)u8s[u8l++];
733 
734 				if (first_b) {
735 					if (lo < valid_min_2nd_byte[first_b] ||
736 					    lo > valid_max_2nd_byte[first_b])
737 						return (EILSEQ);
738 					first_b = 0;
739 				} else if (lo < UCONV_U8_BYTE_MIN ||
740 				    lo > UCONV_U8_BYTE_MAX) {
741 					return (EILSEQ);
742 				}
743 				hi = (hi << UCONV_U8_BIT_SHIFT) |
744 				    (lo & UCONV_U8_BIT_MASK);
745 			}
746 		}
747 
748 		if (hi >= UCONV_U16_START) {
749 			lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
750 			    UCONV_U16_LO_MIN;
751 			hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
752 			    UCONV_U16_HI_MIN;
753 
754 			if ((u16l + 1) >= *utf16len)
755 				return (E2BIG);
756 
757 			if (outendian) {
758 				u16s[u16l++] = (uint16_t)hi;
759 				u16s[u16l++] = (uint16_t)lo;
760 			} else {
761 				u16s[u16l++] = BSWAP_16(((uint16_t)hi));
762 				u16s[u16l++] = BSWAP_16(((uint16_t)lo));
763 			}
764 		} else {
765 			if (u16l >= *utf16len)
766 				return (E2BIG);
767 
768 			u16s[u16l++] = (outendian) ? (uint16_t)hi :
769 			    BSWAP_16(((uint16_t)hi));
770 		}
771 	}
772 
773 	*utf16len = u16l;
774 	*utf8len = u8l;
775 
776 	return (0);
777 }
778 
779 int
780 uconv_u8tou32(const uchar_t *u8s, size_t *utf8len,
781     uint32_t *u32s, size_t *utf32len, int flag)
782 {
783 	int inendian;
784 	int outendian;
785 	size_t u32l;
786 	size_t u8l;
787 	uint32_t hi;
788 	uint32_t c;
789 	int remaining_bytes;
790 	int first_b;
791 	boolean_t do_not_ignore_null;
792 
793 	if (u8s == NULL || utf8len == NULL)
794 		return (EILSEQ);
795 
796 	if (u32s == NULL || utf32len == NULL)
797 		return (E2BIG);
798 
799 	if (check_endian(flag, &inendian, &outendian) != 0)
800 		return (EBADF);
801 
802 	u32l = u8l = 0;
803 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
804 
805 	outendian &= UCONV_OUT_NAT_ENDIAN;
806 
807 	if (*utf8len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
808 		u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
809 		    UCONV_BOM_SWAPPED_32;
810 
811 	for (; u8l < *utf8len; ) {
812 		if (u8s[u8l] == 0 && do_not_ignore_null)
813 			break;
814 
815 		hi = (uint32_t)u8s[u8l++];
816 
817 		if (hi > UCONV_ASCII_MAX) {
818 			if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
819 				return (EILSEQ);
820 
821 			first_b = hi;
822 			hi = hi & u8_masks_tbl[remaining_bytes];
823 
824 			for (; remaining_bytes > 0; remaining_bytes--) {
825 				if (u8l >= *utf8len)
826 					return (EINVAL);
827 
828 				c = (uint32_t)u8s[u8l++];
829 
830 				if (first_b) {
831 					if (c < valid_min_2nd_byte[first_b] ||
832 					    c > valid_max_2nd_byte[first_b])
833 						return (EILSEQ);
834 					first_b = 0;
835 				} else if (c < UCONV_U8_BYTE_MIN ||
836 				    c > UCONV_U8_BYTE_MAX) {
837 					return (EILSEQ);
838 				}
839 				hi = (hi << UCONV_U8_BIT_SHIFT) |
840 				    (c & UCONV_U8_BIT_MASK);
841 			}
842 		}
843 
844 		if (u32l >= *utf32len)
845 			return (E2BIG);
846 
847 		u32s[u32l++] = (outendian) ? hi : BSWAP_32(hi);
848 	}
849 
850 	*utf32len = u32l;
851 	*utf8len = u8l;
852 
853 	return (0);
854 }
855 
/*
 * Kernel builds only: pass each of the six conversion functions to
 * EXPORT_SYMBOL().
 * NOTE(review): presumably so that other kernel modules can call them;
 * confirm against the build's definition of EXPORT_SYMBOL.
 */
#if defined(_KERNEL)
EXPORT_SYMBOL(uconv_u16tou32);
EXPORT_SYMBOL(uconv_u16tou8);
EXPORT_SYMBOL(uconv_u32tou16);
EXPORT_SYMBOL(uconv_u32tou8);
EXPORT_SYMBOL(uconv_u8tou16);
EXPORT_SYMBOL(uconv_u8tou32);
#endif
864