1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or https://opensource.org/licenses/CDDL-1.0.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 
27 
28 /*
29  * Unicode encoding conversion functions among UTF-8, UTF-16, and UTF-32.
30  * (PSARC/2005/446, PSARC/2007/038, PSARC/2007/517)
31  * Man pages: uconv_u16tou32(9F), uconv_u16tou8(9F), uconv_u32tou16(9F),
32  * uconv_u32tou8(9F), uconv_u8tou16(9F), and uconv_u8tou32(9F). See also
33  * the section 3C man pages.
34  * Interface stability: Committed
35  */
36 
37 #include <sys/types.h>
38 #ifdef	_KERNEL
39 #include <sys/param.h>
40 #include <sys/sysmacros.h>
41 #include <sys/debug.h>
42 #include <sys/kmem.h>
43 #include <sys/sunddi.h>
44 #else
45 #include <sys/u8_textprep.h>
46 #endif	/* _KERNEL */
47 #include <sys/byteorder.h>
48 #include <sys/errno.h>
49 
50 
/*
 * UTF-16 surrogate handling constants.
 *
 * A high (leading) surrogate lies within [UCONV_U16_HI_MIN, UCONV_U16_HI_MAX]
 * and a low (trailing) surrogate within [UCONV_U16_LO_MIN, UCONV_U16_LO_MAX].
 * UCONV_U16_BIT_SHIFT (0x400) is used as a multiplier/divisor when a pair is
 * combined or split, UCONV_U16_BIT_MASK keeps the 20 bits a pair can carry,
 * and UCONV_U16_START is the first scalar value outside of the BMP.
 */
#define	UCONV_U16_HI_MIN	(0xD800U)
#define	UCONV_U16_HI_MAX	(0xDBFFU)
#define	UCONV_U16_LO_MIN	(0xDC00U)
#define	UCONV_U16_LO_MAX	(0xDFFFU)
#define	UCONV_U16_BIT_SHIFT	(0x0400U)
#define	UCONV_U16_BIT_MASK	(0x0FFFFFU)
#define	UCONV_U16_START		(0x010000U)

/* Largest values of the Unicode coding space and the ASCII coding space. */
#define	UCONV_UNICODE_MAX	(0x10FFFFU)
#define	UCONV_ASCII_MAX		(0x7FU)
66 
/* Bit masks covering the input-endian and the output-endian flag bits. */
#define	UCONV_IN_ENDIAN_MASKS	\
	(UCONV_IN_BIG_ENDIAN | UCONV_IN_LITTLE_ENDIAN)
#define	UCONV_OUT_ENDIAN_MASKS	\
	(UCONV_OUT_BIG_ENDIAN | UCONV_OUT_LITTLE_ENDIAN)

/*
 * Map "native" and "reversed" byte order onto the explicit endian flags.
 * The build is treated as big endian only when _ZFS_BIG_ENDIAN is defined.
 */
#ifndef	_ZFS_BIG_ENDIAN
#define	UCONV_IN_NAT_ENDIAN	UCONV_IN_LITTLE_ENDIAN
#define	UCONV_IN_REV_ENDIAN	UCONV_IN_BIG_ENDIAN
#define	UCONV_OUT_NAT_ENDIAN	UCONV_OUT_LITTLE_ENDIAN
#define	UCONV_OUT_REV_ENDIAN	UCONV_OUT_BIG_ENDIAN
#else
#define	UCONV_IN_NAT_ENDIAN	UCONV_IN_BIG_ENDIAN
#define	UCONV_IN_REV_ENDIAN	UCONV_IN_LITTLE_ENDIAN
#define	UCONV_OUT_NAT_ENDIAN	UCONV_OUT_BIG_ENDIAN
#define	UCONV_OUT_REV_ENDIAN	UCONV_OUT_LITTLE_ENDIAN
#endif	/* !_ZFS_BIG_ENDIAN */
83 
/*
 * The Byte Order Mark (BOM) as it reads in the native byte order, and as it
 * reads when a 16-bit or a 32-bit unit holding it is byte-swapped.
 */
#define	UCONV_BOM_NORMAL	(0xFEFFU)
#define	UCONV_BOM_SWAPPED	(0xFFFEU)
#define	UCONV_BOM_SWAPPED_32	(0xFFFE0000U)

/*
 * Largest UTF-32 value that can be encoded in one, two, three, and four
 * UTF-8 bytes, respectively.
 */
#define	UCONV_U8_ONE_BYTE	(0x7FU)
#define	UCONV_U8_TWO_BYTES	(0x7FFU)
#define	UCONV_U8_THREE_BYTES	(0xFFFFU)
#define	UCONV_U8_FOUR_BYTES	(0x10FFFFU)

/* Generic lower and upper bounds of a UTF-8 continuation byte. */
#define	UCONV_U8_BYTE_MIN	(0x80U)
#define	UCONV_U8_BYTE_MAX	(0xBFU)

/*
 * A UTF-8 continuation byte has the bit layout "10xx xxxx": it contributes
 * six payload bits (hence the shift of 6) selected by the mask 0x3f.
 */
#define	UCONV_U8_BIT_SHIFT	6
#define	UCONV_U8_BIT_MASK	0x3F
105 
106 /*
107  * The following vector shows remaining bytes in a UTF-8 character.
108  * Index will be the first byte of the character.
109  */
110 static const uchar_t remaining_bytes_tbl[0x100] = {
111 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
112 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
113 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
114 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
115 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
116 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
117 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
118 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
119 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
120 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
121 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
122 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
123 
124 /*	C0  C1  C2  C3  C4  C5  C6  C7  C8  C9  CA  CB  CC  CD  CE  CF */
125 	0,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
126 
127 /*	D0  D1  D2  D3  D4  D5  D6  D7  D8  D9  DA  DB  DC  DD  DE  DF */
128 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
129 
130 /*	E0  E1  E2  E3  E4  E5  E6  E7  E8  E9  EA  EB  EC  ED  EE  EF */
131 	2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
132 
133 /*	F0  F1  F2  F3  F4  F5  F6  F7  F8  F9  FA  FB  FC  FD  FE  FF */
134 	3,  3,  3,  3,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
135 };
136 
137 /*
138  * The following is a vector of bit-masks to get used bits in
139  * the first byte of a UTF-8 character.  Index is remaining bytes at above of
140  * the character.
141  */
142 static const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
143 
144 /*
145  * The following two vectors are to provide valid minimum and
146  * maximum values for the 2'nd byte of a multibyte UTF-8 character for
147  * better illegal sequence checking. The index value must be the value of
148  * the first byte of the UTF-8 character.
149  */
150 static const uchar_t valid_min_2nd_byte[0x100] = {
151 	0,    0,    0,    0,    0,    0,    0,    0,
152 	0,    0,    0,    0,    0,    0,    0,    0,
153 	0,    0,    0,    0,    0,    0,    0,    0,
154 	0,    0,    0,    0,    0,    0,    0,    0,
155 	0,    0,    0,    0,    0,    0,    0,    0,
156 	0,    0,    0,    0,    0,    0,    0,    0,
157 	0,    0,    0,    0,    0,    0,    0,    0,
158 	0,    0,    0,    0,    0,    0,    0,    0,
159 	0,    0,    0,    0,    0,    0,    0,    0,
160 	0,    0,    0,    0,    0,    0,    0,    0,
161 	0,    0,    0,    0,    0,    0,    0,    0,
162 	0,    0,    0,    0,    0,    0,    0,    0,
163 	0,    0,    0,    0,    0,    0,    0,    0,
164 	0,    0,    0,    0,    0,    0,    0,    0,
165 	0,    0,    0,    0,    0,    0,    0,    0,
166 	0,    0,    0,    0,    0,    0,    0,    0,
167 	0,    0,    0,    0,    0,    0,    0,    0,
168 	0,    0,    0,    0,    0,    0,    0,    0,
169 	0,    0,    0,    0,    0,    0,    0,    0,
170 	0,    0,    0,    0,    0,    0,    0,    0,
171 	0,    0,    0,    0,    0,    0,    0,    0,
172 	0,    0,    0,    0,    0,    0,    0,    0,
173 	0,    0,    0,    0,    0,    0,    0,    0,
174 	0,    0,    0,    0,    0,    0,    0,    0,
175 
176 /*	C0    C1    C2    C3    C4    C5    C6    C7 */
177 	0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
178 
179 /*	C8    C9    CA    CB    CC    CD    CE    CF */
180 	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
181 
182 /*	D0    D1    D2    D3    D4    D5    D6    D7 */
183 	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
184 
185 /*	D8    D9    DA    DB    DC    DD    DE    DF */
186 	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
187 
188 /*	E0    E1    E2    E3    E4    E5    E6    E7 */
189 	0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
190 
191 /*	E8    E9    EA    EB    EC    ED    EE    EF */
192 	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
193 
194 /*	F0    F1    F2    F3    F4    F5    F6    F7 */
195 	0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,
196 
197 	0,    0,    0,    0,    0,    0,    0,    0
198 };
199 
200 static const uchar_t valid_max_2nd_byte[0x100] = {
201 	0,    0,    0,    0,    0,    0,    0,    0,
202 	0,    0,    0,    0,    0,    0,    0,    0,
203 	0,    0,    0,    0,    0,    0,    0,    0,
204 	0,    0,    0,    0,    0,    0,    0,    0,
205 	0,    0,    0,    0,    0,    0,    0,    0,
206 	0,    0,    0,    0,    0,    0,    0,    0,
207 	0,    0,    0,    0,    0,    0,    0,    0,
208 	0,    0,    0,    0,    0,    0,    0,    0,
209 	0,    0,    0,    0,    0,    0,    0,    0,
210 	0,    0,    0,    0,    0,    0,    0,    0,
211 	0,    0,    0,    0,    0,    0,    0,    0,
212 	0,    0,    0,    0,    0,    0,    0,    0,
213 	0,    0,    0,    0,    0,    0,    0,    0,
214 	0,    0,    0,    0,    0,    0,    0,    0,
215 	0,    0,    0,    0,    0,    0,    0,    0,
216 	0,    0,    0,    0,    0,    0,    0,    0,
217 	0,    0,    0,    0,    0,    0,    0,    0,
218 	0,    0,    0,    0,    0,    0,    0,    0,
219 	0,    0,    0,    0,    0,    0,    0,    0,
220 	0,    0,    0,    0,    0,    0,    0,    0,
221 	0,    0,    0,    0,    0,    0,    0,    0,
222 	0,    0,    0,    0,    0,    0,    0,    0,
223 	0,    0,    0,    0,    0,    0,    0,    0,
224 	0,    0,    0,    0,    0,    0,    0,    0,
225 
226 /*	C0    C1    C2    C3    C4    C5    C6    C7 */
227 	0,    0,    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
228 
229 /*	C8    C9    CA    CB    CC    CD    CE    CF */
230 	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
231 
232 /*	D0    D1    D2    D3    D4    D5    D6    D7 */
233 	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
234 
235 /*	D8    D9    DA    DB    DC    DD    DE    DF */
236 	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
237 
238 /*	E0    E1    E2    E3    E4    E5    E6    E7 */
239 	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
240 
241 /*	E8    E9    EA    EB    EC    ED    EE    EF */
242 	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
243 
244 /*	F0    F1    F2    F3    F4    F5    F6    F7 */
245 	0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0,    0,    0,
246 
247 	0,    0,    0,    0,    0,    0,    0,    0
248 };
249 
250 
251 static int
check_endian(int flag,int * in,int * out)252 check_endian(int flag, int *in, int *out)
253 {
254 	*in = flag & UCONV_IN_ENDIAN_MASKS;
255 
256 	/* You cannot have both. */
257 	if (*in == UCONV_IN_ENDIAN_MASKS)
258 		return (EBADF);
259 
260 	if (*in == 0)
261 		*in = UCONV_IN_NAT_ENDIAN;
262 
263 	*out = flag & UCONV_OUT_ENDIAN_MASKS;
264 
265 	/* You cannot have both. */
266 	if (*out == UCONV_OUT_ENDIAN_MASKS)
267 		return (EBADF);
268 
269 	if (*out == 0)
270 		*out = UCONV_OUT_NAT_ENDIAN;
271 
272 	return (0);
273 }
274 
275 static boolean_t
check_bom16(const uint16_t * u16s,size_t u16l,int * in)276 check_bom16(const uint16_t *u16s, size_t u16l, int *in)
277 {
278 	if (u16l > 0) {
279 		if (*u16s == UCONV_BOM_NORMAL) {
280 			*in = UCONV_IN_NAT_ENDIAN;
281 			return (B_TRUE);
282 		}
283 		if (*u16s == UCONV_BOM_SWAPPED) {
284 			*in = UCONV_IN_REV_ENDIAN;
285 			return (B_TRUE);
286 		}
287 	}
288 
289 	return (B_FALSE);
290 }
291 
292 static boolean_t
check_bom32(const uint32_t * u32s,size_t u32l,int * in)293 check_bom32(const uint32_t *u32s, size_t u32l, int *in)
294 {
295 	if (u32l > 0) {
296 		if (*u32s == UCONV_BOM_NORMAL) {
297 			*in = UCONV_IN_NAT_ENDIAN;
298 			return (B_TRUE);
299 		}
300 		if (*u32s == UCONV_BOM_SWAPPED_32) {
301 			*in = UCONV_IN_REV_ENDIAN;
302 			return (B_TRUE);
303 		}
304 	}
305 
306 	return (B_FALSE);
307 }
308 
309 int
uconv_u16tou32(const uint16_t * u16s,size_t * utf16len,uint32_t * u32s,size_t * utf32len,int flag)310 uconv_u16tou32(const uint16_t *u16s, size_t *utf16len,
311     uint32_t *u32s, size_t *utf32len, int flag)
312 {
313 	int inendian;
314 	int outendian;
315 	size_t u16l;
316 	size_t u32l;
317 	uint32_t hi;
318 	uint32_t lo;
319 	boolean_t do_not_ignore_null;
320 
321 	/*
322 	 * Do preliminary validity checks on parameters and collect info on
323 	 * endians.
324 	 */
325 	if (u16s == NULL || utf16len == NULL)
326 		return (EILSEQ);
327 
328 	if (u32s == NULL || utf32len == NULL)
329 		return (E2BIG);
330 
331 	if (check_endian(flag, &inendian, &outendian) != 0)
332 		return (EBADF);
333 
334 	/*
335 	 * Initialize input and output parameter buffer indices and
336 	 * temporary variables.
337 	 */
338 	u16l = u32l = 0;
339 	hi = 0;
340 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
341 
342 	/*
343 	 * Check on the BOM at the beginning of the input buffer if required
344 	 * and if there is indeed one, process it.
345 	 */
346 	if ((flag & UCONV_IN_ACCEPT_BOM) &&
347 	    check_bom16(u16s, *utf16len, &inendian))
348 		u16l++;
349 
350 	/*
351 	 * Reset inendian and outendian so that after this point, those can be
352 	 * used as condition values.
353 	 */
354 	inendian &= UCONV_IN_NAT_ENDIAN;
355 	outendian &= UCONV_OUT_NAT_ENDIAN;
356 
357 	/*
358 	 * If there is something in the input buffer and if necessary and
359 	 * requested, save the BOM at the output buffer.
360 	 */
361 	if (*utf16len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
362 		u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
363 		    UCONV_BOM_SWAPPED_32;
364 
365 	/*
366 	 * Do conversion; if encounter a surrogate pair, assemble high and
367 	 * low pair values to form a UTF-32 character. If a half of a pair
368 	 * exists alone, then, either it is an illegal (EILSEQ) or
369 	 * invalid (EINVAL) value.
370 	 */
371 	for (; u16l < *utf16len; u16l++) {
372 		if (u16s[u16l] == 0 && do_not_ignore_null)
373 			break;
374 
375 		lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
376 
377 		if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
378 			if (hi)
379 				return (EILSEQ);
380 			hi = lo;
381 			continue;
382 		} else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
383 			if (! hi)
384 				return (EILSEQ);
385 			lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
386 			    lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
387 			    + UCONV_U16_START;
388 			hi = 0;
389 		} else if (hi) {
390 			return (EILSEQ);
391 		}
392 
393 		if (u32l >= *utf32len)
394 			return (E2BIG);
395 
396 		u32s[u32l++] = (outendian) ? lo : BSWAP_32(lo);
397 	}
398 
399 	/*
400 	 * If high half didn't see low half, then, it's most likely the input
401 	 * parameter is incomplete.
402 	 */
403 	if (hi)
404 		return (EINVAL);
405 
406 	/*
407 	 * Save the number of consumed and saved characters. They do not
408 	 * include terminating NULL character (U+0000) at the end of
409 	 * the input buffer (even when UCONV_IGNORE_NULL isn't specified and
410 	 * the input buffer length is big enough to include the terminating
411 	 * NULL character).
412 	 */
413 	*utf16len = u16l;
414 	*utf32len = u32l;
415 
416 	return (0);
417 }
418 
419 int
uconv_u16tou8(const uint16_t * u16s,size_t * utf16len,uchar_t * u8s,size_t * utf8len,int flag)420 uconv_u16tou8(const uint16_t *u16s, size_t *utf16len,
421     uchar_t *u8s, size_t *utf8len, int flag)
422 {
423 	int inendian;
424 	int outendian;
425 	size_t u16l;
426 	size_t u8l;
427 	uint32_t hi;
428 	uint32_t lo;
429 	boolean_t do_not_ignore_null;
430 
431 	if (u16s == NULL || utf16len == NULL)
432 		return (EILSEQ);
433 
434 	if (u8s == NULL || utf8len == NULL)
435 		return (E2BIG);
436 
437 	if (check_endian(flag, &inendian, &outendian) != 0)
438 		return (EBADF);
439 
440 	u16l = u8l = 0;
441 	hi = 0;
442 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
443 
444 	if ((flag & UCONV_IN_ACCEPT_BOM) &&
445 	    check_bom16(u16s, *utf16len, &inendian))
446 		u16l++;
447 
448 	inendian &= UCONV_IN_NAT_ENDIAN;
449 
450 	for (; u16l < *utf16len; u16l++) {
451 		if (u16s[u16l] == 0 && do_not_ignore_null)
452 			break;
453 
454 		lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
455 
456 		if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
457 			if (hi)
458 				return (EILSEQ);
459 			hi = lo;
460 			continue;
461 		} else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
462 			if (! hi)
463 				return (EILSEQ);
464 			lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
465 			    lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
466 			    + UCONV_U16_START;
467 			hi = 0;
468 		} else if (hi) {
469 			return (EILSEQ);
470 		}
471 
472 		/*
473 		 * Now we convert a UTF-32 character into a UTF-8 character.
474 		 * Unicode coding space is between U+0000 and U+10FFFF;
475 		 * anything bigger is an illegal character.
476 		 */
477 		if (lo <= UCONV_U8_ONE_BYTE) {
478 			if (u8l >= *utf8len)
479 				return (E2BIG);
480 			u8s[u8l++] = (uchar_t)lo;
481 		} else if (lo <= UCONV_U8_TWO_BYTES) {
482 			if ((u8l + 1) >= *utf8len)
483 				return (E2BIG);
484 			u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
485 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x003f));
486 		} else if (lo <= UCONV_U8_THREE_BYTES) {
487 			if ((u8l + 2) >= *utf8len)
488 				return (E2BIG);
489 			u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
490 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
491 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x0003f));
492 		} else if (lo <= UCONV_U8_FOUR_BYTES) {
493 			if ((u8l + 3) >= *utf8len)
494 				return (E2BIG);
495 			u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
496 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
497 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
498 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x000003f));
499 		} else {
500 			return (EILSEQ);
501 		}
502 	}
503 
504 	if (hi)
505 		return (EINVAL);
506 
507 	*utf16len = u16l;
508 	*utf8len = u8l;
509 
510 	return (0);
511 }
512 
513 int
uconv_u32tou16(const uint32_t * u32s,size_t * utf32len,uint16_t * u16s,size_t * utf16len,int flag)514 uconv_u32tou16(const uint32_t *u32s, size_t *utf32len,
515     uint16_t *u16s, size_t *utf16len, int flag)
516 {
517 	int inendian;
518 	int outendian;
519 	size_t u16l;
520 	size_t u32l;
521 	uint32_t hi;
522 	uint32_t lo;
523 	boolean_t do_not_ignore_null;
524 
525 	if (u32s == NULL || utf32len == NULL)
526 		return (EILSEQ);
527 
528 	if (u16s == NULL || utf16len == NULL)
529 		return (E2BIG);
530 
531 	if (check_endian(flag, &inendian, &outendian) != 0)
532 		return (EBADF);
533 
534 	u16l = u32l = 0;
535 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
536 
537 	if ((flag & UCONV_IN_ACCEPT_BOM) &&
538 	    check_bom32(u32s, *utf32len, &inendian))
539 		u32l++;
540 
541 	inendian &= UCONV_IN_NAT_ENDIAN;
542 	outendian &= UCONV_OUT_NAT_ENDIAN;
543 
544 	if (*utf32len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
545 		u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
546 		    UCONV_BOM_SWAPPED;
547 
548 	for (; u32l < *utf32len; u32l++) {
549 		if (u32s[u32l] == 0 && do_not_ignore_null)
550 			break;
551 
552 		hi = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
553 
554 		/*
555 		 * Anything bigger than the Unicode coding space, i.e.,
556 		 * Unicode scalar value bigger than U+10FFFF, is an illegal
557 		 * character.
558 		 */
559 		if (hi > UCONV_UNICODE_MAX)
560 			return (EILSEQ);
561 
562 		/*
563 		 * Anything bigger than U+FFFF must be converted into
564 		 * a surrogate pair in UTF-16.
565 		 */
566 		if (hi >= UCONV_U16_START) {
567 			lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
568 			    UCONV_U16_LO_MIN;
569 			hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
570 			    UCONV_U16_HI_MIN;
571 
572 			if ((u16l + 1) >= *utf16len)
573 				return (E2BIG);
574 
575 			if (outendian) {
576 				u16s[u16l++] = (uint16_t)hi;
577 				u16s[u16l++] = (uint16_t)lo;
578 			} else {
579 				u16s[u16l++] = BSWAP_16(((uint16_t)hi));
580 				u16s[u16l++] = BSWAP_16(((uint16_t)lo));
581 			}
582 		} else {
583 			if (u16l >= *utf16len)
584 				return (E2BIG);
585 			u16s[u16l++] = (outendian) ? (uint16_t)hi :
586 			    BSWAP_16(((uint16_t)hi));
587 		}
588 	}
589 
590 	*utf16len = u16l;
591 	*utf32len = u32l;
592 
593 	return (0);
594 }
595 
596 int
uconv_u32tou8(const uint32_t * u32s,size_t * utf32len,uchar_t * u8s,size_t * utf8len,int flag)597 uconv_u32tou8(const uint32_t *u32s, size_t *utf32len,
598     uchar_t *u8s, size_t *utf8len, int flag)
599 {
600 	int inendian;
601 	int outendian;
602 	size_t u32l;
603 	size_t u8l;
604 	uint32_t lo;
605 	boolean_t do_not_ignore_null;
606 
607 	if (u32s == NULL || utf32len == NULL)
608 		return (EILSEQ);
609 
610 	if (u8s == NULL || utf8len == NULL)
611 		return (E2BIG);
612 
613 	if (check_endian(flag, &inendian, &outendian) != 0)
614 		return (EBADF);
615 
616 	u32l = u8l = 0;
617 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
618 
619 	if ((flag & UCONV_IN_ACCEPT_BOM) &&
620 	    check_bom32(u32s, *utf32len, &inendian))
621 		u32l++;
622 
623 	inendian &= UCONV_IN_NAT_ENDIAN;
624 
625 	for (; u32l < *utf32len; u32l++) {
626 		if (u32s[u32l] == 0 && do_not_ignore_null)
627 			break;
628 
629 		lo = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
630 
631 		if (lo <= UCONV_U8_ONE_BYTE) {
632 			if (u8l >= *utf8len)
633 				return (E2BIG);
634 			u8s[u8l++] = (uchar_t)lo;
635 		} else if (lo <= UCONV_U8_TWO_BYTES) {
636 			if ((u8l + 1) >= *utf8len)
637 				return (E2BIG);
638 			u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
639 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x003f));
640 		} else if (lo <= UCONV_U8_THREE_BYTES) {
641 			if ((u8l + 2) >= *utf8len)
642 				return (E2BIG);
643 			u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
644 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
645 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x0003f));
646 		} else if (lo <= UCONV_U8_FOUR_BYTES) {
647 			if ((u8l + 3) >= *utf8len)
648 				return (E2BIG);
649 			u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
650 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
651 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
652 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x000003f));
653 		} else {
654 			return (EILSEQ);
655 		}
656 	}
657 
658 	*utf32len = u32l;
659 	*utf8len = u8l;
660 
661 	return (0);
662 }
663 
664 int
uconv_u8tou16(const uchar_t * u8s,size_t * utf8len,uint16_t * u16s,size_t * utf16len,int flag)665 uconv_u8tou16(const uchar_t *u8s, size_t *utf8len,
666     uint16_t *u16s, size_t *utf16len, int flag)
667 {
668 	int inendian;
669 	int outendian;
670 	size_t u16l;
671 	size_t u8l;
672 	uint32_t hi;
673 	uint32_t lo;
674 	int remaining_bytes;
675 	int first_b;
676 	boolean_t do_not_ignore_null;
677 
678 	if (u8s == NULL || utf8len == NULL)
679 		return (EILSEQ);
680 
681 	if (u16s == NULL || utf16len == NULL)
682 		return (E2BIG);
683 
684 	if (check_endian(flag, &inendian, &outendian) != 0)
685 		return (EBADF);
686 
687 	u16l = u8l = 0;
688 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
689 
690 	outendian &= UCONV_OUT_NAT_ENDIAN;
691 
692 	if (*utf8len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
693 		u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
694 		    UCONV_BOM_SWAPPED;
695 
696 	for (; u8l < *utf8len; ) {
697 		if (u8s[u8l] == 0 && do_not_ignore_null)
698 			break;
699 
700 		/*
701 		 * Collect a UTF-8 character and convert it to a UTF-32
702 		 * character. In doing so, we screen out illegally formed
703 		 * UTF-8 characters and treat such as illegal characters.
704 		 * The algorithm at below also screens out anything bigger
705 		 * than the U+10FFFF.
706 		 *
707 		 * See Unicode 3.1 UTF-8 Corrigendum and Unicode 3.2 for
708 		 * more details on the illegal values of UTF-8 character
709 		 * bytes.
710 		 */
711 		hi = (uint32_t)u8s[u8l++];
712 
713 		if (hi > UCONV_ASCII_MAX) {
714 			if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
715 				return (EILSEQ);
716 
717 			first_b = hi;
718 			hi = hi & u8_masks_tbl[remaining_bytes];
719 
720 			for (; remaining_bytes > 0; remaining_bytes--) {
721 				/*
722 				 * If we have no more bytes, the current
723 				 * UTF-8 character is incomplete.
724 				 */
725 				if (u8l >= *utf8len)
726 					return (EINVAL);
727 
728 				lo = (uint32_t)u8s[u8l++];
729 
730 				if (first_b) {
731 					if (lo < valid_min_2nd_byte[first_b] ||
732 					    lo > valid_max_2nd_byte[first_b])
733 						return (EILSEQ);
734 					first_b = 0;
735 				} else if (lo < UCONV_U8_BYTE_MIN ||
736 				    lo > UCONV_U8_BYTE_MAX) {
737 					return (EILSEQ);
738 				}
739 				hi = (hi << UCONV_U8_BIT_SHIFT) |
740 				    (lo & UCONV_U8_BIT_MASK);
741 			}
742 		}
743 
744 		if (hi >= UCONV_U16_START) {
745 			lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
746 			    UCONV_U16_LO_MIN;
747 			hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
748 			    UCONV_U16_HI_MIN;
749 
750 			if ((u16l + 1) >= *utf16len)
751 				return (E2BIG);
752 
753 			if (outendian) {
754 				u16s[u16l++] = (uint16_t)hi;
755 				u16s[u16l++] = (uint16_t)lo;
756 			} else {
757 				u16s[u16l++] = BSWAP_16(((uint16_t)hi));
758 				u16s[u16l++] = BSWAP_16(((uint16_t)lo));
759 			}
760 		} else {
761 			if (u16l >= *utf16len)
762 				return (E2BIG);
763 
764 			u16s[u16l++] = (outendian) ? (uint16_t)hi :
765 			    BSWAP_16(((uint16_t)hi));
766 		}
767 	}
768 
769 	*utf16len = u16l;
770 	*utf8len = u8l;
771 
772 	return (0);
773 }
774 
775 int
uconv_u8tou32(const uchar_t * u8s,size_t * utf8len,uint32_t * u32s,size_t * utf32len,int flag)776 uconv_u8tou32(const uchar_t *u8s, size_t *utf8len,
777     uint32_t *u32s, size_t *utf32len, int flag)
778 {
779 	int inendian;
780 	int outendian;
781 	size_t u32l;
782 	size_t u8l;
783 	uint32_t hi;
784 	uint32_t c;
785 	int remaining_bytes;
786 	int first_b;
787 	boolean_t do_not_ignore_null;
788 
789 	if (u8s == NULL || utf8len == NULL)
790 		return (EILSEQ);
791 
792 	if (u32s == NULL || utf32len == NULL)
793 		return (E2BIG);
794 
795 	if (check_endian(flag, &inendian, &outendian) != 0)
796 		return (EBADF);
797 
798 	u32l = u8l = 0;
799 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
800 
801 	outendian &= UCONV_OUT_NAT_ENDIAN;
802 
803 	if (*utf8len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
804 		u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
805 		    UCONV_BOM_SWAPPED_32;
806 
807 	for (; u8l < *utf8len; ) {
808 		if (u8s[u8l] == 0 && do_not_ignore_null)
809 			break;
810 
811 		hi = (uint32_t)u8s[u8l++];
812 
813 		if (hi > UCONV_ASCII_MAX) {
814 			if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
815 				return (EILSEQ);
816 
817 			first_b = hi;
818 			hi = hi & u8_masks_tbl[remaining_bytes];
819 
820 			for (; remaining_bytes > 0; remaining_bytes--) {
821 				if (u8l >= *utf8len)
822 					return (EINVAL);
823 
824 				c = (uint32_t)u8s[u8l++];
825 
826 				if (first_b) {
827 					if (c < valid_min_2nd_byte[first_b] ||
828 					    c > valid_max_2nd_byte[first_b])
829 						return (EILSEQ);
830 					first_b = 0;
831 				} else if (c < UCONV_U8_BYTE_MIN ||
832 				    c > UCONV_U8_BYTE_MAX) {
833 					return (EILSEQ);
834 				}
835 				hi = (hi << UCONV_U8_BIT_SHIFT) |
836 				    (c & UCONV_U8_BIT_MASK);
837 			}
838 		}
839 
840 		if (u32l >= *utf32len)
841 			return (E2BIG);
842 
843 		u32s[u32l++] = (outendian) ? hi : BSWAP_32(hi);
844 	}
845 
846 	*utf32len = u32l;
847 	*utf8len = u8l;
848 
849 	return (0);
850 }
851 
/* Export the six conversion entry points when built for the kernel. */
#ifdef	_KERNEL
EXPORT_SYMBOL(uconv_u16tou32);
EXPORT_SYMBOL(uconv_u16tou8);
EXPORT_SYMBOL(uconv_u32tou16);
EXPORT_SYMBOL(uconv_u32tou8);
EXPORT_SYMBOL(uconv_u8tou16);
EXPORT_SYMBOL(uconv_u8tou32);
#endif	/* _KERNEL */
860