1 /* $NetBSD: unicode.c,v 1.3 2015/06/21 14:09:47 maxv Exp $ */
2 
3 /*-
4  * Copyright (c) 2007 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Dieter Baron.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 #include "rcsid.h"
33 __KERNEL_RCSID(0, "$NetBSD: unicode.c,v 1.3 2015/06/21 14:09:47 maxv Exp $");
34 
35 #include <stddef.h>
36 
37 #include "unicode.h"
38 
/*
 * utf8_to_utf16 --
 *	Convert the src_len bytes at src from UTF-8 to UTF-16 code units.
 *
 *	dst/dst_len	output buffer and its capacity in 16-bit units.  dst
 *			may be NULL; then nothing is stored.  Units that do
 *			not fit into dst_len are not stored either.
 *	src/src_len	input bytes; no byte at or beyond src_len is read.
 *	flags		if UNICODE_UTF8_LATIN1_FALLBACK is set, a byte >= 0xa0
 *			that is not followed by a continuation byte is taken
 *			as an ISO 8859-1 character instead of an error.
 *	errp		if not NULL, receives the number of malformed
 *			sequences that were skipped.
 *
 *	Returns the number of UTF-16 units the conversion produces.  The
 *	count keeps running after dst is full, so a result larger than
 *	dst_len means the output was truncated.
 *
 *	Rejected as malformed: stray continuation bytes, lead bytes 0xf5 and
 *	above, truncated sequences, overlong encodings, encoded surrogates
 *	(U+D800..U+DFFF) and values above U+10FFFF.
 */
size_t
utf8_to_utf16(uint16_t *dst, size_t dst_len,
	      const char *src, size_t src_len,
	      int flags, int *errp)
{
	const unsigned char *s;
	size_t spos, dpos, left;
	int error;
	uint16_t c;

#define IS_CONT(c)	(((c)&0xc0) == 0x80)

	error = 0;
	s = (const unsigned char *)src;
	spos = dpos = 0;
	while (spos < src_len) {
		/*
		 * Bytes available starting at spos (always >= 1 here).
		 * All length checks compare against this value so that no
		 * subtraction from src_len can wrap around.
		 */
		left = src_len - spos;

		if (s[spos] < 0x80) {
			c = s[spos++];
		} else if ((flags & UNICODE_UTF8_LATIN1_FALLBACK)
			 && (left < 2 || !IS_CONT(s[spos+1]))
			 && s[spos] >= 0xa0) {
			/* not valid UTF-8, assume ISO 8859-1 */
			c = s[spos++];
		} else if (s[spos] < 0xc0 || s[spos] >= 0xf5) {
			/* continuation byte without lead byte
			 * or lead byte for codepoint above 0x10ffff */
			error++;
			spos++;
			continue;
		} else if (s[spos] < 0xe0) {
			/* two byte sequence: 110xxxxx 10xxxxxx */
			if (left < 2 || !IS_CONT(s[spos+1])) {
				spos++;
				error++;
				continue;
			}
			c = (uint16_t)(((s[spos] & 0x1f) << 6)
			    | (s[spos+1] & 0x3f));
			spos += 2;
			if (c < 0x80) {
				/* overlong encoding */
				error++;
				continue;
			}
		} else if (s[spos] < 0xf0) {
			/* three byte sequence: 1110xxxx 10xxxxxx 10xxxxxx */
			if (left < 3 ||
			    !IS_CONT(s[spos+1]) || !IS_CONT(s[spos+2])) {
				spos++;
				error++;
				continue;
			}
			c = (uint16_t)(((s[spos] & 0x0f) << 12)
			    | ((s[spos+1] & 0x3f) << 6)
			    | (s[spos+2] & 0x3f));
			spos += 3;
			if (c < 0x800 || (c & 0xf800) == 0xd800) {
				/* overlong encoding or encoded surrogate */
				error++;
				continue;
			}
		} else {
			uint32_t cc;
			/*
			 * four byte sequence: 11110xxx 10xxxxxx 10xxxxxx
			 * 10xxxxxx, stored as a UTF-16 surrogate pair
			 */

			if (left < 4 || !IS_CONT(s[spos+1])
			    || !IS_CONT(s[spos+2]) || !IS_CONT(s[spos+3])) {
				spos++;
				error++;
				continue;
			}
			/* the lead byte carries three payload bits */
			cc = ((uint32_t)(s[spos] & 0x07) << 18)
			    | ((uint32_t)(s[spos+1] & 0x3f) << 12)
			    | ((uint32_t)(s[spos+2] & 0x3f) << 6)
			    | (uint32_t)(s[spos+3] & 0x3f);
			spos += 4;
			if (cc < 0x10000 || cc > 0x10ffff) {
				/* overlong encoding or beyond Unicode range */
				error++;
				continue;
			}
			cc -= 0x10000;
			/* high surrogate here, low surrogate stored below */
			if (dst && dpos < dst_len)
				dst[dpos] = (uint16_t)(0xd800 | (cc >> 10));
			dpos++;
			c = (uint16_t)(0xdc00 | (cc & 0x3ff));
		}

		if (dst && dpos < dst_len)
			dst[dpos] = c;
		dpos++;
	}

	if (errp)
		*errp = error;
	return dpos;
#undef IS_CONT
}
130 
131 
/*
 * utf16_to_utf8 --
 *	Convert the src_len UTF-16 code units at src to UTF-8.
 *
 *	dst/dst_len	output buffer and its capacity in bytes.  dst may be
 *			NULL; then nothing is stored.  A character is only
 *			stored if all of its bytes fit; once one character
 *			does not fit, nothing further is stored.
 *	src/src_len	input code units.
 *	flags		not examined by this function.
 *	errp		if not NULL, receives the number of unpaired
 *			surrogates that were skipped.
 *
 *	Returns the number of bytes the conversion produces.  The count
 *	keeps running after storing has stopped, so a result larger than
 *	dst_len means the output was truncated.
 */
size_t
utf16_to_utf8(char *dst, size_t dst_len,
	      const uint16_t *src, size_t src_len,
	      int flags, int *errp)
{
	size_t spos, dpos;
	int error;

/*
 * Stop storing (by clearing dst) unless l more bytes fit.  dst_len is
 * compared against l first so that the subtraction cannot wrap around.
 */
#define CHECK_LENGTH(l)	do {						\
	if (dst_len < (size_t)(l) || dpos > dst_len - (size_t)(l))	\
		dst = NULL;						\
} while (/*CONSTCOND*/0)
/* Store one byte if still storing; always advance the output count. */
#define ADD_BYTE(b)	do {						\
	if (dst != NULL)						\
		dst[dpos] = (char)(b);					\
	dpos++;								\
} while (/*CONSTCOND*/0)

	error = 0;
	dpos = 0;
	for (spos = 0; spos < src_len; spos++) {
		if (src[spos] < 0x80) {
			CHECK_LENGTH(1);
			ADD_BYTE(src[spos]);
		} else if (src[spos] < 0x800) {
			CHECK_LENGTH(2);
			ADD_BYTE(0xc0 | (src[spos] >> 6));
			ADD_BYTE(0x80 | (src[spos] & 0x3f));
		} else if (src[spos] >= 0xd800 && src[spos] < 0xe000) {
			uint32_t c;

			if ((src[spos] & 0xfc00) != 0xd800) {
				/* second surrogate without preceding first surrogate */
				error++;
				continue;
			}
			/* first surrogate: a second one has to follow */
			if (spos + 1 >= src_len ||
			    (src[spos+1] & 0xfc00) != 0xdc00) {
				/* no second surrogate present */
				error++;
				continue;
			}
			spos++;
			CHECK_LENGTH(4);
			c = ((((uint32_t)src[spos-1] & 0x3ff) << 10)
			    | ((uint32_t)src[spos] & 0x3ff)) + 0x10000;
			ADD_BYTE(0xf0 | (c >> 18));
			ADD_BYTE(0x80 | ((c >> 12) & 0x3f));
			ADD_BYTE(0x80 | ((c >> 6) & 0x3f));
			ADD_BYTE(0x80 | (c & 0x3f));
		} else {
			CHECK_LENGTH(3);
			ADD_BYTE(0xe0 | (src[spos] >> 12));
			ADD_BYTE(0x80 | ((src[spos] >> 6) & 0x3f));
			ADD_BYTE(0x80 | (src[spos] & 0x3f));
		}
	}

	if (errp)
		*errp = error;
	return dpos;
#undef ADD_BYTE
#undef CHECK_LENGTH
}
190