xref: /freebsd/sys/dev/hyperv/utilities/unicode.h (revision 315ee00f)
1 /* $NetBSD: unicode.h,v 1.1.1.1 2007/03/06 00:10:39 dillo Exp $ */
2 
3 /*-
4  * Copyright (c) 2007 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Dieter Baron.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 #include <sys/types.h>
33 
/*
 * Flag values for the "flags" argument of utf8_to_utf16() and
 * utf16_to_utf8().
 *
 * Of these, only UNICODE_UTF8_LATIN1_FALLBACK is examined by the converters
 * in this file: utf8_to_utf16() tests it with a bitwise AND.
 *
 * NOTE(review): UNICODE_UTF8_LATIN1_FALLBACK is 0x03, i.e. it shares bits
 * with both UNICODE_DECOMPOSE and UNICODE_PRECOMPOSE.  Because it is tested
 * with "&", passing either of the other two flags also enables the Latin-1
 * fallback.  The value is kept as is so that existing callers (including
 * any that pass a literal) keep their current behaviour; confirm with the
 * callers before moving it to a bit of its own.
 */
#define UNICODE_DECOMPOSE		0x01
#define UNICODE_PRECOMPOSE		0x02
#define UNICODE_UTF8_LATIN1_FALLBACK	0x03
37 
/*
 * UTF-8 <-> UTF-16 converters.
 *
 * For both functions dst_len is the capacity of dst counted in elements of
 * dst (16-bit units for utf8_to_utf16, bytes for utf16_to_utf8), src_len is
 * the number of elements in src, and *errp (when errp is not NULL) receives
 * an error count.  The return value is the number of output elements the
 * conversion produces, which can be larger than dst_len.
 */
size_t utf8_to_utf16(uint16_t *dst, size_t dst_len,
		     const char *src, size_t src_len,
		     int flags, int *errp);
size_t utf16_to_utf8(char *dst, size_t dst_len,
		     const uint16_t *src, size_t src_len,
		     int flags, int *errp);
40 
41 size_t
42 utf8_to_utf16(uint16_t *dst, size_t dst_len,
43 	      const char *src, size_t src_len,
44 	      int flags, int *errp)
45 {
46     const unsigned char *s;
47     size_t spos, dpos;
48     int error;
49     uint16_t c;
50 
51 #define IS_CONT(c)	(((c)&0xc0) == 0x80)
52 
53     error = 0;
54     s = (const unsigned char *)src;
55     spos = dpos = 0;
56     while (spos<src_len) {
57 	if (s[spos] < 0x80)
58 	    c = s[spos++];
59 	else if ((flags & UNICODE_UTF8_LATIN1_FALLBACK)
60 		 && (spos >= src_len || !IS_CONT(s[spos+1]))
61 		 && s[spos]>=0xa0) {
62 	    /* not valid UTF-8, assume ISO 8859-1 */
63 	    c = s[spos++];
64 	}
65 	else if (s[spos] < 0xc0 || s[spos] >= 0xf5) {
66 	    /* continuation byte without lead byte
67 	       or lead byte for codepoint above 0x10ffff */
68 	    error++;
69 	    spos++;
70 	    continue;
71 	}
72 	else if (s[spos] < 0xe0) {
73 	    if (spos >= src_len || !IS_CONT(s[spos+1])) {
74 		spos++;
75 		error++;
76 		continue;
77 	    }
78 	    c = ((s[spos] & 0x3f) << 6) | (s[spos+1] & 0x3f);
79 	    spos += 2;
80 	    if (c < 0x80) {
81 		/* overlong encoding */
82 		error++;
83 		continue;
84 	    }
85 	}
86 	else if (s[spos] < 0xf0) {
87 	    if (spos >= src_len-2
88 		|| !IS_CONT(s[spos+1]) || !IS_CONT(s[spos+2])) {
89 		spos++;
90 		error++;
91 		continue;
92 	    }
93 	    c = ((s[spos] & 0x0f) << 12) | ((s[spos+1] & 0x3f) << 6)
94 		| (s[spos+2] & 0x3f);
95 	    spos += 3;
96 	    if (c < 0x800 || (c & 0xdf00) == 0xd800 ) {
97 		/* overlong encoding or encoded surrogate */
98 		error++;
99 		continue;
100 	    }
101 	}
102 	else {
103 	    uint32_t cc;
104 	    /* UTF-16 surrogate pair */
105 
106 	    if (spos >= src_len-3 || !IS_CONT(s[spos+1])
107 		|| !IS_CONT(s[spos+2]) || !IS_CONT(s[spos+3])) {
108 		spos++;
109 		error++;
110 
111 		continue;
112 	    }
113 	    cc = ((s[spos] & 0x03) << 18) | ((s[spos+1] & 0x3f) << 12)
114 		 | ((s[spos+2] & 0x3f) << 6) | (s[spos+3] & 0x3f);
115 	    spos += 4;
116 	    if (cc < 0x10000) {
117 		/* overlong encoding */
118 		error++;
119 		continue;
120 	    }
121 	    if (dst && dpos < dst_len)
122 		dst[dpos] = (0xd800 | ((cc-0x10000)>>10));
123 	    dpos++;
124 	    c = 0xdc00 | ((cc-0x10000) & 0x3ffff);
125 	}
126 
127 	if (dst && dpos < dst_len)
128 	    dst[dpos] = c;
129 	dpos++;
130     }
131 
132     if (errp)
133 	*errp = error;
134 
135     return dpos;
136 
137 #undef IS_CONT
138 }
139 
140 
/*
 * Convert UTF-16 code units to a UTF-8 byte sequence.
 *
 * dst, dst_len	output buffer and its capacity in bytes.  dst may be NULL,
 *		in which case nothing is stored and only the length is
 *		computed.
 * src, src_len	input 16-bit units and their count; no unit at or beyond
 *		src[src_len] is ever read.
 * flags	not examined by this function.
 * errp		if not NULL, receives the number of unpaired surrogates
 *		found in the input; such units produce no output.
 *
 * Returns the number of bytes the valid part of the input converts to.
 * A character is either stored completely or not at all: as soon as one
 * character does not fit into the remaining space, storing stops for the
 * rest of the input while counting goes on, so a return value larger than
 * dst_len means the output was truncated.  No terminator is written.
 */
size_t
utf16_to_utf8(char *dst, size_t dst_len,
	      const uint16_t *src, size_t src_len,
	      int flags, int *errp)
{
    size_t spos, dpos;
    int error;

/*
 * While dst is non-NULL, dpos <= dst_len holds (every stored group was
 * checked first), so the subtraction below cannot wrap.
 */
#define CHECK_LENGTH(l)	do {					\
	if (dst != NULL && dst_len - dpos < (size_t)(l))	\
	    dst = NULL;						\
    } while (0)
#define ADD_BYTE(b)	do {					\
	if (dst != NULL)					\
	    dst[dpos] = (char)(b);				\
	dpos++;							\
    } while (0)

    (void)flags;

    error = 0;
    dpos = 0;
    for (spos = 0; spos < src_len; spos++) {
	if (src[spos] < 0x80) {
	    CHECK_LENGTH(1);
	    ADD_BYTE(src[spos]);
	}
	else if (src[spos] < 0x800) {
	    CHECK_LENGTH(2);
	    ADD_BYTE(0xc0 | (src[spos] >> 6));
	    ADD_BYTE(0x80 | (src[spos] & 0x3f));
	}
	else if ((src[spos] & 0xfc00) == 0xd800) {
	    uint32_t c;
	    /* first (high) surrogate: a low surrogate must follow */
	    if (src_len - spos < 2 || (src[spos+1] & 0xfc00) != 0xdc00) {
		/* no second surrogate present */
		error++;
		continue;
	    }
	    c = ((((uint32_t)src[spos] & 0x3ff) << 10)
		| ((uint32_t)src[spos+1] & 0x3ff)) + 0x10000;
	    /* the loop increment consumes the high half, this the low */
	    spos++;
	    CHECK_LENGTH(4);
	    ADD_BYTE(0xf0 | (c >> 18));
	    ADD_BYTE(0x80 | ((c >> 12) & 0x3f));
	    ADD_BYTE(0x80 | ((c >> 6) & 0x3f));
	    ADD_BYTE(0x80 | (c & 0x3f));
	}
	else if ((src[spos] & 0xfc00) == 0xdc00) {
	    /* second surrogate without preceding first surrogate */
	    error++;
	}
	else {
	    CHECK_LENGTH(3);
	    ADD_BYTE(0xe0 | (src[spos] >> 12));
	    ADD_BYTE(0x80 | ((src[spos] >> 6) & 0x3f));
	    ADD_BYTE(0x80 | (src[spos] & 0x3f));
	}
    }

    if (errp)
	*errp = error;

    return dpos;

#undef ADD_BYTE
#undef CHECK_LENGTH
}
200