1 // Copyright (c) 2007, 2018, Oracle and/or its affiliates. All rights reserved.
2 //
3 // This program is free software; you can redistribute it and/or modify
4 // it under the terms of the GNU General Public License, version 2.0, as
5 // published by the Free Software Foundation.
6 //
7 // This program is also distributed with certain software (including
8 // but not limited to OpenSSL) that is licensed under separate terms,
9 // as designated in a particular file or component or in included license
10 // documentation. The authors of MySQL hereby grant you an
11 // additional permission to link the program and your derivative works
12 // with the separately licensed software that they have included with
13 // MySQL.
14 //
15 // Without limiting anything contained in the foregoing, this file,
16 // which is part of MySQL Connector/ODBC, is also subject to the
17 // Universal FOSS Exception, version 1.0, a copy of which can be found at
18 // http://oss.oracle.com/licenses/universal-foss-exception.
19 //
20 // This program is distributed in the hope that it will be useful, but
21 // WITHOUT ANY WARRANTY; without even the implied warranty of
22 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
23 // See the GNU General Public License, version 2.0, for more details.
24 //
25 // You should have received a copy of the GNU General Public License
26 // along with this program; if not, write to the Free Software Foundation, Inc.,
27 // 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
28
29 /**
30 @file unicode_transcode.c
31 @brief Unicode transcoding functions. Raw conversions.
32 */
33
34 #ifndef ODBCTAP
35 # include "stringutil.h"
36 #endif
37
/**
  Convert UTF-16 code unit(s) to a UTF-32 character. For characters in the
  Basic Multilingual Plane, one UTF-16 code unit maps to one UTF-32 character,
  but characters in other planes are encoded as a surrogate pair: a high
  surrogate (0xD800-0xDBFF) followed by a low surrogate (0xDC00-0xDFFF).

  A first code unit that is not a high surrogate (this includes a lone low
  surrogate) is copied through unchanged.

  @param[in]  i  Pointer to UTF-16 code units; the second unit is only read
                 when the first one is a high surrogate
  @param[out] u  Receives the UTF-32 character; not written when 0 is
                 returned

  @return Number of UTF-16 code units consumed (1 or 2), or 0 if a high
          surrogate is not followed by a low surrogate.
*/
int utf16toutf32(UTF16 *i, UTF32 *u)
{
  if (*i >= 0xd800 && *i <= 0xdbff)
  {
    UTF32 high= (UTF32)(i[0] & 0x3ff);
    UTF16 low= i[1];

    if (low < 0xdc00 || low > 0xdfff) /* invalid */
      return 0;

    /*
      The 0x10000 offset has to be added, not OR-ed in: the shifted high
      bits already reach bit 16 for every plane above the first
      supplementary one, so an OR would drop the carry.
    */
    *u= 0x10000 + ((high << 10) | (UTF32)(low & 0x3ff));
    return 2;
  }

  *u= *i;
  return 1;
}
64
65
/**
  Convert UTF-32 character to UTF-16 code unit(s).

  Characters up to and including 0xFFFF are stored as a single code unit
  (values in the surrogate range 0xD800-0xDFFF are not rejected, they are
  copied through). Characters from 0x10000 up to and including 0x10FFFF are
  stored as a surrogate pair.

  @param[in]  i  UTF-32 character
  @param[out] u  Pointer to UTF-16 code units; two units are written for
                 characters above 0xFFFF

  @return Number of UTF-16 code units produced (1 or 2), or 0 if the
          character is above 0x10FFFF, in which case nothing is written.
*/
int utf32toutf16(UTF32 i, UTF16 *u)
{
  /* 0xFFFF itself still fits into one code unit. */
  if (i <= 0xffff)
  {
    *u= (UTF16)i;
    return 1;
  }

  /* 0x10FFFF is the last code point that has a surrogate pair. */
  if (i <= 0x10ffff)
  {
    i-= 0x10000;
    u[0]= (UTF16)(0xd800 | (i >> 10));
    u[1]= (UTF16)(0xdc00 | (i & 0x3ff));
    return 2;
  }

  return 0;
}
90
91
/**
  Convert UTF-8 octets to a UTF-32 character. It may take up to four
  UTF-8 octets to encode one UTF-32 character.

  The lead octet selects the sequence length; every following octet must
  have the form 10xxxxxx. Octets are read one at a time and reading stops at
  the first invalid one. Overlong encodings, encoded surrogates and values
  above 0x10FFFF are not rejected.

  @param[in]  i  Pointer to UTF-8 octets
  @param[out] u  Receives the UTF-32 character; not written when 0 is
                 returned

  @return Number of UTF-8 octets consumed (1 to 4), or 0 if an invalid
          character was encountered.
*/
int utf8toutf32(UTF8 *i, UTF32 *u)
{
  UTF32 ch;
  int len, x;

  if (*i < 0x80)
  {
    *u= *i;
    return 1;
  }
  else if (*i < 0xc0) /* invalid: continuation octet in lead position */
  {
    return 0;
  }
  else if (*i < 0xe0)
  {
    len= 2;
    ch= *i & 0x1f;
  }
  else if (*i < 0xf0)
  {
    len= 3;
    ch= *i & 0x0f;
  }
  else if (*i < 0xf8)
  {
    len= 4;
    ch= *i & 0x07;
  }
  else /* invalid: 0xf8-0xff never start a sequence of up to four octets */
  {
    return 0;
  }

  for (x= 1; x < len; ++x)
  {
    /* Validate before use so that nothing past a bad octet is touched. */
    if ((i[x] & 0xc0) != 0x80) /* invalid */
      return 0;
    ch= (ch << 6) | (UTF32)(i[x] & 0x3f);
  }

  *u= ch;
  return len;
}
138
139
/**
  Convert a UTF-32 character into UTF-8 octets. It may take four UTF-8
  octets to encode one UTF-32 character.

  Values in the surrogate range 0xD800-0xDFFF are not rejected; they are
  encoded as three octets like any other value below 0x10000.

  @param[in]  i  UTF-32 character
  @param[out] c  Pointer to UTF-8 octets; up to four octets are written

  @return Number of UTF-8 octets produced (1 to 4), or 0 if the character
          is above 0x10FFFF, in which case nothing is written.
*/
int utf32toutf8(UTF32 i, UTF8 *c)
{
  int len, x;

  if (i < 0x80)
  {
    *c= (UTF8)i;
    return 1;
  }
  else if (i < 0x800)
  {
    *c++= (UTF8)(0xc0 | (i >> 6));
    len= 2;
  }
  else if (i < 0x10000)
  {
    *c++= (UTF8)(0xe0 | (i >> 12));
    len= 3;
  }
  else if (i <= 0x10ffff) /* 0x10FFFF is the last valid code point */
  {
    *c++= (UTF8)(0xf0 | (i >> 18));
    len= 4;
  }
  else
  {
    return 0;
  }

  /* Trailing octets carry six payload bits each, most significant first. */
  for (x= len - 2; x >= 0; --x)
    *c++= (UTF8)(0x80 | ((i >> (6 * x)) & 0x3f));

  return len;
}
183
184
185 #ifdef UCTEST
186
187 #include <assert.h>
188 #include <string.h>
189 #include <stdio.h>
190
/* One UTF-8 <-> UTF-32 test case. */
typedef struct
{
  UTF8  u8[4]; /* UTF-8 octets of the character */
  UTF32 u32;   /* the same character as a UTF-32 value */
  int   cnt;   /* octet count the converters are expected to return */
} t_8_32;
196
/* One UTF-16 <-> UTF-32 test case. */
typedef struct
{
  UTF16 u16[2]; /* UTF-16 code units of the character */
  UTF32 u32;    /* the same character as a UTF-32 value */
  int   cnt;    /* code unit count the converters are expected to return */
} t_16_32;
202
t1()203 void t1()
204 {
205 int i, j;
206 t_8_32 t1[]= {
207 {{0, 0, 0, 0}, 0, 1},
208 {{0x3c, 0, 0, 0}, 0x3c, 1},
209 {{0xc3, 0xbe, 0, 0}, 0xfe, 2},
210 {{0xe0, 0xa4, 0x96, 0}, 0x916, 3},
211 {{0xf0, 0x90, 0x85, 0xad}, 0x1016d, 4}
212 };
213 printf("***** T1 -> utf32<->utf8 *****\n");
214 for (i= 0; i < sizeof(t1) / sizeof(t_8_32); ++i)
215 {
216 int cnt;
217 t_8_32 t= t1[i];
218 UTF8 res[4];
219 UTF32 resu;
220 memset(res, 0, 4);
221 printf("Convert %x\n", t.u32);
222 cnt= utf32toutf8(t.u32, res);
223 assert(cnt == t.cnt);
224 for (j= 0; j < 4; ++j)
225 {
226 printf("Res[%d] = 0x%x (expect 0x%x)\n", j, res[j], t.u8[j]);
227 assert(res[j] == t.u8[j]);
228 }
229 printf("Ok. Now back\n");
230 cnt= utf8toutf32(t.u8, &resu);
231 printf("ResU = %x\n", resu);
232 assert(cnt == t.cnt);
233 assert(resu == t.u32);
234 }
235 }
236
t2()237 void t2()
238 {
239 int i, j;
240 t_16_32 t1[]= {
241 {{0, 0}, 0, 1},
242 {{0x7a, 0}, 0x7a, 1},
243 {{0x6c34, 0}, 0x6c34, 1},
244 {{0xd834, 0xdd1e}, 0x1d11e, 2}
245 };
246 printf("***** T2 -> utf32<->utf16 *****\n");
247 for (i= 0; i < sizeof(t1) / sizeof(t_16_32); ++i)
248 {
249 int cnt;
250 t_16_32 t= t1[i];
251 UTF16 res[2];
252 UTF32 resu;
253 memset(res, 0, 2 * 2);
254 printf("Convert %x\n", t.u32);
255 cnt= utf32toutf16(t.u32, res);
256 assert(cnt == t.cnt);
257 for (j = 0; j < 2; ++j)
258 {
259 printf("Res[%d] = 0x%x (expect 0x%x)\n", j, res[j], t.u16[j]);
260 assert(res[j] == t.u16[j]);
261 }
262 printf("Ok. Now back\n");
263 cnt= utf16toutf32(t.u16, &resu);
264 printf("ResU = %x\n", resu);
265 assert(cnt == t.cnt);
266 assert(resu == t.u32);
267 }
268 }
269
/**
  Entry point of the self-test that is compiled when UCTEST is defined.
  Runs the UTF-8 and the UTF-16 round-trip checks in turn.

  @param[in] argc  Unused
  @param[in] argv  Unused

  @return 0 once both checks have returned.
*/
int main(int argc, char **argv)
{
  (void)argc;
  (void)argv;

  t1();
  t2();

  /*
    Return rather than call exit(): exit() is declared in <stdlib.h>, which
    is not among the headers the test section includes, and returning from
    main has the same effect.
  */
  return 0;
}
276 #endif /* UCTEST */
277