1 // Copyright (c) 2007, 2018, Oracle and/or its affiliates. All rights reserved.
2 //
3 // This program is free software; you can redistribute it and/or modify
4 // it under the terms of the GNU General Public License, version 2.0, as
5 // published by the Free Software Foundation.
6 //
7 // This program is also distributed with certain software (including
8 // but not limited to OpenSSL) that is licensed under separate terms,
9 // as designated in a particular file or component or in included license
10 // documentation. The authors of MySQL hereby grant you an
11 // additional permission to link the program and your derivative works
12 // with the separately licensed software that they have included with
13 // MySQL.
14 //
15 // Without limiting anything contained in the foregoing, this file,
16 // which is part of MySQL Connector/ODBC, is also subject to the
17 // Universal FOSS Exception, version 1.0, a copy of which can be found at
18 // http://oss.oracle.com/licenses/universal-foss-exception.
19 //
20 // This program is distributed in the hope that it will be useful, but
21 // WITHOUT ANY WARRANTY; without even the implied warranty of
22 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
23 // See the GNU General Public License, version 2.0, for more details.
24 //
25 // You should have received a copy of the GNU General Public License
26 // along with this program; if not, write to the Free Software Foundation, Inc.,
27 // 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
28 
29 /**
30   @file  unicode_transcode.c
31   @brief Unicode transcoding functions. Raw conversions.
32 */
33 
34 #ifndef ODBCTAP
35 # include "stringutil.h"
36 #endif
37 
/**
  Convert UTF-16 code unit(s) to a UTF-32 character. For characters in the
  Basic Multilingual Plane, one UTF-16 code unit maps to one UTF-32 character,
  but characters in other planes are encoded as a surrogate pair of two
  UTF-16 code units.

  A unit in the low-surrogate range (0xdc00-0xdfff) that is not preceded by a
  high surrogate is not rejected; it is passed through as a single unit.

  @param[in]  i  Pointer to UTF-16 code units; a second unit is read only
                 when the first one is a high surrogate (0xd800-0xdbff)
  @param[out] u  Receives the UTF-32 character; not written when 0 is returned

  @return Number of UTF-16 code units consumed (1 or 2), or 0 if a high
          surrogate is not followed by a low surrogate.
*/
int utf16toutf32(UTF16 *i, UTF32 *u)
{
  UTF32 hi, lo;

  if (*i < 0xd800 || *i > 0xdbff)
  {
    *u= *i;
    return 1;
  }

  hi= i[0];
  lo= i[1];
  if (lo < 0xdc00 || lo > 0xdfff) /* invalid */
    return 0;

  /*
    A surrogate pair carries (code point - 0x10000) split into 10 + 10 bits,
    so the offset has to be added back. Combining it with a bitwise OR loses
    the carry for every code point from U+20000 upwards.
  */
  *u= 0x10000 + ((hi & 0x3ff) << 10) + (lo & 0x3ff);
  return 2;
}
64 
65 
/**
  Convert UTF-32 character to UTF-16 code unit(s).

  Characters up to and including U+FFFF are stored as a single code unit.
  Characters from U+10000 to U+10FFFF are stored as a surrogate pair (high
  surrogate first). Values in the surrogate range 0xd800-0xdfff are not
  rejected; they are stored as a single unit.

  @param[in]  i  UTF-32 character
  @param[out] u  Pointer to UTF-16 code units; must have room for two units

  @return Number of UTF-16 code units produced (1 or 2), or 0 if the
          character is above U+10FFFF, in which case nothing is written.
*/
int utf32toutf16(UTF32 i, UTF16 *u)
{
  if (i < 0x10000) /* whole BMP, U+FFFF included */
  {
    *u= (UTF16)i;
    return 1;
  }
  else if (i <= 0x10ffff) /* U+10FFFF is the last valid code point */
  {
    /* Split (code point - 0x10000) into two 10-bit halves. */
    i-= 0x10000;
    *u++= (UTF16)(0xd800 | (i >> 10));
    *u= (UTF16)(0xdc00 | (i & 0x3ff));
    return 2;
  }
  return 0;
}
90 
91 
/**
  Convert UTF-8 octets to a UTF-32 character. It may take up to four
  UTF-8 octets to encode one UTF-32 character.

  The sequence length is taken from the lead octet. Every following octet
  must have the form 10xxxxxx; reading stops at the first octet that does
  not. Overlong encodings and encoded surrogates are not rejected.

  @param[in]  i  Pointer to UTF-8 octets
  @param[out] u  Receives the UTF-32 character; not written when 0 is returned

  @return Number of UTF-8 octets consumed (1 to 4), or 0 if an invalid
          character was encountered (bad lead octet or bad continuation
          octet).
*/
int utf8toutf32(UTF8 *i, UTF32 *u)
{
  int len, x;
  UTF32 c;

  if (*i < 0x80)
  {
    *u= *i;
    return 1;
  }
  else if (*i < 0xc0) /* invalid: continuation octet in lead position */
  {
    return 0;
  }
  else if (*i < 0xe0)
  {
    len= 2;
    c= *i & 0x1f;
  }
  else if (*i < 0xf0)
  {
    len= 3;
    c= *i & 0x0f;
  }
  else if (*i < 0xf8)
  {
    len= 4;
    c= *i & 0x07;
  }
  else /* invalid: 0xf8-0xff is not a lead octet of a 1-4 octet sequence */
  {
    return 0;
  }

  for (x= 1; x < len; ++x)
  {
    if ((i[x] & 0xc0) != 0x80) /* invalid */
      return 0;
    c= (c << 6) | (i[x] & 0x3f);
  }

  *u= c;
  return len;
}
138 
139 
/**
  Convert a UTF-32 character into UTF-8 octets. It may take four UTF-8
  octets to encode one UTF-32 character.

  Values in the surrogate range 0xd800-0xdfff are not rejected; they are
  encoded as three octets.

  @param[in]  i  UTF-32 character
  @param[out] c  Pointer to UTF-8 octets; must have room for four octets

  @return Number of UTF-8 octets produced (1 to 4), or 0 if the character
          is above U+10FFFF, in which case nothing is written.
*/
int utf32toutf8(UTF32 i, UTF8 *c)
{
  int len, x;

  if (i < 0x80)
  {
    *c= (UTF8)i;
    return 1;
  }
  else if (i < 0x800)
  {
    *c++= (UTF8)(0xc0 | (i >> 6));
    len= 2;
  }
  else if (i < 0x10000)
  {
    *c++= (UTF8)(0xe0 | (i >> 12));
    len= 3;
  }
  else if (i <= 0x10ffff) /* U+10FFFF is the last valid code point */
  {
    *c++= (UTF8)(0xf0 | (i >> 18));
    len= 4;
  }
  else
  {
    return 0;
  }

  /* Continuation octets, most significant 6-bit group first. */
  for (x= len - 2; x >= 0; --x)
    *c++= (UTF8)(0x80 | ((i >> (6 * x)) & 0x3f));

  return len;
}
183 
184 
185 #ifdef UCTEST
186 
187 #include <assert.h>
188 #include <string.h>
189 #include <stdio.h>
190 
/* One UTF-8 <-> UTF-32 test vector. Members are filled positionally. */
typedef struct
{
  UTF8  u8[4];  /* UTF-8 octets; unused trailing slots are 0 */
  UTF32 u32;    /* the same character as UTF-32 */
  int   cnt;    /* number of octets both conversions must report */
} t_8_32;
196 
/* One UTF-16 <-> UTF-32 test vector. Members are filled positionally. */
typedef struct
{
  UTF16 u16[2]; /* UTF-16 code units; unused second slot is 0 */
  UTF32 u32;    /* the same character as UTF-32 */
  int   cnt;    /* number of code units both conversions must report */
} t_16_32;
202 
t1()203 void t1()
204 {
205   int i, j;
206   t_8_32 t1[]= {
207       {{0, 0, 0, 0}, 0, 1},
208       {{0x3c, 0, 0, 0}, 0x3c, 1},
209       {{0xc3, 0xbe, 0, 0}, 0xfe, 2},
210       {{0xe0, 0xa4, 0x96, 0}, 0x916, 3},
211       {{0xf0, 0x90, 0x85, 0xad}, 0x1016d, 4}
212   };
213   printf("***** T1 -> utf32<->utf8 *****\n");
214   for (i= 0; i < sizeof(t1) / sizeof(t_8_32); ++i)
215   {
216     int cnt;
217     t_8_32 t= t1[i];
218     UTF8 res[4];
219     UTF32 resu;
220     memset(res, 0, 4);
221     printf("Convert %x\n", t.u32);
222     cnt= utf32toutf8(t.u32, res);
223     assert(cnt == t.cnt);
224     for (j= 0; j < 4; ++j)
225     {
226       printf("Res[%d] = 0x%x (expect 0x%x)\n", j, res[j], t.u8[j]);
227       assert(res[j] == t.u8[j]);
228     }
229     printf("Ok. Now back\n");
230     cnt= utf8toutf32(t.u8, &resu);
231     printf("ResU = %x\n", resu);
232     assert(cnt == t.cnt);
233     assert(resu == t.u32);
234   }
235 }
236 
t2()237 void t2()
238 {
239   int i, j;
240   t_16_32 t1[]= {
241       {{0, 0}, 0, 1},
242       {{0x7a, 0}, 0x7a, 1},
243       {{0x6c34, 0}, 0x6c34, 1},
244       {{0xd834, 0xdd1e}, 0x1d11e, 2}
245   };
246   printf("***** T2 -> utf32<->utf16 *****\n");
247   for (i= 0; i < sizeof(t1) / sizeof(t_16_32); ++i)
248   {
249     int cnt;
250     t_16_32 t= t1[i];
251     UTF16 res[2];
252     UTF32 resu;
253     memset(res, 0, 2 * 2);
254     printf("Convert %x\n", t.u32);
255     cnt= utf32toutf16(t.u32, res);
256     assert(cnt == t.cnt);
257     for (j = 0; j < 2; ++j)
258     {
259       printf("Res[%d] = 0x%x (expect 0x%x)\n", j, res[j], t.u16[j]);
260       assert(res[j] == t.u16[j]);
261     }
262     printf("Ok. Now back\n");
263     cnt= utf16toutf32(t.u16, &resu);
264     printf("ResU = %x\n", resu);
265     assert(cnt == t.cnt);
266     assert(resu == t.u32);
267   }
268 }
269 
/**
  Entry point of the stand-alone self-test, compiled only when UCTEST is
  defined. Runs t1() and t2(); failures are reported by the assert() calls
  inside them.

  @param[in] argc  Unused
  @param[in] argv  Unused

  @return 0 once both tests have run.
*/
int main(int argc, char **argv)
{
  (void)argc;
  (void)argv;

  t1();
  t2();

  /*
    Return instead of calling exit(0): exit() is declared in <stdlib.h>,
    which this test section does not include.
  */
  return 0;
}
276 #endif /* UCTEST */
277