1 /* $NetBSD: ucs4.c,v 1.4 2014/12/10 04:37:55 christos Exp $ */
2
#ifndef lint
/* RCS identification string for this file; compiled out when 'lint' is defined. */
static char *rcsid = "Id: ucs4.c,v 1.1 2003/06/04 00:26:14 marka Exp ";
#endif
6
7 /*
8 * Copyright (c) 2001 Japan Network Information Center. All rights reserved.
9 *
10 * By using this file, you agree to the terms and conditions set forth bellow.
11 *
12 * LICENSE TERMS AND CONDITIONS
13 *
14 * The following License Terms and Conditions apply, unless a different
15 * license is obtained from Japan Network Information Center ("JPNIC"),
16 * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda,
17 * Chiyoda-ku, Tokyo 101-0047, Japan.
18 *
19 * 1. Use, Modification and Redistribution (including distribution of any
20 * modified or derived work) in source and/or binary forms is permitted
21 * under this License Terms and Conditions.
22 *
23 * 2. Redistribution of source code must retain the copyright notices as they
24 * appear in each source code file, this License Terms and Conditions.
25 *
26 * 3. Redistribution in binary form must reproduce the Copyright Notice,
27 * this License Terms and Conditions, in the documentation and/or other
28 * materials provided with the distribution. For the purposes of binary
29 * distribution the "Copyright Notice" refers to the following language:
30 * "Copyright (c) 2000-2002 Japan Network Information Center. All rights reserved."
31 *
32 * 4. The name of JPNIC may not be used to endorse or promote products
33 * derived from this Software without specific prior written approval of
34 * JPNIC.
35 *
36 * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
37 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
38 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
39 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JPNIC BE LIABLE
40 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
41 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
42 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
43 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
44 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
45 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
46 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
47 */
48
49 #include <config.h>
50
51 #include <stddef.h>
52 #include <stdlib.h>
53 #include <string.h>
54
55 #include <idn/assert.h>
56 #include <idn/result.h>
57 #include <idn/logmacro.h>
58 #include <idn/util.h>
59 #include <idn/ucs4.h>
60 #include <idn/debug.h>
61
/*
 * UTF-16 surrogate pair helpers.
 *
 * A code point at or above SURROGATE_BASE is split into a high
 * surrogate (SURROGATE_H_OFF plus the upper bits of its offset from
 * SURROGATE_BASE) and a low surrogate (SURROGATE_L_OFF plus its low
 * 10 bits).  COMBINE_SURROGATE() is the inverse operation.
 *
 * Note that some of these macros evaluate their argument more than once.
 */
#define SURROGATE_BASE		0x10000
#define SURROGATE_H_OFF		0xd800
#define SURROGATE_L_OFF		0xdc00

/* True if v lies in the high surrogate range 0xd800..0xdbff. */
#define IS_SURROGATE_HIGH(v)	((v) >= 0xd800 && (v) <= 0xdbff)
/* True if v lies in the low surrogate range 0xdc00..0xdfff. */
#define IS_SURROGATE_LOW(v)	((v) >= 0xdc00 && (v) <= 0xdfff)

/* High and low halves of the surrogate pair for code point v. */
#define SURROGATE_HIGH(v) \
	(SURROGATE_H_OFF + (((v) - SURROGATE_BASE) >> 10))
#define SURROGATE_LOW(v) \
	(SURROGATE_L_OFF + ((v) & 0x3ff))

/* Rebuild the code point from high surrogate h and low surrogate l. */
#define COMBINE_SURROGATE(h, l) \
	(SURROGATE_BASE + (((h) - SURROGATE_H_OFF) << 10) + \
	 ((l) - SURROGATE_L_OFF))
74
/*
 * ASCII-only case conversion.
 * Values outside 'a'..'z' (resp. 'A'..'Z') are returned unchanged.
 * Beware: the argument is evaluated more than once, so it must not
 * have side effects.
 */
#define ASCII_TOUPPER(c) \
	(((c) < 'a' || 'z' < (c)) ? (c) : ((c) - 'a' + 'A'))
#define ASCII_TOLOWER(c) \
	(((c) < 'A' || 'Z' < (c)) ? (c) : ((c) - 'A' + 'a'))
83
84 idn_result_t
idn_ucs4_ucs4toutf16(const unsigned long * ucs4,unsigned short * utf16,size_t tolen)85 idn_ucs4_ucs4toutf16(const unsigned long *ucs4, unsigned short *utf16,
86 size_t tolen) {
87 unsigned short *utf16p = utf16;
88 unsigned long v;
89 idn_result_t r;
90
91 TRACE(("idn_ucs4_ucs4toutf16(ucs4=\"%s\", tolen=%d)\n",
92 idn__debug_ucs4xstring(ucs4, 50), (int)tolen));
93
94 while (*ucs4 != '\0') {
95 v = *ucs4++;
96
97 if (IS_SURROGATE_LOW(v) || IS_SURROGATE_HIGH(v)) {
98 WARNING(("idn_ucs4_ucs4toutf16: UCS4 string contains "
99 "surrogate pair\n"));
100 r = idn_invalid_encoding;
101 goto ret;
102 } else if (v > 0xffff) {
103 /* Convert to surrogate pair */
104 if (v >= 0x110000) {
105 r = idn_invalid_encoding;
106 goto ret;
107 }
108 if (tolen < 2) {
109 r = idn_buffer_overflow;
110 goto ret;
111 }
112 *utf16p++ = SURROGATE_HIGH(v);
113 *utf16p++ = SURROGATE_LOW(v);
114 tolen -= 2;
115 } else {
116 if (tolen < 1) {
117 r = idn_buffer_overflow;
118 goto ret;
119 }
120 *utf16p++ = v;
121 tolen--;
122 }
123 }
124
125 if (tolen < 1) {
126 r = idn_buffer_overflow;
127 goto ret;
128 }
129 *utf16p = '\0';
130
131 r = idn_success;
132 ret:
133 if (r == idn_success) {
134 TRACE(("idn_ucs4_ucs4toutf16(): success (utf16=\"%s\")\n",
135 idn__debug_utf16xstring(utf16, 50)));
136 } else {
137 TRACE(("idn_ucs4_ucs4toutf16(): %s\n",
138 idn_result_tostring(r)));
139 }
140 return (r);
141 }
142
143 idn_result_t
idn_ucs4_utf16toucs4(const unsigned short * utf16,unsigned long * ucs4,size_t tolen)144 idn_ucs4_utf16toucs4(const unsigned short *utf16, unsigned long *ucs4,
145 size_t tolen) {
146 unsigned long *ucs4p = ucs4;
147 unsigned short v0, v1;
148 idn_result_t r;
149
150 TRACE(("idn_ucs4_utf16toucs4(utf16=\"%s\", tolen=%d)\n",
151 idn__debug_utf16xstring(utf16, 50), (int)tolen));
152
153 while (*utf16 != '\0') {
154 v0 = *utf16;
155
156 if (tolen < 1) {
157 r = idn_buffer_overflow;
158 goto ret;
159 }
160
161 if (IS_SURROGATE_HIGH(v0)) {
162 v1 = *(utf16 + 1);
163 if (!IS_SURROGATE_LOW(v1)) {
164 WARNING(("idn_ucs4_utf16toucs4: "
165 "corrupted surrogate pair\n"));
166 r = idn_invalid_encoding;
167 goto ret;
168 }
169 *ucs4p++ = COMBINE_SURROGATE(v0, v1);
170 tolen--;
171 utf16 += 2;
172
173 } else {
174 *ucs4p++ = v0;
175 tolen--;
176 utf16++;
177
178 }
179 }
180
181 if (tolen < 1) {
182 r = idn_buffer_overflow;
183 goto ret;
184 }
185 *ucs4p = '\0';
186
187 r = idn_success;
188 ret:
189 if (r == idn_success) {
190 TRACE(("idn_ucs4_utf16toucs4(): success (ucs4=\"%s\")\n",
191 idn__debug_ucs4xstring(ucs4, 50)));
192 } else {
193 TRACE(("idn_ucs4_utf16toucs4(): %s\n",
194 idn_result_tostring(r)));
195 }
196 return (r);
197 }
198
199 idn_result_t
idn_ucs4_utf8toucs4(const char * utf8,unsigned long * ucs4,size_t tolen)200 idn_ucs4_utf8toucs4(const char *utf8, unsigned long *ucs4, size_t tolen) {
201 const unsigned char *utf8p = (const unsigned char *)utf8;
202 unsigned long *ucs4p = ucs4;
203 unsigned long v, min;
204 unsigned char c;
205 int width;
206 int i;
207 idn_result_t r;
208
209 TRACE(("idn_ucs4_utf8toucs4(utf8=\"%s\", tolen=%d)\n",
210 idn__debug_xstring(utf8, 50), (int)tolen));
211
212 while(*utf8p != '\0') {
213 c = *utf8p++;
214 if (c < 0x80) {
215 v = c;
216 min = 0;
217 width = 1;
218 } else if (c < 0xc0) {
219 WARNING(("idn_ucs4_utf8toucs4: invalid character\n"));
220 r = idn_invalid_encoding;
221 goto ret;
222 } else if (c < 0xe0) {
223 v = c & 0x1f;
224 min = 0x80;
225 width = 2;
226 } else if (c < 0xf0) {
227 v = c & 0x0f;
228 min = 0x800;
229 width = 3;
230 } else if (c < 0xf8) {
231 v = c & 0x07;
232 min = 0x10000;
233 width = 4;
234 } else if (c < 0xfc) {
235 v = c & 0x03;
236 min = 0x200000;
237 width = 5;
238 } else if (c < 0xfe) {
239 v = c & 0x01;
240 min = 0x4000000;
241 width = 6;
242 } else {
243 WARNING(("idn_ucs4_utf8toucs4: invalid character\n"));
244 r = idn_invalid_encoding;
245 goto ret;
246 }
247
248 for (i = width - 1; i > 0; i--) {
249 c = *utf8p++;
250 if (c < 0x80 || 0xc0 <= c) {
251 WARNING(("idn_ucs4_utf8toucs4: "
252 "invalid character\n"));
253 r = idn_invalid_encoding;
254 goto ret;
255 }
256 v = (v << 6) | (c & 0x3f);
257 }
258
259 if (v < min) {
260 WARNING(("idn_ucs4_utf8toucs4: invalid character\n"));
261 r = idn_invalid_encoding;
262 goto ret;
263 }
264 if (IS_SURROGATE_LOW(v) || IS_SURROGATE_HIGH(v)) {
265 WARNING(("idn_ucs4_utf8toucs4: UTF-8 string contains "
266 "surrogate pair\n"));
267 r = idn_invalid_encoding;
268 goto ret;
269 }
270 if (tolen < 1) {
271 r = idn_buffer_overflow;
272 goto ret;
273 }
274 tolen--;
275 *ucs4p++ = v;
276 }
277
278 if (tolen < 1) {
279 r = idn_buffer_overflow;
280 goto ret;
281 }
282 *ucs4p = '\0';
283
284 r = idn_success;
285 ret:
286 if (r == idn_success) {
287 TRACE(("idn_ucs4_utf8toucs4(): success (ucs4=\"%s\")\n",
288 idn__debug_ucs4xstring(ucs4, 50)));
289 } else {
290 TRACE(("idn_ucs4_utf8toucs4(): %s\n",
291 idn_result_tostring(r)));
292 }
293 return (r);
294 }
295
296 idn_result_t
idn_ucs4_ucs4toutf8(const unsigned long * ucs4,char * utf8,size_t tolen)297 idn_ucs4_ucs4toutf8(const unsigned long *ucs4, char *utf8, size_t tolen) {
298 unsigned char *utf8p = (unsigned char *)utf8;
299 unsigned long v;
300 int width;
301 int mask;
302 int offset;
303 idn_result_t r;
304
305 TRACE(("idn_ucs4_ucs4toutf8(ucs4=\"%s\", tolen=%d)\n",
306 idn__debug_ucs4xstring(ucs4, 50), (int)tolen));
307
308 while (*ucs4 != '\0') {
309 v = *ucs4++;
310 if (IS_SURROGATE_LOW(v) || IS_SURROGATE_HIGH(v)) {
311 WARNING(("idn_ucs4_ucs4toutf8: UCS4 string contains "
312 "surrogate pair\n"));
313 r = idn_invalid_encoding;
314 goto ret;
315 }
316 if (v < 0x80) {
317 mask = 0;
318 width = 1;
319 } else if (v < 0x800) {
320 mask = 0xc0;
321 width = 2;
322 } else if (v < 0x10000) {
323 mask = 0xe0;
324 width = 3;
325 } else if (v < 0x200000) {
326 mask = 0xf0;
327 width = 4;
328 } else if (v < 0x4000000) {
329 mask = 0xf8;
330 width = 5;
331 } else if (v < 0x80000000) {
332 mask = 0xfc;
333 width = 6;
334 } else {
335 WARNING(("idn_ucs4_ucs4toutf8: invalid character\n"));
336 r = idn_invalid_encoding;
337 goto ret;
338 }
339
340 if (tolen < width) {
341 r = idn_buffer_overflow;
342 goto ret;
343 }
344 offset = 6 * (width - 1);
345 *utf8p++ = (v >> offset) | mask;
346 mask = 0x80;
347 while (offset > 0) {
348 offset -= 6;
349 *utf8p++ = ((v >> offset) & 0x3f) | mask;
350 }
351 tolen -= width;
352 }
353
354 if (tolen < 1) {
355 r = idn_buffer_overflow;
356 goto ret;
357 }
358 *utf8p = '\0';
359
360 r = idn_success;
361 ret:
362 if (r == idn_success) {
363 TRACE(("idn_ucs4_ucs4toutf8(): success (utf8=\"%s\")\n",
364 idn__debug_xstring(utf8, 50)));
365 } else {
366 TRACE(("idn_ucs4_ucs4toutf8(): %s\n",
367 idn_result_tostring(r)));
368 }
369 return (r);
370 }
371
/*
 * Return the number of elements in the NUL-terminated UCS-4 string
 * 'ucs4', not counting the terminator.
 */
size_t
idn_ucs4_strlen(const unsigned long *ucs4) {
	const unsigned long *end = ucs4;

	while (*end != '\0')
		end++;

	return ((size_t)(end - ucs4));
}
381
/*
 * Copy the NUL-terminated UCS-4 string 'from', terminator included,
 * into 'to' and return 'to'.  No bounds checking is performed.
 */
unsigned long *
idn_ucs4_strcpy(unsigned long *to, const unsigned long *from) {
	size_t i;

	for (i = 0; from[i] != '\0'; i++)
		to[i] = from[i];
	to[i] = '\0';

	return (to);
}
392
/*
 * Append the NUL-terminated UCS-4 string 'from' to the end of the
 * NUL-terminated string in 'to' and return 'to'.  No bounds checking
 * is performed.
 */
unsigned long *
idn_ucs4_strcat(unsigned long *to, const unsigned long *from) {
	unsigned long *tail = to;

	/* Find the terminator of the destination string. */
	while (*tail != '\0')
		tail++;

	/* Copy element by element; the terminator is copied last. */
	while ((*tail = *from) != '\0') {
		tail++;
		from++;
	}

	return (to);
}
406
/*
 * Compare two NUL-terminated UCS-4 strings element by element.
 * Returns 1 if 'str1' sorts after 'str2', -1 if it sorts before,
 * and 0 if the strings are equal.
 */
int
idn_ucs4_strcmp(const unsigned long *str1, const unsigned long *str2) {
	size_t i = 0;

	/* Skip the common prefix; stop at a difference or the end of str1. */
	while (str1[i] != '\0' && str1[i] == str2[i])
		i++;

	if (str1[i] == str2[i])
		return (0);
	return ((str1[i] > str2[i]) ? 1 : -1);
}
425
/*
 * Compare two NUL-terminated UCS-4 strings, treating the ASCII letters
 * 'A'..'Z' as equal to 'a'..'z'.  Other values are compared as they are.
 * Returns 1 if 'str1' sorts after 'str2', -1 if it sorts before,
 * and 0 if the strings are equal.
 */
int
idn_ucs4_strcasecmp(const unsigned long *str1, const unsigned long *str2) {
	unsigned long c1, c2;

	for (;; str1++, str2++) {
		c1 = ASCII_TOLOWER(*str1);
		c2 = ASCII_TOLOWER(*str2);
		if (c1 != c2)
			return ((c1 > c2) ? 1 : -1);
		/* Equal so far; both strings end here if str1 does. */
		if (*str1 == '\0')
			return (0);
	}
}
450
451
/*
 * Return a copy of the NUL-terminated UCS-4 string 'str' in memory
 * obtained from malloc(), or NULL if the allocation fails.
 */
unsigned long *
idn_ucs4_strdup(const unsigned long *str) {
	/* Size in bytes of the string including its terminator. */
	size_t nbytes = sizeof(*str) * (idn_ucs4_strlen(str) + 1);
	unsigned long *copy = malloc(nbytes);

	if (copy != NULL)
		memcpy(copy, str, nbytes);

	return (copy);
}
464