1 /* $NetBSD: ucs4.c,v 1.4 2014/12/10 04:37:55 christos Exp $ */ 2 3 #ifndef lint 4 static char *rcsid = "Id: ucs4.c,v 1.1 2003/06/04 00:26:14 marka Exp "; 5 #endif 6 7 /* 8 * Copyright (c) 2001 Japan Network Information Center. All rights reserved. 9 * 10 * By using this file, you agree to the terms and conditions set forth bellow. 11 * 12 * LICENSE TERMS AND CONDITIONS 13 * 14 * The following License Terms and Conditions apply, unless a different 15 * license is obtained from Japan Network Information Center ("JPNIC"), 16 * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda, 17 * Chiyoda-ku, Tokyo 101-0047, Japan. 18 * 19 * 1. Use, Modification and Redistribution (including distribution of any 20 * modified or derived work) in source and/or binary forms is permitted 21 * under this License Terms and Conditions. 22 * 23 * 2. Redistribution of source code must retain the copyright notices as they 24 * appear in each source code file, this License Terms and Conditions. 25 * 26 * 3. Redistribution in binary form must reproduce the Copyright Notice, 27 * this License Terms and Conditions, in the documentation and/or other 28 * materials provided with the distribution. For the purposes of binary 29 * distribution the "Copyright Notice" refers to the following language: 30 * "Copyright (c) 2000-2002 Japan Network Information Center. All rights reserved." 31 * 32 * 4. The name of JPNIC may not be used to endorse or promote products 33 * derived from this Software without specific prior written approval of 34 * JPNIC. 35 * 36 * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC 37 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 38 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 39 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JPNIC BE LIABLE 40 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 41 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 42 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 43 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 44 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 45 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 46 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 47 */ 48 49 #include <config.h> 50 51 #include <stddef.h> 52 #include <stdlib.h> 53 #include <string.h> 54 55 #include <idn/assert.h> 56 #include <idn/result.h> 57 #include <idn/logmacro.h> 58 #include <idn/util.h> 59 #include <idn/ucs4.h> 60 #include <idn/debug.h> 61 62 /* 63 * Unicode surrogate pair. 64 */ 65 #define IS_SURROGATE_HIGH(v) (0xd800 <= (v) && (v) <= 0xdbff) 66 #define IS_SURROGATE_LOW(v) (0xdc00 <= (v) && (v) <= 0xdfff) 67 #define SURROGATE_HIGH(v) (SURROGATE_H_OFF + (((v) - 0x10000) >> 10)) 68 #define SURROGATE_LOW(v) (SURROGATE_L_OFF + ((v) & 0x3ff)) 69 #define SURROGATE_BASE 0x10000 70 #define SURROGATE_H_OFF 0xd800 71 #define SURROGATE_L_OFF 0xdc00 72 #define COMBINE_SURROGATE(h, l) \ 73 (SURROGATE_BASE + (((h)-SURROGATE_H_OFF)<<10) + ((l)-SURROGATE_L_OFF)) 74 75 /* 76 * ASCII ctype macros. 77 * Note that these macros evaluate the argument multiple times. Be careful. 78 */ 79 #define ASCII_TOUPPER(c) \ 80 (('a' <= (c) && (c) <= 'z') ? ((c) - 'a' + 'A') : (c)) 81 #define ASCII_TOLOWER(c) \ 82 (('A' <= (c) && (c) <= 'Z') ? ((c) - 'A' + 'a') : (c)) 83 84 idn_result_t 85 idn_ucs4_ucs4toutf16(const unsigned long *ucs4, unsigned short *utf16, 86 size_t tolen) { 87 unsigned short *utf16p = utf16; 88 unsigned long v; 89 idn_result_t r; 90 91 TRACE(("idn_ucs4_ucs4toutf16(ucs4=\"%s\", tolen=%d)\n", 92 idn__debug_ucs4xstring(ucs4, 50), (int)tolen)); 93 94 while (*ucs4 != '\0') { 95 v = *ucs4++; 96 97 if (IS_SURROGATE_LOW(v) || IS_SURROGATE_HIGH(v)) { 98 WARNING(("idn_ucs4_ucs4toutf16: UCS4 string contains " 99 "surrogate pair\n")); 100 r = idn_invalid_encoding; 101 goto ret; 102 } else if (v > 0xffff) { 103 /* Convert to surrogate pair */ 104 if (v >= 0x110000) { 105 r = idn_invalid_encoding; 106 goto ret; 107 } 108 if (tolen < 2) { 109 r = idn_buffer_overflow; 110 goto ret; 111 } 112 *utf16p++ = SURROGATE_HIGH(v); 113 *utf16p++ = SURROGATE_LOW(v); 114 tolen -= 2; 115 } else { 116 if (tolen < 1) { 117 r = idn_buffer_overflow; 118 goto ret; 119 } 120 *utf16p++ = v; 121 tolen--; 122 } 123 } 124 125 if (tolen < 1) { 126 r = idn_buffer_overflow; 127 goto ret; 128 } 129 *utf16p = '\0'; 130 131 r = idn_success; 132 ret: 133 if (r == idn_success) { 134 TRACE(("idn_ucs4_ucs4toutf16(): success (utf16=\"%s\")\n", 135 idn__debug_utf16xstring(utf16, 50))); 136 } else { 137 TRACE(("idn_ucs4_ucs4toutf16(): %s\n", 138 idn_result_tostring(r))); 139 } 140 return (r); 141 } 142 143 idn_result_t 144 idn_ucs4_utf16toucs4(const unsigned short *utf16, unsigned long *ucs4, 145 size_t tolen) { 146 unsigned long *ucs4p = ucs4; 147 unsigned short v0, v1; 148 idn_result_t r; 149 150 TRACE(("idn_ucs4_utf16toucs4(utf16=\"%s\", tolen=%d)\n", 151 idn__debug_utf16xstring(utf16, 50), (int)tolen)); 152 153 while (*utf16 != '\0') { 154 v0 = *utf16; 155 156 if (tolen < 1) { 157 r = idn_buffer_overflow; 158 goto ret; 159 } 160 161 if (IS_SURROGATE_HIGH(v0)) { 162 v1 = *(utf16 + 1); 163 if (!IS_SURROGATE_LOW(v1)) { 164 WARNING(("idn_ucs4_utf16toucs4: " 165 "corrupted surrogate pair\n")); 166 r = idn_invalid_encoding; 167 goto ret; 168 } 169 *ucs4p++ = COMBINE_SURROGATE(v0, v1); 170 tolen--; 171 utf16 += 2; 172 173 } else { 174 *ucs4p++ = v0; 175 tolen--; 176 utf16++; 177 178 } 179 } 180 181 if (tolen < 1) { 182 r = idn_buffer_overflow; 183 goto ret; 184 } 185 *ucs4p = '\0'; 186 187 r = idn_success; 188 ret: 189 if (r == idn_success) { 190 TRACE(("idn_ucs4_utf16toucs4(): success (ucs4=\"%s\")\n", 191 idn__debug_ucs4xstring(ucs4, 50))); 192 } else { 193 TRACE(("idn_ucs4_utf16toucs4(): %s\n", 194 idn_result_tostring(r))); 195 } 196 return (r); 197 } 198 199 idn_result_t 200 idn_ucs4_utf8toucs4(const char *utf8, unsigned long *ucs4, size_t tolen) { 201 const unsigned char *utf8p = (const unsigned char *)utf8; 202 unsigned long *ucs4p = ucs4; 203 unsigned long v, min; 204 unsigned char c; 205 int width; 206 int i; 207 idn_result_t r; 208 209 TRACE(("idn_ucs4_utf8toucs4(utf8=\"%s\", tolen=%d)\n", 210 idn__debug_xstring(utf8, 50), (int)tolen)); 211 212 while(*utf8p != '\0') { 213 c = *utf8p++; 214 if (c < 0x80) { 215 v = c; 216 min = 0; 217 width = 1; 218 } else if (c < 0xc0) { 219 WARNING(("idn_ucs4_utf8toucs4: invalid character\n")); 220 r = idn_invalid_encoding; 221 goto ret; 222 } else if (c < 0xe0) { 223 v = c & 0x1f; 224 min = 0x80; 225 width = 2; 226 } else if (c < 0xf0) { 227 v = c & 0x0f; 228 min = 0x800; 229 width = 3; 230 } else if (c < 0xf8) { 231 v = c & 0x07; 232 min = 0x10000; 233 width = 4; 234 } else if (c < 0xfc) { 235 v = c & 0x03; 236 min = 0x200000; 237 width = 5; 238 } else if (c < 0xfe) { 239 v = c & 0x01; 240 min = 0x4000000; 241 width = 6; 242 } else { 243 WARNING(("idn_ucs4_utf8toucs4: invalid character\n")); 244 r = idn_invalid_encoding; 245 goto ret; 246 } 247 248 for (i = width - 1; i > 0; i--) { 249 c = *utf8p++; 250 if (c < 0x80 || 0xc0 <= c) { 251 WARNING(("idn_ucs4_utf8toucs4: " 252 "invalid character\n")); 253 r = idn_invalid_encoding; 254 goto ret; 255 } 256 v = (v << 6) | (c & 0x3f); 257 } 258 259 if (v < min) { 260 WARNING(("idn_ucs4_utf8toucs4: invalid character\n")); 261 r = idn_invalid_encoding; 262 goto ret; 263 } 264 if (IS_SURROGATE_LOW(v) || IS_SURROGATE_HIGH(v)) { 265 WARNING(("idn_ucs4_utf8toucs4: UTF-8 string contains " 266 "surrogate pair\n")); 267 r = idn_invalid_encoding; 268 goto ret; 269 } 270 if (tolen < 1) { 271 r = idn_buffer_overflow; 272 goto ret; 273 } 274 tolen--; 275 *ucs4p++ = v; 276 } 277 278 if (tolen < 1) { 279 r = idn_buffer_overflow; 280 goto ret; 281 } 282 *ucs4p = '\0'; 283 284 r = idn_success; 285 ret: 286 if (r == idn_success) { 287 TRACE(("idn_ucs4_utf8toucs4(): success (ucs4=\"%s\")\n", 288 idn__debug_ucs4xstring(ucs4, 50))); 289 } else { 290 TRACE(("idn_ucs4_utf8toucs4(): %s\n", 291 idn_result_tostring(r))); 292 } 293 return (r); 294 } 295 296 idn_result_t 297 idn_ucs4_ucs4toutf8(const unsigned long *ucs4, char *utf8, size_t tolen) { 298 unsigned char *utf8p = (unsigned char *)utf8; 299 unsigned long v; 300 int width; 301 int mask; 302 int offset; 303 idn_result_t r; 304 305 TRACE(("idn_ucs4_ucs4toutf8(ucs4=\"%s\", tolen=%d)\n", 306 idn__debug_ucs4xstring(ucs4, 50), (int)tolen)); 307 308 while (*ucs4 != '\0') { 309 v = *ucs4++; 310 if (IS_SURROGATE_LOW(v) || IS_SURROGATE_HIGH(v)) { 311 WARNING(("idn_ucs4_ucs4toutf8: UCS4 string contains " 312 "surrogate pair\n")); 313 r = idn_invalid_encoding; 314 goto ret; 315 } 316 if (v < 0x80) { 317 mask = 0; 318 width = 1; 319 } else if (v < 0x800) { 320 mask = 0xc0; 321 width = 2; 322 } else if (v < 0x10000) { 323 mask = 0xe0; 324 width = 3; 325 } else if (v < 0x200000) { 326 mask = 0xf0; 327 width = 4; 328 } else if (v < 0x4000000) { 329 mask = 0xf8; 330 width = 5; 331 } else if (v < 0x80000000) { 332 mask = 0xfc; 333 width = 6; 334 } else { 335 WARNING(("idn_ucs4_ucs4toutf8: invalid character\n")); 336 r = idn_invalid_encoding; 337 goto ret; 338 } 339 340 if (tolen < width) { 341 r = idn_buffer_overflow; 342 goto ret; 343 } 344 offset = 6 * (width - 1); 345 *utf8p++ = (v >> offset) | mask; 346 mask = 0x80; 347 while (offset > 0) { 348 offset -= 6; 349 *utf8p++ = ((v >> offset) & 0x3f) | mask; 350 } 351 tolen -= width; 352 } 353 354 if (tolen < 1) { 355 r = idn_buffer_overflow; 356 goto ret; 357 } 358 *utf8p = '\0'; 359 360 r = idn_success; 361 ret: 362 if (r == idn_success) { 363 TRACE(("idn_ucs4_ucs4toutf8(): success (utf8=\"%s\")\n", 364 idn__debug_xstring(utf8, 50))); 365 } else { 366 TRACE(("idn_ucs4_ucs4toutf8(): %s\n", 367 idn_result_tostring(r))); 368 } 369 return (r); 370 } 371 372 size_t 373 idn_ucs4_strlen(const unsigned long *ucs4) { 374 size_t len; 375 376 for (len = 0; *ucs4 != '\0'; ucs4++, len++) 377 /* nothing to do */ ; 378 379 return (len); 380 } 381 382 unsigned long * 383 idn_ucs4_strcpy(unsigned long *to, const unsigned long *from) { 384 unsigned long *result = to; 385 386 while (*from != '\0') 387 *to++ = *from++; 388 *to = '\0'; 389 390 return (result); 391 } 392 393 unsigned long * 394 idn_ucs4_strcat(unsigned long *to, const unsigned long *from) { 395 unsigned long *result = to; 396 397 while (*to != '\0') 398 to++; 399 400 while (*from != '\0') 401 *to++ = *from++; 402 *to = '\0'; 403 404 return (result); 405 } 406 407 int 408 idn_ucs4_strcmp(const unsigned long *str1, const unsigned long *str2) { 409 while (*str1 != '\0') { 410 if (*str1 > *str2) 411 return (1); 412 else if (*str1 < *str2) 413 return (-1); 414 str1++; 415 str2++; 416 } 417 418 if (*str1 > *str2) 419 return (1); 420 else if (*str1 < *str2) 421 return (-1); 422 423 return (0); 424 } 425 426 int 427 idn_ucs4_strcasecmp(const unsigned long *str1, const unsigned long *str2) { 428 unsigned long c1, c2; 429 430 while (*str1 != '\0') { 431 c1 = ASCII_TOLOWER(*str1); 432 c2 = ASCII_TOLOWER(*str2); 433 if (c1 > c2) 434 return (1); 435 else if (c1 < c2) 436 return (-1); 437 str1++; 438 str2++; 439 } 440 441 c1 = ASCII_TOLOWER(*str1); 442 c2 = ASCII_TOLOWER(*str2); 443 if (c1 > c2) 444 return (1); 445 else if (c1 < c2) 446 return (-1); 447 448 return (0); 449 } 450 451 452 unsigned long * 453 idn_ucs4_strdup(const unsigned long *str) { 454 size_t length = idn_ucs4_strlen(str); 455 unsigned long *dupstr; 456 457 dupstr = (unsigned long *)malloc(sizeof(*str) * (length + 1)); 458 if (dupstr == NULL) 459 return NULL; 460 memcpy(dupstr, str, sizeof(*str) * (length + 1)); 461 462 return dupstr; 463 } 464