1 /* $NetBSD: utf8.c,v 1.1.1.1 2011/04/13 18:16:00 elric Exp $ */ 2 3 /* 4 * Copyright (c) 2004, 2006, 2007, 2008 Kungliga Tekniska Högskolan 5 * (Royal Institute of Technology, Stockholm, Sweden). 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * 3. Neither the name of the Institute nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 */ 35 36 #include <config.h> 37 #include "windlocl.h" 38 39 static int 40 utf8toutf32(const unsigned char **pp, uint32_t *out) 41 { 42 const unsigned char *p = *pp; 43 unsigned c = *p; 44 45 if (c & 0x80) { 46 if ((c & 0xE0) == 0xC0) { 47 const unsigned c2 = *++p; 48 if ((c2 & 0xC0) == 0x80) { 49 *out = ((c & 0x1F) << 6) 50 | (c2 & 0x3F); 51 } else { 52 return WIND_ERR_INVALID_UTF8; 53 } 54 } else if ((c & 0xF0) == 0xE0) { 55 const unsigned c2 = *++p; 56 if ((c2 & 0xC0) == 0x80) { 57 const unsigned c3 = *++p; 58 if ((c3 & 0xC0) == 0x80) { 59 *out = ((c & 0x0F) << 12) 60 | ((c2 & 0x3F) << 6) 61 | (c3 & 0x3F); 62 } else { 63 return WIND_ERR_INVALID_UTF8; 64 } 65 } else { 66 return WIND_ERR_INVALID_UTF8; 67 } 68 } else if ((c & 0xF8) == 0xF0) { 69 const unsigned c2 = *++p; 70 if ((c2 & 0xC0) == 0x80) { 71 const unsigned c3 = *++p; 72 if ((c3 & 0xC0) == 0x80) { 73 const unsigned c4 = *++p; 74 if ((c4 & 0xC0) == 0x80) { 75 *out = ((c & 0x07) << 18) 76 | ((c2 & 0x3F) << 12) 77 | ((c3 & 0x3F) << 6) 78 | (c4 & 0x3F); 79 } else { 80 return WIND_ERR_INVALID_UTF8; 81 } 82 } else { 83 return WIND_ERR_INVALID_UTF8; 84 } 85 } else { 86 return WIND_ERR_INVALID_UTF8; 87 } 88 } else { 89 return WIND_ERR_INVALID_UTF8; 90 } 91 } else { 92 *out = c; 93 } 94 95 *pp = p; 96 97 return 0; 98 } 99 100 /** 101 * Convert an UTF-8 string to an UCS4 string. 102 * 103 * @param in an UTF-8 string to convert. 104 * @param out the resulting UCS4 strint, must be at least 105 * wind_utf8ucs4_length() long. If out is NULL, the function will 106 * calculate the needed space for the out variable (just like 107 * wind_utf8ucs4_length()). 108 * @param out_len before processing out_len should be the length of 109 * the out variable, after processing it will be the length of the out 110 * string. 111 * 112 * @return returns 0 on success, an wind error code otherwise 113 * @ingroup wind 114 */ 115 116 int 117 wind_utf8ucs4(const char *in, uint32_t *out, size_t *out_len) 118 { 119 const unsigned char *p; 120 size_t o = 0; 121 int ret; 122 123 for (p = (const unsigned char *)in; *p != '\0'; ++p) { 124 uint32_t u; 125 126 ret = utf8toutf32(&p, &u); 127 if (ret) 128 return ret; 129 130 if (out) { 131 if (o >= *out_len) 132 return WIND_ERR_OVERRUN; 133 out[o] = u; 134 } 135 o++; 136 } 137 *out_len = o; 138 return 0; 139 } 140 141 /** 142 * Calculate the length of from converting a UTF-8 string to a UCS4 143 * string. 144 * 145 * @param in an UTF-8 string to convert. 146 * @param out_len the length of the resulting UCS4 string. 147 * 148 * @return returns 0 on success, an wind error code otherwise 149 * @ingroup wind 150 */ 151 152 int 153 wind_utf8ucs4_length(const char *in, size_t *out_len) 154 { 155 return wind_utf8ucs4(in, NULL, out_len); 156 } 157 158 static const char first_char[4] = 159 { 0x00, 0xC0, 0xE0, 0xF0 }; 160 161 /** 162 * Convert an UCS4 string to a UTF-8 string. 163 * 164 * @param in an UCS4 string to convert. 165 * @param in_len the length input array. 166 167 * @param out the resulting UTF-8 strint, must be at least 168 * wind_ucs4utf8_length() + 1 long (the extra char for the NUL). If 169 * out is NULL, the function will calculate the needed space for the 170 * out variable (just like wind_ucs4utf8_length()). 171 172 * @param out_len before processing out_len should be the length of 173 * the out variable, after processing it will be the length of the out 174 * string. 175 * 176 * @return returns 0 on success, an wind error code otherwise 177 * @ingroup wind 178 */ 179 180 int 181 wind_ucs4utf8(const uint32_t *in, size_t in_len, char *out, size_t *out_len) 182 { 183 uint32_t ch; 184 size_t i, len, o; 185 186 for (o = 0, i = 0; i < in_len; i++) { 187 ch = in[i]; 188 189 if (ch < 0x80) { 190 len = 1; 191 } else if (ch < 0x800) { 192 len = 2; 193 } else if (ch < 0x10000) { 194 len = 3; 195 } else if (ch <= 0x10FFFF) { 196 len = 4; 197 } else 198 return WIND_ERR_INVALID_UTF32; 199 200 o += len; 201 202 if (out) { 203 if (o >= *out_len) 204 return WIND_ERR_OVERRUN; 205 206 switch(len) { 207 case 4: 208 out[3] = (ch | 0x80) & 0xbf; 209 ch = ch << 6; 210 case 3: 211 out[2] = (ch | 0x80) & 0xbf; 212 ch = ch << 6; 213 case 2: 214 out[1] = (ch | 0x80) & 0xbf; 215 ch = ch << 6; 216 case 1: 217 out[0] = ch | first_char[len - 1]; 218 } 219 } 220 out += len; 221 } 222 if (out) { 223 if (o + 1 >= *out_len) 224 return WIND_ERR_OVERRUN; 225 *out = '\0'; 226 } 227 *out_len = o; 228 return 0; 229 } 230 231 /** 232 * Calculate the length of from converting a UCS4 string to an UTF-8 string. 233 * 234 * @param in an UCS4 string to convert. 235 * @param in_len the length of UCS4 string to convert. 236 * @param out_len the length of the resulting UTF-8 string. 237 * 238 * @return returns 0 on success, an wind error code otherwise 239 * @ingroup wind 240 */ 241 242 int 243 wind_ucs4utf8_length(const uint32_t *in, size_t in_len, size_t *out_len) 244 { 245 return wind_ucs4utf8(in, in_len, NULL, out_len); 246 } 247 248 /** 249 * Read in an UCS2 from a buffer. 250 * 251 * @param ptr The input buffer to read from. 252 * @param len the length of the input buffer. 253 * @param flags Flags to control the behavior of the function. 254 * @param out the output UCS2, the array must be at least out/2 long. 255 * @param out_len the output length 256 * 257 * @return returns 0 on success, an wind error code otherwise. 258 * @ingroup wind 259 */ 260 261 int 262 wind_ucs2read(const void *ptr, size_t len, unsigned int *flags, 263 uint16_t *out, size_t *out_len) 264 { 265 const unsigned char *p = ptr; 266 int little = ((*flags) & WIND_RW_LE); 267 size_t olen = *out_len; 268 269 /** if len is zero, flags are unchanged */ 270 if (len == 0) { 271 *out_len = 0; 272 return 0; 273 } 274 275 /** if len is odd, WIND_ERR_LENGTH_NOT_MOD2 is returned */ 276 if (len & 1) 277 return WIND_ERR_LENGTH_NOT_MOD2; 278 279 /** 280 * If the flags WIND_RW_BOM is set, check for BOM. If not BOM is 281 * found, check is LE/BE flag is already and use that otherwise 282 * fail with WIND_ERR_NO_BOM. When done, clear WIND_RW_BOM and 283 * the LE/BE flag and set the resulting LE/BE flag. 284 */ 285 if ((*flags) & WIND_RW_BOM) { 286 uint16_t bom = (p[0] << 8) + p[1]; 287 if (bom == 0xfffe || bom == 0xfeff) { 288 little = (bom == 0xfffe); 289 p += 2; 290 len -= 2; 291 } else if (((*flags) & (WIND_RW_LE|WIND_RW_BE)) != 0) { 292 /* little already set */ 293 } else 294 return WIND_ERR_NO_BOM; 295 *flags = ((*flags) & ~(WIND_RW_BOM|WIND_RW_LE|WIND_RW_BE)); 296 *flags |= little ? WIND_RW_LE : WIND_RW_BE; 297 } 298 299 while (len) { 300 if (olen < 1) 301 return WIND_ERR_OVERRUN; 302 if (little) 303 *out = (p[1] << 8) + p[0]; 304 else 305 *out = (p[0] << 8) + p[1]; 306 out++; p += 2; len -= 2; olen--; 307 } 308 *out_len -= olen; 309 return 0; 310 } 311 312 /** 313 * Write an UCS2 string to a buffer. 314 * 315 * @param in The input UCS2 string. 316 * @param in_len the length of the input buffer. 317 * @param flags Flags to control the behavior of the function. 318 * @param ptr The input buffer to write to, the array must be at least 319 * (in + 1) * 2 bytes long. 320 * @param out_len the output length 321 * 322 * @return returns 0 on success, an wind error code otherwise. 323 * @ingroup wind 324 */ 325 326 int 327 wind_ucs2write(const uint16_t *in, size_t in_len, unsigned int *flags, 328 void *ptr, size_t *out_len) 329 { 330 unsigned char *p = ptr; 331 size_t len = *out_len; 332 333 /** If in buffer is not of length be mod 2, WIND_ERR_LENGTH_NOT_MOD2 is returned*/ 334 if (len & 1) 335 return WIND_ERR_LENGTH_NOT_MOD2; 336 337 /** On zero input length, flags are preserved */ 338 if (in_len == 0) { 339 *out_len = 0; 340 return 0; 341 } 342 /** If flags have WIND_RW_BOM set, the byte order mark is written 343 * first to the output data */ 344 if ((*flags) & WIND_RW_BOM) { 345 uint16_t bom = 0xfffe; 346 347 if (len < 2) 348 return WIND_ERR_OVERRUN; 349 350 if ((*flags) & WIND_RW_LE) { 351 p[0] = (bom >> 8) & 0xff; 352 p[1] = (bom ) & 0xff; 353 } else { 354 p[1] = (bom ) & 0xff; 355 p[0] = (bom >> 8) & 0xff; 356 } 357 len -= 2; 358 } 359 360 while (in_len) { 361 /** If the output wont fit into out_len, WIND_ERR_OVERRUN is returned */ 362 if (len < 2) 363 return WIND_ERR_OVERRUN; 364 if ((*flags) & WIND_RW_LE) { 365 p[0] = (in[0] >> 8) & 0xff; 366 p[1] = (in[0] ) & 0xff; 367 } else { 368 p[1] = (in[0] ) & 0xff; 369 p[0] = (in[0] >> 8) & 0xff; 370 } 371 len -= 2; 372 in_len--; 373 p += 2; 374 in++; 375 } 376 *out_len -= len; 377 return 0; 378 } 379 380 381 /** 382 * Convert an UTF-8 string to an UCS2 string. 383 * 384 * @param in an UTF-8 string to convert. 385 * @param out the resulting UCS2 strint, must be at least 386 * wind_utf8ucs2_length() long. If out is NULL, the function will 387 * calculate the needed space for the out variable (just like 388 * wind_utf8ucs2_length()). 389 * @param out_len before processing out_len should be the length of 390 * the out variable, after processing it will be the length of the out 391 * string. 392 * 393 * @return returns 0 on success, an wind error code otherwise 394 * @ingroup wind 395 */ 396 397 int 398 wind_utf8ucs2(const char *in, uint16_t *out, size_t *out_len) 399 { 400 const unsigned char *p; 401 size_t o = 0; 402 int ret; 403 404 for (p = (const unsigned char *)in; *p != '\0'; ++p) { 405 uint32_t u; 406 407 ret = utf8toutf32(&p, &u); 408 if (ret) 409 return ret; 410 411 if (u & 0xffff0000) 412 return WIND_ERR_NOT_UTF16; 413 414 if (out) { 415 if (o >= *out_len) 416 return WIND_ERR_OVERRUN; 417 out[o] = u; 418 } 419 o++; 420 } 421 *out_len = o; 422 return 0; 423 } 424 425 /** 426 * Calculate the length of from converting a UTF-8 string to a UCS2 427 * string. 428 * 429 * @param in an UTF-8 string to convert. 430 * @param out_len the length of the resulting UCS4 string. 431 * 432 * @return returns 0 on success, an wind error code otherwise 433 * @ingroup wind 434 */ 435 436 int 437 wind_utf8ucs2_length(const char *in, size_t *out_len) 438 { 439 return wind_utf8ucs2(in, NULL, out_len); 440 } 441 442 /** 443 * Convert an UCS2 string to a UTF-8 string. 444 * 445 * @param in an UCS2 string to convert. 446 * @param in_len the length of the in UCS2 string. 447 * @param out the resulting UTF-8 strint, must be at least 448 * wind_ucs2utf8_length() long. If out is NULL, the function will 449 * calculate the needed space for the out variable (just like 450 * wind_ucs2utf8_length()). 451 * @param out_len before processing out_len should be the length of 452 * the out variable, after processing it will be the length of the out 453 * string. 454 * 455 * @return returns 0 on success, an wind error code otherwise 456 * @ingroup wind 457 */ 458 459 int 460 wind_ucs2utf8(const uint16_t *in, size_t in_len, char *out, size_t *out_len) 461 { 462 uint16_t ch; 463 size_t i, len, o; 464 465 for (o = 0, i = 0; i < in_len; i++) { 466 ch = in[i]; 467 468 if (ch < 0x80) { 469 len = 1; 470 } else if (ch < 0x800) { 471 len = 2; 472 } else 473 len = 3; 474 475 o += len; 476 477 if (out) { 478 if (o >= *out_len) 479 return WIND_ERR_OVERRUN; 480 481 switch(len) { 482 case 3: 483 out[2] = (ch | 0x80) & 0xbf; 484 ch = ch << 6; 485 case 2: 486 out[1] = (ch | 0x80) & 0xbf; 487 ch = ch << 6; 488 case 1: 489 out[0] = ch | first_char[len - 1]; 490 } 491 out += len; 492 } 493 } 494 if (out) { 495 if (o >= *out_len) 496 return WIND_ERR_OVERRUN; 497 *out = '\0'; 498 } 499 *out_len = o; 500 return 0; 501 } 502 503 /** 504 * Calculate the length of from converting a UCS2 string to an UTF-8 string. 505 * 506 * @param in an UCS2 string to convert. 507 * @param in_len an UCS2 string length to convert. 508 * @param out_len the length of the resulting UTF-8 string. 509 * 510 * @return returns 0 on success, an wind error code otherwise 511 * @ingroup wind 512 */ 513 514 int 515 wind_ucs2utf8_length(const uint16_t *in, size_t in_len, size_t *out_len) 516 { 517 return wind_ucs2utf8(in, in_len, NULL, out_len); 518 } 519