1 /* $OpenBSD: citrus_utf8.c,v 1.4 2011/04/21 00:16:06 yasuoka Exp $ */ 2 3 /*- 4 * Copyright (c) 2002-2004 Tim J. Robbins 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include <sys/cdefs.h> 30 #include <sys/errno.h> 31 #include <sys/param.h> 32 #include <sys/types.h> 33 #include <sys/limits.h> 34 35 #include <errno.h> 36 #include <stdio.h> 37 #include <stdlib.h> 38 #include <stddef.h> 39 #include <string.h> 40 #include <wchar.h> 41 42 #include "citrus_ctype.h" 43 #include "citrus_utf8.h" 44 45 _CITRUS_CTYPE_DEF_OPS(utf8); 46 47 struct _utf8_state { 48 wchar_t ch; 49 int want; 50 wchar_t lbound; 51 }; 52 53 size_t 54 /*ARGSUSED*/ 55 _citrus_utf8_ctype_mbrtowc(wchar_t * __restrict pwc, 56 const char * __restrict s, size_t n, 57 void * __restrict pspriv) 58 { 59 struct _utf8_state *us; 60 int ch, i, mask, want; 61 wchar_t lbound, wch; 62 63 us = (struct _utf8_state *)pspriv; 64 65 if (us->want < 0 || us->want > _CITRUS_UTF8_MB_CUR_MAX) { 66 errno = EINVAL; 67 return ((size_t)-1); 68 } 69 70 if (s == NULL) { 71 s = ""; 72 n = 1; 73 pwc = NULL; 74 } 75 76 if (n == 0) { 77 /* Incomplete multibyte sequence */ 78 return ((size_t)-2); 79 } 80 81 if (us->want == 0 && ((ch = (unsigned char)*s) & ~0x7f) == 0) { 82 /* Fast path for plain ASCII characters. */ 83 if (pwc != NULL) 84 *pwc = ch; 85 return (ch != '\0' ? 1 : 0); 86 } 87 88 if (us->want == 0) { 89 /* 90 * Determine the number of octets that make up this character 91 * from the first octet, and a mask that extracts the 92 * interesting bits of the first octet. We already know 93 * the character is at least two bytes long. 94 * 95 * We also specify a lower bound for the character code to 96 * detect redundant, non-"shortest form" encodings. For 97 * example, the sequence C0 80 is _not_ a legal representation 98 * of the null character. This enforces a 1-to-1 mapping 99 * between character codes and their multibyte representations. 100 */ 101 ch = (unsigned char)*s; 102 if ((ch & 0x80) == 0) { 103 mask = 0x7f; 104 want = 1; 105 lbound = 0; 106 } else if ((ch & 0xe0) == 0xc0) { 107 mask = 0x1f; 108 want = 2; 109 lbound = 0x80; 110 } else if ((ch & 0xf0) == 0xe0) { 111 mask = 0x0f; 112 want = 3; 113 lbound = 0x800; 114 } else if ((ch & 0xf8) == 0xf0) { 115 mask = 0x07; 116 want = 4; 117 lbound = 0x10000; 118 } else { 119 /* 120 * Malformed input; input is not UTF-8. 121 * See RFC 3629. 122 */ 123 errno = EILSEQ; 124 return ((size_t)-1); 125 } 126 } else { 127 want = us->want; 128 lbound = us->lbound; 129 } 130 131 /* 132 * Decode the octet sequence representing the character in chunks 133 * of 6 bits, most significant first. 134 */ 135 if (us->want == 0) 136 wch = (unsigned char)*s++ & mask; 137 else 138 wch = us->ch; 139 for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) { 140 if ((*s & 0xc0) != 0x80) { 141 /* 142 * Malformed input; bad characters in the middle 143 * of a character. 144 */ 145 errno = EILSEQ; 146 return ((size_t)-1); 147 } 148 wch <<= 6; 149 wch |= *s++ & 0x3f; 150 } 151 if (i < want) { 152 /* Incomplete multibyte sequence. */ 153 us->want = want - i; 154 us->lbound = lbound; 155 us->ch = wch; 156 return ((size_t)-2); 157 } 158 if (wch < lbound) { 159 /* 160 * Malformed input; redundant encoding. 161 */ 162 errno = EILSEQ; 163 return ((size_t)-1); 164 } 165 if ((wch >= 0xd800 && wch <= 0xdfff) || 166 wch == 0xfffe || wch == 0xffff) { 167 /* 168 * Malformed input; invalid code points. 169 */ 170 errno = EILSEQ; 171 return ((size_t)-1); 172 } 173 if (pwc != NULL) 174 *pwc = wch; 175 us->want = 0; 176 return (wch == L'\0' ? 0 : want); 177 } 178 179 int 180 /*ARGSUSED*/ 181 _citrus_utf8_ctype_mbsinit(const void * __restrict pspriv) 182 { 183 return (pspriv == NULL || 184 ((const struct _utf8_state *)pspriv)->want == 0); 185 } 186 187 size_t 188 /*ARGSUSED*/ 189 _citrus_utf8_ctype_mbsrtowcs(wchar_t * __restrict pwcs, 190 const char ** __restrict s, size_t n, 191 void * __restrict pspriv) 192 { 193 struct _utf8_state *us; 194 const char *src; 195 size_t nchr; 196 wchar_t wc; 197 size_t nb; 198 199 us = (struct _utf8_state *)pspriv; 200 src = *s; 201 nchr = 0; 202 203 if (pwcs == NULL) { 204 /* 205 * The fast path in the loop below is not safe if an ASCII 206 * character appears as anything but the first byte of a 207 * multibyte sequence. Check now to avoid doing it in the loop. 208 */ 209 if (us->want > 0 && (signed char)*src > 0) { 210 errno = EILSEQ; 211 return ((size_t)-1); 212 } 213 for (;;) { 214 if ((signed char)*src > 0) { 215 /* 216 * Fast path for plain ASCII characters 217 * excluding NUL. 218 */ 219 nb = 1; 220 } else { 221 nb = _citrus_utf8_ctype_mbrtowc(&wc, src, 222 _CITRUS_UTF8_MB_CUR_MAX, us); 223 if (nb == (size_t)-1) { 224 /* Invalid sequence. */ 225 return (nb); 226 } 227 if (nb == 0 || nb == (size_t)-2) { 228 return (nchr); 229 } 230 } 231 232 src += nb; 233 nchr++; 234 } 235 /*NOTREACHED*/ 236 } 237 238 /* 239 * The fast path in the loop below is not safe if an ASCII 240 * character appears as anything but the first byte of a 241 * multibyte sequence. Check now to avoid doing it in the loop. 242 */ 243 if (n > 0 && us->want > 0 && (signed char)*src > 0) { 244 errno = EILSEQ; 245 return ((size_t)-1); 246 } 247 while (n-- > 0) { 248 if ((signed char)*src > 0) { 249 /* 250 * Fast path for plain ASCII characters 251 * excluding NUL. 252 */ 253 *pwcs = (wchar_t)*src; 254 nb = 1; 255 } else { 256 nb = _citrus_utf8_ctype_mbrtowc(pwcs, src, 257 _CITRUS_UTF8_MB_CUR_MAX, us); 258 if (nb == (size_t)-1) { 259 *s = src; 260 return (nb); 261 } 262 if (nb == (size_t)-2) { 263 *s = src; 264 return (nchr); 265 } 266 if (nb == 0) { 267 *s = NULL; 268 return (nchr); 269 } 270 } 271 src += nb; 272 nchr++; 273 pwcs++; 274 } 275 *s = src; 276 return (nchr); 277 } 278 279 size_t 280 /*ARGSUSED*/ 281 _citrus_utf8_ctype_wcrtomb(char * __restrict s, 282 wchar_t wc, void * __restrict pspriv) 283 { 284 struct _utf8_state *us; 285 unsigned char lead; 286 int i, len; 287 288 us = (struct _utf8_state *)pspriv; 289 290 if (us->want != 0) { 291 errno = EINVAL; 292 return ((size_t)-1); 293 } 294 295 if (s == NULL) { 296 /* Reset to initial shift state (no-op) */ 297 return (1); 298 } 299 300 if ((wc & ~0x7f) == 0) { 301 /* Fast path for plain ASCII characters. */ 302 *s = (char)wc; 303 return (1); 304 } 305 306 /* 307 * Determine the number of octets needed to represent this character. 308 * We always output the shortest sequence possible. Also specify the 309 * first few bits of the first octet, which contains the information 310 * about the sequence length. 311 */ 312 if ((wc & ~0x7f) == 0) { 313 lead = 0; 314 len = 1; 315 } else if ((wc & ~0x7ff) == 0) { 316 lead = 0xc0; 317 len = 2; 318 } else if ((wc & ~0xffff) == 0) { 319 lead = 0xe0; 320 len = 3; 321 } else if ((wc & ~0x1fffff) == 0) { 322 lead = 0xf0; 323 len = 4; 324 } else { 325 errno = EILSEQ; 326 return ((size_t)-1); 327 } 328 329 /* 330 * Output the octets representing the character in chunks 331 * of 6 bits, least significant last. The first octet is 332 * a special case because it contains the sequence length 333 * information. 334 */ 335 for (i = len - 1; i > 0; i--) { 336 s[i] = (wc & 0x3f) | 0x80; 337 wc >>= 6; 338 } 339 *s = (wc & 0xff) | lead; 340 341 return (len); 342 } 343 344 size_t 345 /*ARGSUSED*/ 346 _citrus_utf8_ctype_wcsrtombs(char * __restrict s, 347 const wchar_t ** __restrict pwcs, size_t n, 348 void * __restrict pspriv) 349 { 350 struct _utf8_state *us; 351 char buf[_CITRUS_UTF8_MB_CUR_MAX]; 352 const wchar_t *src; 353 size_t nbytes; 354 size_t nb; 355 356 us = (struct _utf8_state *)pspriv; 357 358 if (us->want != 0) { 359 errno = EINVAL; 360 return ((size_t)-1); 361 } 362 363 src = *pwcs; 364 nbytes = 0; 365 366 if (s == NULL) { 367 for (;;) { 368 if (0 <= *src && *src < 0x80) 369 /* Fast path for plain ASCII characters. */ 370 nb = 1; 371 else { 372 nb = _citrus_utf8_ctype_wcrtomb(buf, *src, us); 373 if (nb == (size_t)-1) { 374 /* Invalid character */ 375 return (nb); 376 } 377 } 378 if (*src == L'\0') { 379 return (nbytes + nb - 1); 380 } 381 src++; 382 nbytes += nb; 383 } 384 /*NOTREACHED*/ 385 } 386 387 while (n > 0) { 388 if (0 <= *src && *src < 0x80) { 389 /* Fast path for plain ASCII characters. */ 390 nb = 1; 391 *s = *src; 392 } else if (n > (size_t)_CITRUS_UTF8_MB_CUR_MAX) { 393 /* Enough space to translate in-place. */ 394 nb = _citrus_utf8_ctype_wcrtomb(s, *src, us); 395 if (nb == (size_t)-1) { 396 *pwcs = src; 397 return (nb); 398 } 399 } else { 400 /* 401 * May not be enough space; use temp. buffer. 402 */ 403 nb = _citrus_utf8_ctype_wcrtomb(buf, *src, us); 404 if (nb == (size_t)-1) { 405 *pwcs = src; 406 return (nb); 407 } 408 if (nb > n) 409 /* MB sequence for character won't fit. */ 410 break; 411 memcpy(s, buf, nb); 412 } 413 if (*src == L'\0') { 414 *pwcs = NULL; 415 return (nbytes + nb - 1); 416 } 417 src++; 418 s += nb; 419 n -= nb; 420 nbytes += nb; 421 } 422 *pwcs = src; 423 return (nbytes); 424 } 425