1 /* $OpenBSD: citrus_utf8.c,v 1.18 2016/09/07 17:15:06 schwarze Exp $ */ 2 3 /*- 4 * Copyright (c) 2002-2004 Tim J. Robbins 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include <sys/types.h> 30 31 #include <errno.h> 32 #include <string.h> 33 #include <wchar.h> 34 35 #include "citrus_ctype.h" 36 37 struct _utf8_state { 38 wchar_t ch; 39 int want; 40 wchar_t lbound; 41 }; 42 43 size_t 44 _citrus_utf8_ctype_mbrtowc(wchar_t * __restrict pwc, 45 const char * __restrict s, size_t n, mbstate_t * __restrict ps) 46 { 47 struct _utf8_state *us; 48 int ch, i, mask, want; 49 wchar_t lbound, wch; 50 51 us = (struct _utf8_state *)ps; 52 53 if (us->want < 0 || us->want > _CITRUS_UTF8_MB_CUR_MAX) { 54 errno = EINVAL; 55 return -1; 56 } 57 58 if (s == NULL) { 59 s = ""; 60 n = 1; 61 pwc = NULL; 62 } 63 64 if (n == 0) 65 return -2; 66 67 if (us->want == 0 && ((ch = (unsigned char)*s) & ~0x7f) == 0) { 68 /* Fast path for plain ASCII characters. */ 69 if (pwc != NULL) 70 *pwc = ch; 71 return ch != '\0' ? 1 : 0; 72 } 73 74 if (us->want == 0) { 75 /* 76 * Determine the number of bytes that make up this character 77 * from the first byte, and a mask that extracts the 78 * interesting bits of the first byte. We already know 79 * the character is at least two bytes long. 80 * 81 * We also specify a lower bound for the character code to 82 * detect redundant, non-"shortest form" encodings. For 83 * example, the sequence C0 80 is _not_ a legal representation 84 * of the null character. This enforces a 1-to-1 mapping 85 * between character codes and their multibyte representations. 86 */ 87 ch = (unsigned char)*s; 88 if ((ch & 0x80) == 0) { 89 mask = 0x7f; 90 want = 1; 91 lbound = 0; 92 } else if ((ch & 0xe0) == 0xc0) { 93 mask = 0x1f; 94 want = 2; 95 lbound = 0x80; 96 } else if ((ch & 0xf0) == 0xe0) { 97 mask = 0x0f; 98 want = 3; 99 lbound = 0x800; 100 } else if ((ch & 0xf8) == 0xf0) { 101 mask = 0x07; 102 want = 4; 103 lbound = 0x10000; 104 } else { 105 /* 106 * Malformed input; input is not UTF-8. 107 * See RFC 3629. 108 */ 109 errno = EILSEQ; 110 return -1; 111 } 112 } else { 113 want = us->want; 114 lbound = us->lbound; 115 } 116 117 /* 118 * Decode the byte sequence representing the character in chunks 119 * of 6 bits, most significant first. 120 */ 121 if (us->want == 0) 122 wch = (unsigned char)*s++ & mask; 123 else 124 wch = us->ch; 125 for (i = (us->want == 0) ? 1 : 0; i < want && (size_t)i < n; i++) { 126 if ((*s & 0xc0) != 0x80) { 127 /* 128 * Malformed input; bad byte in the middle 129 * of a character. 130 */ 131 errno = EILSEQ; 132 return -1; 133 } 134 wch <<= 6; 135 wch |= *s++ & 0x3f; 136 } 137 if (i < want) { 138 /* Incomplete multibyte sequence. */ 139 us->want = want - i; 140 us->lbound = lbound; 141 us->ch = wch; 142 return -2; 143 } 144 if (wch < lbound) { 145 /* 146 * Malformed input; redundant encoding. 147 */ 148 errno = EILSEQ; 149 return -1; 150 } 151 if (wch >= 0xd800 && wch <= 0xdfff) { 152 /* 153 * Malformed input; invalid code points. 154 */ 155 errno = EILSEQ; 156 return -1; 157 } 158 if (wch > 0x10ffff) { 159 /* 160 * Malformed input; invalid code points. 161 */ 162 errno = EILSEQ; 163 return -1; 164 } 165 if (pwc != NULL) 166 *pwc = wch; 167 us->want = 0; 168 return wch == L'\0' ? 0 : want; 169 } 170 171 int 172 _citrus_utf8_ctype_mbsinit(const mbstate_t * __restrict ps) 173 { 174 return ((const struct _utf8_state *)ps)->want == 0; 175 } 176 177 size_t 178 _citrus_utf8_ctype_mbsnrtowcs(wchar_t * __restrict dst, 179 const char ** __restrict src, size_t nmc, size_t len, 180 mbstate_t * __restrict ps) 181 { 182 struct _utf8_state *us; 183 size_t i, o, r; 184 185 us = (struct _utf8_state *)ps; 186 187 if (dst == NULL) { 188 /* 189 * The fast path in the loop below is not safe if an ASCII 190 * character appears as anything but the first byte of a 191 * multibyte sequence. Check now to avoid doing it in the loop. 192 */ 193 if (nmc > 0 && us->want > 0 && (unsigned char)(*src)[0] < 0x80) { 194 errno = EILSEQ; 195 return -1; 196 } 197 for (i = o = 0; i < nmc; i += r, o++) { 198 if ((unsigned char)(*src)[i] < 0x80) { 199 /* Fast path for plain ASCII characters. */ 200 if ((*src)[i] == '\0') 201 return o; 202 r = 1; 203 } else { 204 r = _citrus_utf8_ctype_mbrtowc(NULL, *src + i, 205 nmc - i, ps); 206 if (r == (size_t)-1) 207 return r; 208 if (r == (size_t)-2) 209 return o; 210 if (r == 0) 211 return o; 212 } 213 } 214 return o; 215 } 216 217 /* 218 * The fast path in the loop below is not safe if an ASCII 219 * character appears as anything but the first byte of a 220 * multibyte sequence. Check now to avoid doing it in the loop. 221 */ 222 if (len > 0 && nmc > 0 && us->want > 0 && 223 (unsigned char)(*src)[0] < 0x80) { 224 errno = EILSEQ; 225 return -1; 226 } 227 for (i = o = 0; i < nmc && o < len; i += r, o++) { 228 if ((unsigned char)(*src)[i] < 0x80) { 229 /* Fast path for plain ASCII characters. */ 230 dst[o] = (wchar_t)(unsigned char)(*src)[i]; 231 if ((*src)[i] == '\0') { 232 *src = NULL; 233 return o; 234 } 235 r = 1; 236 } else { 237 r = _citrus_utf8_ctype_mbrtowc(dst + o, *src + i, 238 nmc - i, ps); 239 if (r == (size_t)-1) { 240 *src += i; 241 return r; 242 } 243 if (r == (size_t)-2) { 244 *src += nmc; 245 return o; 246 } 247 if (r == 0) { 248 *src = NULL; 249 return o; 250 } 251 } 252 } 253 *src += i; 254 return o; 255 } 256 257 size_t 258 _citrus_utf8_ctype_wcrtomb(char * __restrict s, wchar_t wc, 259 mbstate_t * __restrict ps) 260 { 261 struct _utf8_state *us; 262 unsigned char lead; 263 int i, len; 264 265 us = (struct _utf8_state *)ps; 266 267 if (us->want != 0) { 268 errno = EINVAL; 269 return -1; 270 } 271 272 if (s == NULL) 273 return 1; 274 275 if (wc < 0 || (wc > 0xd7ff && wc < 0xe000) || wc > 0x10ffff) { 276 errno = EILSEQ; 277 return -1; 278 } 279 280 /* 281 * Determine the number of bytes needed to represent this character. 282 * We always output the shortest sequence possible. Also specify the 283 * first few bits of the first byte, which contains the information 284 * about the sequence length. 285 */ 286 if (wc <= 0x7f) { 287 /* Fast path for plain ASCII characters. */ 288 *s = (char)wc; 289 return 1; 290 } else if (wc <= 0x7ff) { 291 lead = 0xc0; 292 len = 2; 293 } else if (wc <= 0xffff) { 294 lead = 0xe0; 295 len = 3; 296 } else { 297 lead = 0xf0; 298 len = 4; 299 } 300 301 /* 302 * Output the bytes representing the character in chunks 303 * of 6 bits, least significant last. The first byte is 304 * a special case because it contains the sequence length 305 * information. 306 */ 307 for (i = len - 1; i > 0; i--) { 308 s[i] = (wc & 0x3f) | 0x80; 309 wc >>= 6; 310 } 311 *s = (wc & 0xff) | lead; 312 313 return len; 314 } 315 316 size_t 317 _citrus_utf8_ctype_wcsnrtombs(char * __restrict dst, 318 const wchar_t ** __restrict src, size_t nwc, size_t len, 319 mbstate_t * __restrict ps) 320 { 321 struct _utf8_state *us; 322 char buf[_CITRUS_UTF8_MB_CUR_MAX]; 323 size_t i, o, r; 324 325 us = (struct _utf8_state *)ps; 326 327 if (us->want != 0) { 328 errno = EINVAL; 329 return -1; 330 } 331 332 if (dst == NULL) { 333 for (i = o = 0; i < nwc; i++, o += r) { 334 wchar_t wc = (*src)[i]; 335 if (wc >= 0 && wc < 0x80) { 336 /* Fast path for plain ASCII characters. */ 337 if (wc == 0) 338 return o; 339 r = 1; 340 } else { 341 r = _citrus_utf8_ctype_wcrtomb(buf, wc, ps); 342 if (r == (size_t)-1) 343 return r; 344 } 345 } 346 return o; 347 } 348 349 for (i = o = 0; i < nwc && o < len; i++, o += r) { 350 wchar_t wc = (*src)[i]; 351 if (wc >= 0 && wc < 0x80) { 352 /* Fast path for plain ASCII characters. */ 353 dst[o] = (wchar_t)wc; 354 if (wc == 0) { 355 *src = NULL; 356 return o; 357 } 358 r = 1; 359 } else if (len - o >= _CITRUS_UTF8_MB_CUR_MAX) { 360 /* Enough space to translate in-place. */ 361 r = _citrus_utf8_ctype_wcrtomb(dst + o, wc, ps); 362 if (r == (size_t)-1) { 363 *src += i; 364 return r; 365 } 366 } else { 367 /* May not be enough space; use temp buffer. */ 368 r = _citrus_utf8_ctype_wcrtomb(buf, wc, ps); 369 if (r == (size_t)-1) { 370 *src += i; 371 return r; 372 } 373 if (r > len - o) 374 break; 375 memcpy(dst + o, buf, r); 376 } 377 } 378 *src += i; 379 return o; 380 } 381