1 /* $NetBSD: citrus_utf1632.c,v 1.11 2010/03/20 18:15:32 tnozaki Exp $ */ 2 3 /*- 4 * Copyright (c)2003 Citrus Project, 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include <sys/cdefs.h> 30 #if defined(LIBC_SCCS) && !defined(lint) 31 __RCSID("$NetBSD: citrus_utf1632.c,v 1.11 2010/03/20 18:15:32 tnozaki Exp $"); 32 #endif /* LIBC_SCCS and not lint */ 33 34 #include <assert.h> 35 #include <errno.h> 36 #include <string.h> 37 #include <stdio.h> 38 #include <stdlib.h> 39 #include <stddef.h> 40 #include <limits.h> 41 #include <wchar.h> 42 #include <sys/types.h> 43 #include <machine/endian.h> 44 45 #include "citrus_namespace.h" 46 #include "citrus_types.h" 47 #include "citrus_module.h" 48 #include "citrus_stdenc.h" 49 #include "citrus_bcs.h" 50 51 #include "citrus_utf1632.h" 52 53 54 /* ---------------------------------------------------------------------- 55 * private stuffs used by templates 56 */ 57 58 typedef struct { 59 u_int8_t ch[4]; 60 int chlen; 61 int current_endian; 62 } _UTF1632State; 63 64 typedef struct { 65 int preffered_endian; 66 unsigned int cur_max; 67 #define _ENDIAN_UNKNOWN 0 68 #define _ENDIAN_BIG 1 69 #define _ENDIAN_LITTLE 2 70 u_int32_t mode; 71 #define _MODE_UTF32 0x00000001U 72 #define _MODE_FORCE_ENDIAN 0x00000002U 73 } _UTF1632EncodingInfo; 74 75 #define _FUNCNAME(m) _citrus_UTF1632_##m 76 #define _ENCODING_INFO _UTF1632EncodingInfo 77 #define _ENCODING_STATE _UTF1632State 78 #define _ENCODING_MB_CUR_MAX(_ei_) ((_ei_)->cur_max) 79 #define _ENCODING_IS_STATE_DEPENDENT 0 80 #define _STATE_NEEDS_EXPLICIT_INIT(_ps_) 0 81 82 83 static __inline void 84 /*ARGSUSED*/ 85 _citrus_UTF1632_init_state(_UTF1632EncodingInfo *ei, _UTF1632State *s) 86 { 87 memset(s, 0, sizeof(*s)); 88 } 89 90 static int 91 _citrus_UTF1632_mbrtowc_priv(_UTF1632EncodingInfo *ei, wchar_t *pwc, 92 const char **s, size_t n, _UTF1632State *psenc, 93 size_t *nresult) 94 { 95 int chlenbak, endian, needlen; 96 wchar_t wc; 97 size_t result; 98 const char *s0; 99 100 _DIAGASSERT(nresult != 0); 101 _DIAGASSERT(ei != NULL); 102 _DIAGASSERT(s != NULL); 103 _DIAGASSERT(psenc != NULL); 104 105 s0 = *s; 106 107 if (s0 == NULL) { 108 _citrus_UTF1632_init_state(ei, psenc); 109 *nresult = 0; /* state independent */ 110 return (0); 111 } 112 113 result = 0; 114 chlenbak = psenc->chlen; 115 116 refetch: 117 if ((ei->mode & _MODE_UTF32) != 0 || chlenbak>=2) 118 needlen = 4; 119 else 120 needlen = 2; 121 122 while (chlenbak < needlen) { 123 if (n==0) 124 goto restart; 125 psenc->ch[chlenbak++] = *s0++; 126 n--; 127 result++; 128 } 129 130 if (psenc->current_endian == _ENDIAN_UNKNOWN) { 131 if ((ei->mode & _MODE_FORCE_ENDIAN) == 0) { 132 /* judge endian marker */ 133 if ((ei->mode & _MODE_UTF32) == 0) { 134 /* UTF16 */ 135 if (psenc->ch[0]==0xFE && psenc->ch[1]==0xFF) { 136 psenc->current_endian = _ENDIAN_BIG; 137 chlenbak = 0; 138 goto refetch; 139 } else if (psenc->ch[0]==0xFF && psenc->ch[1]==0xFE) { 140 psenc->current_endian = _ENDIAN_LITTLE; 141 chlenbak = 0; 142 goto refetch; 143 } 144 } else { 145 /* UTF32 */ 146 if (psenc->ch[0]==0x00 && psenc->ch[1]==0x00 && 147 psenc->ch[2]==0xFE && psenc->ch[3]==0xFF) { 148 psenc->current_endian = _ENDIAN_BIG; 149 chlenbak = 0; 150 goto refetch; 151 } else if (psenc->ch[0]==0xFF && psenc->ch[1]==0xFE && 152 psenc->ch[2]==0x00 && psenc->ch[3]==0x00) { 153 psenc->current_endian = _ENDIAN_LITTLE; 154 chlenbak = 0; 155 goto refetch; 156 } 157 } 158 } 159 psenc->current_endian = ei->preffered_endian; 160 } 161 endian = psenc->current_endian; 162 163 /* get wc */ 164 if ((ei->mode & _MODE_UTF32) == 0) { 165 /* UTF16 */ 166 if (needlen==2) { 167 switch (endian) { 168 case _ENDIAN_LITTLE: 169 wc = (psenc->ch[0] | 170 ((wchar_t)psenc->ch[1] << 8)); 171 break; 172 case _ENDIAN_BIG: 173 wc = (psenc->ch[1] | 174 ((wchar_t)psenc->ch[0] << 8)); 175 break; 176 default: 177 goto ilseq; 178 } 179 if (wc >= 0xD800 && wc <= 0xDBFF) { 180 /* surrogate high */ 181 needlen=4; 182 goto refetch; 183 } 184 } else { 185 /* surrogate low */ 186 wc -= 0xD800; /* wc : surrogate high (see above) */ 187 wc <<= 10; 188 switch (endian) { 189 case _ENDIAN_LITTLE: 190 if (psenc->ch[3]<0xDC || psenc->ch[3]>0xDF) 191 goto ilseq; 192 wc |= psenc->ch[2]; 193 wc |= (wchar_t)(psenc->ch[3] & 3) << 8; 194 break; 195 case _ENDIAN_BIG: 196 if (psenc->ch[2]<0xDC || psenc->ch[2]>0xDF) 197 goto ilseq; 198 wc |= psenc->ch[3]; 199 wc |= (wchar_t)(psenc->ch[2] & 3) << 8; 200 break; 201 default: 202 goto ilseq; 203 } 204 wc += 0x10000; 205 } 206 } else { 207 /* UTF32 */ 208 switch (endian) { 209 case _ENDIAN_LITTLE: 210 wc = (psenc->ch[0] | 211 ((wchar_t)psenc->ch[1] << 8) | 212 ((wchar_t)psenc->ch[2] << 16) | 213 ((wchar_t)psenc->ch[3] << 24)); 214 break; 215 case _ENDIAN_BIG: 216 wc = (psenc->ch[3] | 217 ((wchar_t)psenc->ch[2] << 8) | 218 ((wchar_t)psenc->ch[1] << 16) | 219 ((wchar_t)psenc->ch[0] << 24)); 220 break; 221 default: 222 goto ilseq; 223 } 224 if (wc >= 0xD800 && wc <= 0xDFFF) 225 goto ilseq; 226 } 227 228 229 *pwc = wc; 230 psenc->chlen = 0; 231 *nresult = result; 232 *s = s0; 233 234 return (0); 235 236 ilseq: 237 *nresult = (size_t)-1; 238 psenc->chlen = 0; 239 return (EILSEQ); 240 241 restart: 242 *nresult = (size_t)-2; 243 psenc->chlen = chlenbak; 244 *s = s0; 245 return (0); 246 } 247 248 static int 249 _citrus_UTF1632_wcrtomb_priv(_UTF1632EncodingInfo *ei, char *s, size_t n, 250 wchar_t wc, _UTF1632State *psenc, 251 size_t *nresult) 252 { 253 int ret; 254 wchar_t wc2; 255 static const char _bom[4] = { 256 #if BYTE_ORDER == BIG_ENDIAN 257 0x00, 0x00, 0xFE, 0xFF, 258 #else 259 0xFF, 0xFE, 0x00, 0x00, 260 #endif 261 }; 262 const char *bom = &_bom[0]; 263 size_t cnt; 264 265 _DIAGASSERT(ei != NULL); 266 _DIAGASSERT(nresult != 0); 267 _DIAGASSERT(s != NULL); 268 269 cnt = (size_t)0; 270 if (psenc->current_endian == _ENDIAN_UNKNOWN) { 271 if ((ei->mode & _MODE_FORCE_ENDIAN) == 0) { 272 if (ei->mode & _MODE_UTF32) { 273 cnt = 4; 274 } else { 275 cnt = 2; 276 #if BYTE_ORDER == BIG_ENDIAN 277 bom += 2; 278 #endif 279 } 280 if (n < cnt) 281 goto e2big; 282 memcpy(s, bom, cnt); 283 s += cnt, n -= cnt; 284 } 285 psenc->current_endian = ei->preffered_endian; 286 } 287 288 wc2 = 0; 289 if ((ei->mode & _MODE_UTF32)==0) { 290 /* UTF16 */ 291 if (wc>0xFFFF) { 292 /* surrogate */ 293 if (wc>0x10FFFF) 294 goto ilseq; 295 if (n < 4) 296 goto e2big; 297 cnt += 4; 298 wc -= 0x10000; 299 wc2 = (wc & 0x3FF) | 0xDC00; 300 wc = (wc>>10) | 0xD800; 301 } else { 302 if (n < 2) 303 goto e2big; 304 cnt += 2; 305 } 306 307 surrogate: 308 switch (psenc->current_endian) { 309 case _ENDIAN_BIG: 310 s[1] = wc; 311 s[0] = (wc >>= 8); 312 break; 313 case _ENDIAN_LITTLE: 314 s[0] = wc; 315 s[1] = (wc >>= 8); 316 break; 317 } 318 if (wc2!=0) { 319 wc = wc2; 320 wc2 = 0; 321 s += 2; 322 goto surrogate; 323 } 324 } else { 325 /* UTF32 */ 326 if (wc >= 0xD800 && wc <= 0xDFFF) 327 goto ilseq; 328 if (n < 4) 329 goto e2big; 330 cnt += 4; 331 switch (psenc->current_endian) { 332 case _ENDIAN_BIG: 333 s[3] = wc; 334 s[2] = (wc >>= 8); 335 s[1] = (wc >>= 8); 336 s[0] = (wc >>= 8); 337 break; 338 case _ENDIAN_LITTLE: 339 s[0] = wc; 340 s[1] = (wc >>= 8); 341 s[2] = (wc >>= 8); 342 s[3] = (wc >>= 8); 343 break; 344 } 345 } 346 *nresult = cnt; 347 348 return 0; 349 350 ilseq: 351 *nresult = (size_t)-1; 352 return EILSEQ; 353 e2big: 354 *nresult = (size_t)-1; 355 return E2BIG; 356 } 357 358 static void 359 parse_variable(_UTF1632EncodingInfo * __restrict ei, 360 const void * __restrict var, size_t lenvar) 361 { 362 #define MATCH(x, act) \ 363 do { \ 364 if (lenvar >= (sizeof(#x)-1) && \ 365 _bcs_strncasecmp(p, #x, sizeof(#x)-1) == 0) { \ 366 act; \ 367 lenvar -= sizeof(#x)-1; \ 368 p += sizeof(#x)-1; \ 369 } \ 370 } while (/*CONSTCOND*/0) 371 const char *p; 372 p = var; 373 while (lenvar>0) { 374 switch (*p) { 375 case 'B': 376 case 'b': 377 MATCH(big, ei->preffered_endian = _ENDIAN_BIG); 378 break; 379 case 'L': 380 case 'l': 381 MATCH(little, ei->preffered_endian = _ENDIAN_LITTLE); 382 break; 383 case 'F': 384 case 'f': 385 MATCH(force, ei->mode |= _MODE_FORCE_ENDIAN); 386 break; 387 case 'U': 388 case 'u': 389 MATCH(utf32, ei->mode |= _MODE_UTF32); 390 break; 391 } 392 p++; 393 lenvar--; 394 } 395 } 396 397 static int 398 /*ARGSUSED*/ 399 _citrus_UTF1632_encoding_module_init(_UTF1632EncodingInfo * __restrict ei, 400 const void * __restrict var, 401 size_t lenvar) 402 { 403 _DIAGASSERT(ei != NULL); 404 405 memset((void *)ei, 0, sizeof(*ei)); 406 407 parse_variable(ei, var, lenvar); 408 409 if ((ei->mode&_MODE_UTF32)==0) 410 ei->cur_max = 6; /* endian + surrogate */ 411 else 412 ei->cur_max = 8; /* endian + normal */ 413 414 if (ei->preffered_endian == _ENDIAN_UNKNOWN) { 415 #if BYTE_ORDER == BIG_ENDIAN 416 ei->preffered_endian = _ENDIAN_BIG; 417 #else 418 ei->preffered_endian = _ENDIAN_LITTLE; 419 #endif 420 } 421 422 return (0); 423 } 424 425 static void 426 /*ARGSUSED*/ 427 _citrus_UTF1632_encoding_module_uninit(_UTF1632EncodingInfo *ei) 428 { 429 } 430 431 static __inline int 432 /*ARGSUSED*/ 433 _citrus_UTF1632_stdenc_wctocs(_UTF1632EncodingInfo * __restrict ei, 434 _csid_t * __restrict csid, 435 _index_t * __restrict idx, 436 _wc_t wc) 437 { 438 439 _DIAGASSERT(csid != NULL && idx != NULL); 440 441 *csid = 0; 442 *idx = (_index_t)wc; 443 444 return (0); 445 } 446 447 static __inline int 448 /*ARGSUSED*/ 449 _citrus_UTF1632_stdenc_cstowc(_UTF1632EncodingInfo * __restrict ei, 450 _wc_t * __restrict wc, 451 _csid_t csid, _index_t idx) 452 { 453 454 _DIAGASSERT(wc != NULL); 455 456 if (csid != 0) 457 return (EILSEQ); 458 459 *wc = (_wc_t)idx; 460 461 return (0); 462 } 463 464 static __inline int 465 /*ARGSUSED*/ 466 _citrus_UTF1632_stdenc_get_state_desc_generic(_UTF1632EncodingInfo * __restrict ei, 467 _UTF1632State * __restrict psenc, 468 int * __restrict rstate) 469 { 470 471 if (psenc->chlen == 0) 472 *rstate = _STDENC_SDGEN_INITIAL; 473 else 474 *rstate = _STDENC_SDGEN_INCOMPLETE_CHAR; 475 476 return 0; 477 } 478 479 /* ---------------------------------------------------------------------- 480 * public interface for stdenc 481 */ 482 483 _CITRUS_STDENC_DECLS(UTF1632); 484 _CITRUS_STDENC_DEF_OPS(UTF1632); 485 486 #include "citrus_stdenc_template.h" 487