1 /* 2 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 3 * Copyright 2012 Garrett D'Amore <garrett@damore.org> All rights reserved. 4 * Copyright 2015 John Marino <draco@marino.st> 5 * 6 * This source code is derived from the illumos localedef command, and 7 * provided under BSD-style license terms by Nexenta Systems, Inc. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * LC_CTYPE database generation routines for localedef. 34 */ 35 36 #include <stdio.h> 37 #include <stdlib.h> 38 #include <stddef.h> 39 #include <string.h> 40 #include <sys/types.h> 41 #include <wchar.h> 42 #include <ctype.h> 43 #include <wctype.h> 44 #include <unistd.h> 45 #include "localedef.h" 46 #include "parser.h" 47 #include "runefile.h" 48 #include "avl.h" 49 50 51 #define _ISUPPER _CTYPE_U 52 #define _ISLOWER _CTYPE_L 53 #define _ISDIGIT _CTYPE_D 54 #define _ISXDIGIT _CTYPE_X 55 #define _ISSPACE _CTYPE_S 56 #define _ISBLANK _CTYPE_B 57 #define _ISALPHA _CTYPE_A 58 #define _ISPUNCT _CTYPE_P 59 #define _ISGRAPH _CTYPE_G 60 #define _ISPRINT _CTYPE_R 61 #define _ISCNTRL _CTYPE_C 62 #define _E1 _CTYPE_Q 63 #define _E2 _CTYPE_I 64 #define _E3 0 65 #define _E4 0 66 #define _E5 _CTYPE_T 67 68 static avl_tree_t ctypes; 69 70 static wchar_t last_ctype; 71 72 typedef struct ctype_node { 73 wchar_t wc; 74 int32_t ctype; 75 int32_t toupper; 76 int32_t tolower; 77 avl_node_t avl; 78 } ctype_node_t; 79 80 typedef struct width_node { 81 wchar_t start; 82 wchar_t end; 83 int8_t width; 84 avl_node_t avl; 85 } width_node_t; 86 87 static int 88 ctype_compare(const void *n1, const void *n2) 89 { 90 const ctype_node_t *c1 = n1; 91 const ctype_node_t *c2 = n2; 92 93 return (c1->wc < c2->wc ? -1 : c1->wc > c2->wc ? 1 : 0); 94 } 95 96 void 97 init_ctype(void) 98 { 99 avl_create(&ctypes, ctype_compare, sizeof (ctype_node_t), 100 offsetof(ctype_node_t, avl)); 101 } 102 103 104 static void 105 add_ctype_impl(ctype_node_t *ctn) 106 { 107 switch (last_kw) { 108 case T_ISUPPER: 109 ctn->ctype |= (_ISUPPER | _ISALPHA | _ISGRAPH | _ISPRINT); 110 break; 111 case T_ISLOWER: 112 ctn->ctype |= (_ISLOWER | _ISALPHA | _ISGRAPH | _ISPRINT); 113 break; 114 case T_ISALPHA: 115 ctn->ctype |= (_ISALPHA | _ISGRAPH | _ISPRINT); 116 break; 117 case T_ISDIGIT: 118 ctn->ctype |= (_ISDIGIT | _ISGRAPH | _ISPRINT | _ISXDIGIT); 119 break; 120 case T_ISSPACE: 121 ctn->ctype |= _ISSPACE; 122 break; 123 case T_ISCNTRL: 124 ctn->ctype |= _ISCNTRL; 125 break; 126 case T_ISGRAPH: 127 ctn->ctype |= (_ISGRAPH | _ISPRINT); 128 break; 129 case T_ISPRINT: 130 ctn->ctype |= _ISPRINT; 131 break; 132 case T_ISPUNCT: 133 ctn->ctype |= (_ISPUNCT | _ISGRAPH | _ISPRINT); 134 break; 135 case T_ISXDIGIT: 136 ctn->ctype |= (_ISXDIGIT | _ISPRINT); 137 break; 138 case T_ISBLANK: 139 ctn->ctype |= (_ISBLANK | _ISSPACE); 140 break; 141 case T_ISPHONOGRAM: 142 ctn->ctype |= (_E1 | _ISPRINT | _ISGRAPH); 143 break; 144 case T_ISIDEOGRAM: 145 ctn->ctype |= (_E2 | _ISPRINT | _ISGRAPH); 146 break; 147 case T_ISENGLISH: 148 ctn->ctype |= (_E3 | _ISPRINT | _ISGRAPH); 149 break; 150 case T_ISNUMBER: 151 ctn->ctype |= (_E4 | _ISPRINT | _ISGRAPH); 152 break; 153 case T_ISSPECIAL: 154 ctn->ctype |= (_E5 | _ISPRINT | _ISGRAPH); 155 break; 156 case T_ISALNUM: 157 /* 158 * We can't do anything with this. The character 159 * should already be specified as a digit or alpha. 160 */ 161 break; 162 default: 163 errf("not a valid character class"); 164 } 165 } 166 167 static ctype_node_t * 168 get_ctype(wchar_t wc) 169 { 170 ctype_node_t srch; 171 ctype_node_t *ctn; 172 avl_index_t where; 173 174 srch.wc = wc; 175 if ((ctn = avl_find(&ctypes, &srch, &where)) == NULL) { 176 if ((ctn = calloc(1, sizeof (*ctn))) == NULL) { 177 errf("out of memory"); 178 return (NULL); 179 } 180 ctn->wc = wc; 181 182 avl_insert(&ctypes, ctn, where); 183 } 184 return (ctn); 185 } 186 187 void 188 add_ctype(int val) 189 { 190 ctype_node_t *ctn; 191 192 if ((ctn = get_ctype(val)) == NULL) { 193 INTERR; 194 return; 195 } 196 add_ctype_impl(ctn); 197 last_ctype = ctn->wc; 198 } 199 200 void 201 add_ctype_range(int end) 202 { 203 ctype_node_t *ctn; 204 wchar_t cur; 205 206 if (end < last_ctype) { 207 errf("malformed character range (%u ... %u))", 208 last_ctype, end); 209 return; 210 } 211 for (cur = last_ctype + 1; cur <= end; cur++) { 212 if ((ctn = get_ctype(cur)) == NULL) { 213 INTERR; 214 return; 215 } 216 add_ctype_impl(ctn); 217 } 218 last_ctype = end; 219 220 } 221 222 /* 223 * A word about widths: if the width mask is specified, then libc 224 * unconditionally honors it. Otherwise, it assumes printable 225 * characters have width 1, and non-printable characters have width 226 * -1 (except for NULL which is special with with 0). Hence, we have 227 * no need to inject defaults here -- the "default" unset value of 0 228 * indicates that libc should use its own logic in wcwidth as described. 229 */ 230 void 231 add_width(int wc, int width) 232 { 233 ctype_node_t *ctn; 234 235 if ((ctn = get_ctype(wc)) == NULL) { 236 INTERR; 237 return; 238 } 239 ctn->ctype &= ~(_CTYPE_SWM); 240 switch (width) { 241 case 0: 242 ctn->ctype |= _CTYPE_SW0; 243 break; 244 case 1: 245 ctn->ctype |= _CTYPE_SW1; 246 break; 247 case 2: 248 ctn->ctype |= _CTYPE_SW2; 249 break; 250 case 3: 251 ctn->ctype |= _CTYPE_SW3; 252 break; 253 } 254 } 255 256 void 257 add_width_range(int start, int end, int width) 258 { 259 for (; start <= end; start++) { 260 add_width(start, width); 261 } 262 } 263 264 void 265 add_caseconv(int val, int wc) 266 { 267 ctype_node_t *ctn; 268 269 ctn = get_ctype(val); 270 if (ctn == NULL) { 271 INTERR; 272 return; 273 } 274 275 switch (last_kw) { 276 case T_TOUPPER: 277 ctn->toupper = wc; 278 break; 279 case T_TOLOWER: 280 ctn->tolower = wc; 281 break; 282 default: 283 INTERR; 284 break; 285 } 286 } 287 288 void 289 dump_ctype(void) 290 { 291 FILE *f; 292 _FileRuneLocale rl; 293 ctype_node_t *ctn, *last_ct, *last_lo, *last_up; 294 _FileRuneEntry *ct = NULL; 295 _FileRuneEntry *lo = NULL; 296 _FileRuneEntry *up = NULL; 297 wchar_t wc; 298 299 (void) memset(&rl, 0, sizeof (rl)); 300 last_ct = NULL; 301 last_lo = NULL; 302 last_up = NULL; 303 304 if ((f = open_category()) == NULL) 305 return; 306 307 (void) memcpy(rl.magic, _FILE_RUNE_MAGIC_1, 8); 308 (void) strncpy(rl.encoding, get_wide_encoding(), sizeof (rl.encoding)); 309 310 /* 311 * Initialize the identity map. 312 */ 313 for (wc = 0; (unsigned)wc < _CACHED_RUNES; wc++) { 314 rl.maplower[wc] = wc; 315 rl.mapupper[wc] = wc; 316 } 317 318 for (ctn = avl_first(&ctypes); ctn; ctn = AVL_NEXT(&ctypes, ctn)) { 319 int conflict = 0; 320 321 322 wc = ctn->wc; 323 324 /* 325 * POSIX requires certain portable characters have 326 * certain types. Add them if they are missing. 327 */ 328 if ((wc >= 1) && (wc <= 127)) { 329 if ((wc >= 'A') && (wc <= 'Z')) 330 ctn->ctype |= _ISUPPER; 331 if ((wc >= 'a') && (wc <= 'z')) 332 ctn->ctype |= _ISLOWER; 333 if ((wc >= '0') && (wc <= '9')) 334 ctn->ctype |= _ISDIGIT; 335 if (strchr(" \f\n\r\t\v", (char)wc) != NULL) 336 ctn->ctype |= _ISSPACE; 337 if (strchr("0123456789ABCDEFabcdef", (char)wc) != NULL) 338 ctn->ctype |= _ISXDIGIT; 339 if (strchr(" \t", (char)wc)) 340 ctn->ctype |= _ISBLANK; 341 if (wc == ' ') 342 ctn->ctype |= _ISPRINT; 343 344 /* 345 * Technically these settings are only 346 * required for the C locale. However, it 347 * turns out that because of the historical 348 * version of isprint(), we need them for all 349 * locales as well. Note that these are not 350 * necessarily valid punctation characters in 351 * the current language, but ispunct() needs 352 * to return TRUE for them. 353 */ 354 if (strchr("!\"'#$%&()*+,-./:;<=>?@[\\]^_`{|}~", 355 (char)wc)) 356 ctn->ctype |= _ISPUNCT; 357 } 358 359 /* 360 * POSIX also requires that certain types imply 361 * others. Add any inferred types here. 362 */ 363 if (ctn->ctype & (_ISUPPER |_ISLOWER)) 364 ctn->ctype |= _ISALPHA; 365 if (ctn->ctype & _ISDIGIT) 366 ctn->ctype |= _ISXDIGIT; 367 if (ctn->ctype & _ISBLANK) 368 ctn->ctype |= _ISSPACE; 369 if (ctn->ctype & (_ISALPHA|_ISDIGIT|_ISXDIGIT)) 370 ctn->ctype |= _ISGRAPH; 371 if (ctn->ctype & _ISGRAPH) 372 ctn->ctype |= _ISPRINT; 373 374 /* 375 * Finally, POSIX requires that certain combinations 376 * are invalid. We don't flag this as a fatal error, 377 * but we will warn about. 378 */ 379 if ((ctn->ctype & _ISALPHA) && 380 (ctn->ctype & (_ISPUNCT|_ISDIGIT))) 381 conflict++; 382 if ((ctn->ctype & _ISPUNCT) & 383 (ctn->ctype & (_ISDIGIT|_ISALPHA|_ISXDIGIT))) 384 conflict++; 385 if ((ctn->ctype & _ISSPACE) && (ctn->ctype & _ISGRAPH)) 386 conflict++; 387 if ((ctn->ctype & _ISCNTRL) & _ISPRINT) 388 conflict++; 389 if ((wc == ' ') && (ctn->ctype & (_ISPUNCT|_ISGRAPH))) 390 conflict++; 391 392 if (conflict) { 393 warn("conflicting classes for character 0x%x (%x)", 394 wc, ctn->ctype); 395 } 396 /* 397 * Handle the lower 256 characters using the simple 398 * optimization. Note that if we have not defined the 399 * upper/lower case, then we identity map it. 400 */ 401 if ((unsigned)wc < _CACHED_RUNES) { 402 rl.runetype[wc] = ctn->ctype; 403 if (ctn->tolower) 404 rl.maplower[wc] = ctn->tolower; 405 if (ctn->toupper) 406 rl.mapupper[wc] = ctn->toupper; 407 continue; 408 } 409 410 if ((last_ct != NULL) && (last_ct->ctype == ctn->ctype)) { 411 ct[rl.runetype_ext_nranges-1].max = wc; 412 last_ct = ctn; 413 } else { 414 rl.runetype_ext_nranges++; 415 ct = realloc(ct, 416 sizeof (*ct) * rl.runetype_ext_nranges); 417 ct[rl.runetype_ext_nranges - 1].min = wc; 418 ct[rl.runetype_ext_nranges - 1].max = wc; 419 ct[rl.runetype_ext_nranges - 1].map = ctn->ctype; 420 last_ct = ctn; 421 } 422 if (ctn->tolower == 0) { 423 last_lo = NULL; 424 } else if ((last_lo != NULL) && 425 (last_lo->tolower + 1 == ctn->tolower)) { 426 lo[rl.maplower_ext_nranges-1].max = wc; 427 last_lo = ctn; 428 } else { 429 rl.maplower_ext_nranges++; 430 lo = realloc(lo, 431 sizeof (*lo) * rl.maplower_ext_nranges); 432 lo[rl.maplower_ext_nranges - 1].min = wc; 433 lo[rl.maplower_ext_nranges - 1].max = wc; 434 lo[rl.maplower_ext_nranges - 1].map = ctn->tolower; 435 last_lo = ctn; 436 } 437 438 if (ctn->toupper == 0) { 439 last_up = NULL; 440 } else if ((last_up != NULL) && 441 (last_up->toupper + 1 == ctn->toupper)) { 442 up[rl.mapupper_ext_nranges-1].max = wc; 443 last_up = ctn; 444 } else { 445 rl.mapupper_ext_nranges++; 446 up = realloc(up, 447 sizeof (*up) * rl.mapupper_ext_nranges); 448 up[rl.mapupper_ext_nranges - 1].min = wc; 449 up[rl.mapupper_ext_nranges - 1].max = wc; 450 up[rl.mapupper_ext_nranges - 1].map = ctn->toupper; 451 last_up = ctn; 452 } 453 } 454 455 if ((wr_category(&rl, sizeof (rl), f) < 0) || 456 (wr_category(ct, sizeof (*ct) * rl.runetype_ext_nranges, f) < 0) || 457 (wr_category(lo, sizeof (*lo) * rl.maplower_ext_nranges, f) < 0) || 458 (wr_category(up, sizeof (*up) * rl.mapupper_ext_nranges, f) < 0)) { 459 return; 460 } 461 462 close_category(f); 463 } 464