1 /* 2 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 3 * Copyright 2012 Garrett D'Amore <garrett@damore.org> All rights reserved. 4 * Copyright 2015 John Marino <draco@marino.st> 5 * 6 * This source code is derived from the illumos localedef command, and 7 * provided under BSD-style license terms by Nexenta Systems, Inc. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * LC_CTYPE database generation routines for localedef. 34 */ 35 36 #include <stdio.h> 37 #include <stdlib.h> 38 #include <stddef.h> 39 #include <string.h> 40 #include <sys/types.h> 41 #include <wchar.h> 42 #include <ctype.h> 43 #include <wctype.h> 44 #include <unistd.h> 45 #include "localedef.h" 46 #include "parser.h" 47 #include "runefile.h" 48 #include "avl.h" 49 50 /* Needed for bootstrapping, _CTYPE_N not available before 1 Sep 2015 */ 51 #ifndef _CTYPE_N 52 #define _CTYPE_N 0x00400000L 53 #endif 54 55 #define _ISUPPER _CTYPE_U 56 #define _ISLOWER _CTYPE_L 57 #define _ISDIGIT _CTYPE_D 58 #define _ISXDIGIT _CTYPE_X 59 #define _ISSPACE _CTYPE_S 60 #define _ISBLANK _CTYPE_B 61 #define _ISALPHA _CTYPE_A 62 #define _ISPUNCT _CTYPE_P 63 #define _ISGRAPH _CTYPE_G 64 #define _ISPRINT _CTYPE_R 65 #define _ISCNTRL _CTYPE_C 66 #define _E1 _CTYPE_Q 67 #define _E2 _CTYPE_I 68 #define _E3 0 69 #define _E4 _CTYPE_N 70 #define _E5 _CTYPE_T 71 72 static avl_tree_t ctypes; 73 74 static wchar_t last_ctype; 75 76 typedef struct ctype_node { 77 wchar_t wc; 78 int32_t ctype; 79 int32_t toupper; 80 int32_t tolower; 81 avl_node_t avl; 82 } ctype_node_t; 83 84 typedef struct width_node { 85 wchar_t start; 86 wchar_t end; 87 int8_t width; 88 avl_node_t avl; 89 } width_node_t; 90 91 static int 92 ctype_compare(const void *n1, const void *n2) 93 { 94 const ctype_node_t *c1 = n1; 95 const ctype_node_t *c2 = n2; 96 97 return (c1->wc < c2->wc ? -1 : c1->wc > c2->wc ? 1 : 0); 98 } 99 100 void 101 init_ctype(void) 102 { 103 avl_create(&ctypes, ctype_compare, sizeof (ctype_node_t), 104 offsetof(ctype_node_t, avl)); 105 } 106 107 108 static void 109 add_ctype_impl(ctype_node_t *ctn) 110 { 111 switch (last_kw) { 112 case T_ISUPPER: 113 ctn->ctype |= (_ISUPPER | _ISALPHA | _ISGRAPH | _ISPRINT); 114 break; 115 case T_ISLOWER: 116 ctn->ctype |= (_ISLOWER | _ISALPHA | _ISGRAPH | _ISPRINT); 117 break; 118 case T_ISALPHA: 119 ctn->ctype |= (_ISALPHA | _ISGRAPH | _ISPRINT); 120 break; 121 case T_ISDIGIT: 122 ctn->ctype |= (_ISDIGIT | _ISGRAPH | _ISPRINT | _ISXDIGIT | _E4); 123 break; 124 case T_ISSPACE: 125 ctn->ctype |= _ISSPACE; 126 break; 127 case T_ISCNTRL: 128 ctn->ctype |= _ISCNTRL; 129 break; 130 case T_ISGRAPH: 131 ctn->ctype |= (_ISGRAPH | _ISPRINT); 132 break; 133 case T_ISPRINT: 134 ctn->ctype |= _ISPRINT; 135 break; 136 case T_ISPUNCT: 137 ctn->ctype |= (_ISPUNCT | _ISGRAPH | _ISPRINT); 138 break; 139 case T_ISXDIGIT: 140 ctn->ctype |= (_ISXDIGIT | _ISPRINT); 141 break; 142 case T_ISBLANK: 143 ctn->ctype |= (_ISBLANK | _ISSPACE); 144 break; 145 case T_ISPHONOGRAM: 146 ctn->ctype |= (_E1 | _ISPRINT | _ISGRAPH); 147 break; 148 case T_ISIDEOGRAM: 149 ctn->ctype |= (_E2 | _ISPRINT | _ISGRAPH); 150 break; 151 case T_ISENGLISH: 152 ctn->ctype |= (_E3 | _ISPRINT | _ISGRAPH); 153 break; 154 case T_ISNUMBER: 155 ctn->ctype |= (_E4 | _ISPRINT | _ISGRAPH); 156 break; 157 case T_ISSPECIAL: 158 ctn->ctype |= (_E5 | _ISPRINT | _ISGRAPH); 159 break; 160 case T_ISALNUM: 161 /* 162 * We can't do anything with this. The character 163 * should already be specified as a digit or alpha. 164 */ 165 break; 166 default: 167 errf("not a valid character class"); 168 } 169 } 170 171 static ctype_node_t * 172 get_ctype(wchar_t wc) 173 { 174 ctype_node_t srch; 175 ctype_node_t *ctn; 176 avl_index_t where; 177 178 srch.wc = wc; 179 if ((ctn = avl_find(&ctypes, &srch, &where)) == NULL) { 180 if ((ctn = calloc(1, sizeof (*ctn))) == NULL) { 181 errf("out of memory"); 182 return (NULL); 183 } 184 ctn->wc = wc; 185 186 avl_insert(&ctypes, ctn, where); 187 } 188 return (ctn); 189 } 190 191 void 192 add_ctype(int val) 193 { 194 ctype_node_t *ctn; 195 196 if ((ctn = get_ctype(val)) == NULL) { 197 INTERR; 198 return; 199 } 200 add_ctype_impl(ctn); 201 last_ctype = ctn->wc; 202 } 203 204 void 205 add_ctype_range(int end) 206 { 207 ctype_node_t *ctn; 208 wchar_t cur; 209 210 if (end < last_ctype) { 211 errf("malformed character range (%u ... %u))", 212 last_ctype, end); 213 return; 214 } 215 for (cur = last_ctype + 1; cur <= end; cur++) { 216 if ((ctn = get_ctype(cur)) == NULL) { 217 INTERR; 218 return; 219 } 220 add_ctype_impl(ctn); 221 } 222 last_ctype = end; 223 224 } 225 226 /* 227 * A word about widths: if the width mask is specified, then libc 228 * unconditionally honors it. Otherwise, it assumes printable 229 * characters have width 1, and non-printable characters have width 230 * -1 (except for NULL which is special with with 0). Hence, we have 231 * no need to inject defaults here -- the "default" unset value of 0 232 * indicates that libc should use its own logic in wcwidth as described. 233 */ 234 void 235 add_width(int wc, int width) 236 { 237 ctype_node_t *ctn; 238 239 if ((ctn = get_ctype(wc)) == NULL) { 240 INTERR; 241 return; 242 } 243 ctn->ctype &= ~(_CTYPE_SWM); 244 switch (width) { 245 case 0: 246 ctn->ctype |= _CTYPE_SW0; 247 break; 248 case 1: 249 ctn->ctype |= _CTYPE_SW1; 250 break; 251 case 2: 252 ctn->ctype |= _CTYPE_SW2; 253 break; 254 case 3: 255 ctn->ctype |= _CTYPE_SW3; 256 break; 257 } 258 } 259 260 void 261 add_width_range(int start, int end, int width) 262 { 263 for (; start <= end; start++) { 264 add_width(start, width); 265 } 266 } 267 268 void 269 add_caseconv(int val, int wc) 270 { 271 ctype_node_t *ctn; 272 273 ctn = get_ctype(val); 274 if (ctn == NULL) { 275 INTERR; 276 return; 277 } 278 279 switch (last_kw) { 280 case T_TOUPPER: 281 ctn->toupper = wc; 282 break; 283 case T_TOLOWER: 284 ctn->tolower = wc; 285 break; 286 default: 287 INTERR; 288 break; 289 } 290 } 291 292 void 293 dump_ctype(void) 294 { 295 FILE *f; 296 _FileRuneLocale rl; 297 ctype_node_t *ctn, *last_ct, *last_lo, *last_up; 298 _FileRuneEntry *ct = NULL; 299 _FileRuneEntry *lo = NULL; 300 _FileRuneEntry *up = NULL; 301 wchar_t wc; 302 303 (void) memset(&rl, 0, sizeof (rl)); 304 last_ct = NULL; 305 last_lo = NULL; 306 last_up = NULL; 307 308 if ((f = open_category()) == NULL) 309 return; 310 311 (void) memcpy(rl.magic, _FILE_RUNE_MAGIC_1, 8); 312 (void) strncpy(rl.encoding, get_wide_encoding(), sizeof (rl.encoding)); 313 314 /* 315 * Initialize the identity map. 316 */ 317 for (wc = 0; (unsigned)wc < _CACHED_RUNES; wc++) { 318 rl.maplower[wc] = wc; 319 rl.mapupper[wc] = wc; 320 } 321 322 for (ctn = avl_first(&ctypes); ctn; ctn = AVL_NEXT(&ctypes, ctn)) { 323 int conflict = 0; 324 325 326 wc = ctn->wc; 327 328 /* 329 * POSIX requires certain portable characters have 330 * certain types. Add them if they are missing. 331 */ 332 if ((wc >= 1) && (wc <= 127)) { 333 if ((wc >= 'A') && (wc <= 'Z')) 334 ctn->ctype |= _ISUPPER; 335 if ((wc >= 'a') && (wc <= 'z')) 336 ctn->ctype |= _ISLOWER; 337 if ((wc >= '0') && (wc <= '9')) 338 ctn->ctype |= _ISDIGIT; 339 if (strchr(" \f\n\r\t\v", (char)wc) != NULL) 340 ctn->ctype |= _ISSPACE; 341 if (strchr("0123456789ABCDEFabcdef", (char)wc) != NULL) 342 ctn->ctype |= _ISXDIGIT; 343 if (strchr(" \t", (char)wc)) 344 ctn->ctype |= _ISBLANK; 345 if (wc == ' ') 346 ctn->ctype |= _ISPRINT; 347 348 /* 349 * Technically these settings are only 350 * required for the C locale. However, it 351 * turns out that because of the historical 352 * version of isprint(), we need them for all 353 * locales as well. Note that these are not 354 * necessarily valid punctation characters in 355 * the current language, but ispunct() needs 356 * to return TRUE for them. 357 */ 358 if (strchr("!\"'#$%&()*+,-./:;<=>?@[\\]^_`{|}~", 359 (char)wc)) 360 ctn->ctype |= _ISPUNCT; 361 } 362 363 /* 364 * POSIX also requires that certain types imply 365 * others. Add any inferred types here. 366 */ 367 if (ctn->ctype & (_ISUPPER |_ISLOWER)) 368 ctn->ctype |= _ISALPHA; 369 if (ctn->ctype & _ISDIGIT) 370 ctn->ctype |= _ISXDIGIT; 371 if (ctn->ctype & _ISBLANK) 372 ctn->ctype |= _ISSPACE; 373 if (ctn->ctype & (_ISALPHA|_ISDIGIT|_ISXDIGIT)) 374 ctn->ctype |= _ISGRAPH; 375 if (ctn->ctype & _ISGRAPH) 376 ctn->ctype |= _ISPRINT; 377 378 /* 379 * Finally, POSIX requires that certain combinations 380 * are invalid. We don't flag this as a fatal error, 381 * but we will warn about. 382 */ 383 if ((ctn->ctype & _ISALPHA) && 384 (ctn->ctype & (_ISPUNCT|_ISDIGIT))) 385 conflict++; 386 if ((ctn->ctype & _ISPUNCT) & 387 (ctn->ctype & (_ISDIGIT|_ISALPHA|_ISXDIGIT))) 388 conflict++; 389 if ((ctn->ctype & _ISSPACE) && (ctn->ctype & _ISGRAPH)) 390 conflict++; 391 if ((ctn->ctype & _ISCNTRL) & _ISPRINT) 392 conflict++; 393 if ((wc == ' ') && (ctn->ctype & (_ISPUNCT|_ISGRAPH))) 394 conflict++; 395 396 if (conflict) { 397 warn("conflicting classes for character 0x%x (%x)", 398 wc, ctn->ctype); 399 } 400 /* 401 * Handle the lower 256 characters using the simple 402 * optimization. Note that if we have not defined the 403 * upper/lower case, then we identity map it. 404 */ 405 if ((unsigned)wc < _CACHED_RUNES) { 406 rl.runetype[wc] = ctn->ctype; 407 if (ctn->tolower) 408 rl.maplower[wc] = ctn->tolower; 409 if (ctn->toupper) 410 rl.mapupper[wc] = ctn->toupper; 411 continue; 412 } 413 414 if ((last_ct != NULL) && (last_ct->ctype == ctn->ctype)) { 415 ct[rl.runetype_ext_nranges-1].max = wc; 416 last_ct = ctn; 417 } else { 418 rl.runetype_ext_nranges++; 419 ct = realloc(ct, 420 sizeof (*ct) * rl.runetype_ext_nranges); 421 ct[rl.runetype_ext_nranges - 1].min = wc; 422 ct[rl.runetype_ext_nranges - 1].max = wc; 423 ct[rl.runetype_ext_nranges - 1].map = ctn->ctype; 424 last_ct = ctn; 425 } 426 if (ctn->tolower == 0) { 427 last_lo = NULL; 428 } else if ((last_lo != NULL) && 429 (last_lo->tolower + 1 == ctn->tolower)) { 430 lo[rl.maplower_ext_nranges-1].max = wc; 431 last_lo = ctn; 432 } else { 433 rl.maplower_ext_nranges++; 434 lo = realloc(lo, 435 sizeof (*lo) * rl.maplower_ext_nranges); 436 lo[rl.maplower_ext_nranges - 1].min = wc; 437 lo[rl.maplower_ext_nranges - 1].max = wc; 438 lo[rl.maplower_ext_nranges - 1].map = ctn->tolower; 439 last_lo = ctn; 440 } 441 442 if (ctn->toupper == 0) { 443 last_up = NULL; 444 } else if ((last_up != NULL) && 445 (last_up->toupper + 1 == ctn->toupper)) { 446 up[rl.mapupper_ext_nranges-1].max = wc; 447 last_up = ctn; 448 } else { 449 rl.mapupper_ext_nranges++; 450 up = realloc(up, 451 sizeof (*up) * rl.mapupper_ext_nranges); 452 up[rl.mapupper_ext_nranges - 1].min = wc; 453 up[rl.mapupper_ext_nranges - 1].max = wc; 454 up[rl.mapupper_ext_nranges - 1].map = ctn->toupper; 455 last_up = ctn; 456 } 457 } 458 459 if ((wr_category(&rl, sizeof (rl), f) < 0) || 460 (wr_category(ct, sizeof (*ct) * rl.runetype_ext_nranges, f) < 0) || 461 (wr_category(lo, sizeof (*lo) * rl.maplower_ext_nranges, f) < 0) || 462 (wr_category(up, sizeof (*up) * rl.mapupper_ext_nranges, f) < 0)) { 463 return; 464 } 465 466 close_category(f); 467 } 468