1 /* 2 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 3 * Copyright 2012 Garrett D'Amore <garrett@damore.org> All rights reserved. 4 * Copyright 2015 John Marino <draco@marino.st> 5 * 6 * This source code is derived from the illumos localedef command, and 7 * provided under BSD-style license terms by Nexenta Systems, Inc. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * LC_CTYPE database generation routines for localedef. 34 */ 35 36 #include <sys/tree.h> 37 38 #include <stdio.h> 39 #include <stdlib.h> 40 #include <stddef.h> 41 #include <string.h> 42 #include <sys/types.h> 43 #include <wchar.h> 44 #include <ctype.h> 45 #include <wctype.h> 46 #include <unistd.h> 47 #include "localedef.h" 48 #include "parser.h" 49 #include "runefile.h" 50 51 52 /* Needed for bootstrapping, _CTYPE_N not available before 1 Sep 2015 */ 53 #ifndef _CTYPE_N 54 #define _CTYPE_N 0x00400000L 55 #endif 56 57 #define _ISUPPER _CTYPE_U 58 #define _ISLOWER _CTYPE_L 59 #define _ISDIGIT _CTYPE_D 60 #define _ISXDIGIT _CTYPE_X 61 #define _ISSPACE _CTYPE_S 62 #define _ISBLANK _CTYPE_B 63 #define _ISALPHA _CTYPE_A 64 #define _ISPUNCT _CTYPE_P 65 #define _ISGRAPH _CTYPE_G 66 #define _ISPRINT _CTYPE_R 67 #define _ISCNTRL _CTYPE_C 68 #define _E1 _CTYPE_Q 69 #define _E2 _CTYPE_I 70 #define _E3 0 71 #define _E4 _CTYPE_N 72 #define _E5 _CTYPE_T 73 74 static wchar_t last_ctype; 75 static int ctype_compare(const void *n1, const void *n2); 76 77 typedef struct ctype_node { 78 wchar_t wc; 79 int32_t ctype; 80 int32_t toupper; 81 int32_t tolower; 82 RB_ENTRY(ctype_node) entry; 83 } ctype_node_t; 84 85 static RB_HEAD(ctypes, ctype_node) ctypes; 86 RB_PROTOTYPE_STATIC(ctypes, ctype_node, entry, ctype_compare); 87 RB_GENERATE(ctypes, ctype_node, entry, ctype_compare); 88 89 static int 90 ctype_compare(const void *n1, const void *n2) 91 { 92 const ctype_node_t *c1 = n1; 93 const ctype_node_t *c2 = n2; 94 95 return (c1->wc < c2->wc ? -1 : c1->wc > c2->wc ? 1 : 0); 96 } 97 98 void 99 init_ctype(void) 100 { 101 RB_INIT(&ctypes); 102 } 103 104 105 static void 106 add_ctype_impl(ctype_node_t *ctn) 107 { 108 switch (last_kw) { 109 case T_ISUPPER: 110 ctn->ctype |= (_ISUPPER | _ISALPHA | _ISGRAPH | _ISPRINT); 111 break; 112 case T_ISLOWER: 113 ctn->ctype |= (_ISLOWER | _ISALPHA | _ISGRAPH | _ISPRINT); 114 break; 115 case T_ISALPHA: 116 ctn->ctype |= (_ISALPHA | _ISGRAPH | _ISPRINT); 117 break; 118 case T_ISDIGIT: 119 ctn->ctype |= (_ISDIGIT | _ISGRAPH | _ISPRINT | _ISXDIGIT | _E4); 120 break; 121 case T_ISSPACE: 122 ctn->ctype |= _ISSPACE; 123 break; 124 case T_ISCNTRL: 125 ctn->ctype |= _ISCNTRL; 126 break; 127 case T_ISGRAPH: 128 ctn->ctype |= (_ISGRAPH | _ISPRINT); 129 break; 130 case T_ISPRINT: 131 ctn->ctype |= _ISPRINT; 132 break; 133 case T_ISPUNCT: 134 ctn->ctype |= (_ISPUNCT | _ISGRAPH | _ISPRINT); 135 break; 136 case T_ISXDIGIT: 137 ctn->ctype |= (_ISXDIGIT | _ISPRINT); 138 break; 139 case T_ISBLANK: 140 ctn->ctype |= (_ISBLANK | _ISSPACE); 141 break; 142 case T_ISPHONOGRAM: 143 ctn->ctype |= (_E1 | _ISPRINT | _ISGRAPH); 144 break; 145 case T_ISIDEOGRAM: 146 ctn->ctype |= (_E2 | _ISPRINT | _ISGRAPH); 147 break; 148 case T_ISENGLISH: 149 ctn->ctype |= (_E3 | _ISPRINT | _ISGRAPH); 150 break; 151 case T_ISNUMBER: 152 ctn->ctype |= (_E4 | _ISPRINT | _ISGRAPH); 153 break; 154 case T_ISSPECIAL: 155 ctn->ctype |= (_E5 | _ISPRINT | _ISGRAPH); 156 break; 157 case T_ISALNUM: 158 /* 159 * We can't do anything with this. The character 160 * should already be specified as a digit or alpha. 161 */ 162 break; 163 default: 164 errf("not a valid character class"); 165 } 166 } 167 168 static ctype_node_t * 169 get_ctype(wchar_t wc) 170 { 171 ctype_node_t srch; 172 ctype_node_t *ctn; 173 174 srch.wc = wc; 175 if ((ctn = RB_FIND(ctypes, &ctypes, &srch)) == NULL) { 176 if ((ctn = calloc(1, sizeof (*ctn))) == NULL) { 177 errf("out of memory"); 178 return (NULL); 179 } 180 ctn->wc = wc; 181 182 RB_INSERT(ctypes, &ctypes, ctn); 183 } 184 return (ctn); 185 } 186 187 void 188 add_ctype(int val) 189 { 190 ctype_node_t *ctn; 191 192 if ((ctn = get_ctype(val)) == NULL) { 193 INTERR; 194 return; 195 } 196 add_ctype_impl(ctn); 197 last_ctype = ctn->wc; 198 } 199 200 void 201 add_ctype_range(wchar_t end) 202 { 203 ctype_node_t *ctn; 204 wchar_t cur; 205 206 if (end < last_ctype) { 207 errf("malformed character range (%u ... %u))", 208 last_ctype, end); 209 return; 210 } 211 for (cur = last_ctype + 1; cur <= end; cur++) { 212 if ((ctn = get_ctype(cur)) == NULL) { 213 INTERR; 214 return; 215 } 216 add_ctype_impl(ctn); 217 } 218 last_ctype = end; 219 220 } 221 222 /* 223 * A word about widths: if the width mask is specified, then libc 224 * unconditionally honors it. Otherwise, it assumes printable 225 * characters have width 1, and non-printable characters have width 226 * -1 (except for NULL which is special with with 0). Hence, we have 227 * no need to inject defaults here -- the "default" unset value of 0 228 * indicates that libc should use its own logic in wcwidth as described. 229 */ 230 void 231 add_width(int wc, int width) 232 { 233 ctype_node_t *ctn; 234 235 if ((ctn = get_ctype(wc)) == NULL) { 236 INTERR; 237 return; 238 } 239 ctn->ctype &= ~(_CTYPE_SWM); 240 switch (width) { 241 case 0: 242 ctn->ctype |= _CTYPE_SW0; 243 break; 244 case 1: 245 ctn->ctype |= _CTYPE_SW1; 246 break; 247 case 2: 248 ctn->ctype |= _CTYPE_SW2; 249 break; 250 case 3: 251 ctn->ctype |= _CTYPE_SW3; 252 break; 253 } 254 } 255 256 void 257 add_width_range(int start, int end, int width) 258 { 259 for (; start <= end; start++) { 260 add_width(start, width); 261 } 262 } 263 264 void 265 add_caseconv(int val, int wc) 266 { 267 ctype_node_t *ctn; 268 269 ctn = get_ctype(val); 270 if (ctn == NULL) { 271 INTERR; 272 return; 273 } 274 275 switch (last_kw) { 276 case T_TOUPPER: 277 ctn->toupper = wc; 278 break; 279 case T_TOLOWER: 280 ctn->tolower = wc; 281 break; 282 default: 283 INTERR; 284 break; 285 } 286 } 287 288 void 289 dump_ctype(void) 290 { 291 FILE *f; 292 _FileRuneLocale rl; 293 ctype_node_t *ctn, *last_ct, *last_lo, *last_up; 294 _FileRuneEntry *ct = NULL; 295 _FileRuneEntry *lo = NULL; 296 _FileRuneEntry *up = NULL; 297 wchar_t wc; 298 299 (void) memset(&rl, 0, sizeof (rl)); 300 last_ct = NULL; 301 last_lo = NULL; 302 last_up = NULL; 303 304 if ((f = open_category()) == NULL) 305 return; 306 307 (void) memcpy(rl.magic, _FILE_RUNE_MAGIC_1, 8); 308 (void) strncpy(rl.encoding, get_wide_encoding(), sizeof (rl.encoding)); 309 310 /* 311 * Initialize the identity map. 312 */ 313 for (wc = 0; (unsigned)wc < _CACHED_RUNES; wc++) { 314 rl.maplower[wc] = wc; 315 rl.mapupper[wc] = wc; 316 } 317 318 RB_FOREACH(ctn, ctypes, &ctypes) { 319 int conflict = 0; 320 321 wc = ctn->wc; 322 323 /* 324 * POSIX requires certain portable characters have 325 * certain types. Add them if they are missing. 326 */ 327 if ((wc >= 1) && (wc <= 127)) { 328 if ((wc >= 'A') && (wc <= 'Z')) 329 ctn->ctype |= _ISUPPER; 330 if ((wc >= 'a') && (wc <= 'z')) 331 ctn->ctype |= _ISLOWER; 332 if ((wc >= '0') && (wc <= '9')) 333 ctn->ctype |= _ISDIGIT; 334 if (strchr(" \f\n\r\t\v", (char)wc) != NULL) 335 ctn->ctype |= _ISSPACE; 336 if (strchr("0123456789ABCDEFabcdef", (char)wc) != NULL) 337 ctn->ctype |= _ISXDIGIT; 338 if (strchr(" \t", (char)wc)) 339 ctn->ctype |= _ISBLANK; 340 if (wc == ' ') 341 ctn->ctype |= _ISPRINT; 342 343 /* 344 * Technically these settings are only 345 * required for the C locale. However, it 346 * turns out that because of the historical 347 * version of isprint(), we need them for all 348 * locales as well. Note that these are not 349 * necessarily valid punctation characters in 350 * the current language, but ispunct() needs 351 * to return TRUE for them. 352 */ 353 if (strchr("!\"'#$%&()*+,-./:;<=>?@[\\]^_`{|}~", 354 (char)wc)) 355 ctn->ctype |= _ISPUNCT; 356 } 357 358 /* 359 * POSIX also requires that certain types imply 360 * others. Add any inferred types here. 361 */ 362 if (ctn->ctype & (_ISUPPER |_ISLOWER)) 363 ctn->ctype |= _ISALPHA; 364 if (ctn->ctype & _ISDIGIT) 365 ctn->ctype |= _ISXDIGIT; 366 if (ctn->ctype & _ISBLANK) 367 ctn->ctype |= _ISSPACE; 368 if (ctn->ctype & (_ISALPHA|_ISDIGIT|_ISXDIGIT)) 369 ctn->ctype |= _ISGRAPH; 370 if (ctn->ctype & _ISGRAPH) 371 ctn->ctype |= _ISPRINT; 372 373 /* 374 * Finally, POSIX requires that certain combinations 375 * are invalid. We don't flag this as a fatal error, 376 * but we will warn about. 377 */ 378 if ((ctn->ctype & _ISALPHA) && 379 (ctn->ctype & (_ISPUNCT|_ISDIGIT))) 380 conflict++; 381 if ((ctn->ctype & _ISPUNCT) & 382 (ctn->ctype & (_ISDIGIT|_ISALPHA|_ISXDIGIT))) 383 conflict++; 384 if ((ctn->ctype & _ISSPACE) && (ctn->ctype & _ISGRAPH)) 385 conflict++; 386 if ((ctn->ctype & _ISCNTRL) & _ISPRINT) 387 conflict++; 388 if ((wc == ' ') && (ctn->ctype & (_ISPUNCT|_ISGRAPH))) 389 conflict++; 390 391 if (conflict) { 392 warn("conflicting classes for character 0x%x (%x)", 393 wc, ctn->ctype); 394 } 395 /* 396 * Handle the lower 256 characters using the simple 397 * optimization. Note that if we have not defined the 398 * upper/lower case, then we identity map it. 399 */ 400 if ((unsigned)wc < _CACHED_RUNES) { 401 rl.runetype[wc] = ctn->ctype; 402 if (ctn->tolower) 403 rl.maplower[wc] = ctn->tolower; 404 if (ctn->toupper) 405 rl.mapupper[wc] = ctn->toupper; 406 continue; 407 } 408 409 if ((last_ct != NULL) && (last_ct->ctype == ctn->ctype) && 410 (last_ct->wc + 1 == wc)) { 411 ct[rl.runetype_ext_nranges-1].max = wc; 412 } else { 413 rl.runetype_ext_nranges++; 414 ct = realloc(ct, 415 sizeof (*ct) * rl.runetype_ext_nranges); 416 ct[rl.runetype_ext_nranges - 1].min = wc; 417 ct[rl.runetype_ext_nranges - 1].max = wc; 418 ct[rl.runetype_ext_nranges - 1].map = ctn->ctype; 419 } 420 last_ct = ctn; 421 if (ctn->tolower == 0) { 422 last_lo = NULL; 423 } else if ((last_lo != NULL) && 424 (last_lo->tolower + 1 == ctn->tolower)) { 425 lo[rl.maplower_ext_nranges-1].max = wc; 426 last_lo = ctn; 427 } else { 428 rl.maplower_ext_nranges++; 429 lo = realloc(lo, 430 sizeof (*lo) * rl.maplower_ext_nranges); 431 lo[rl.maplower_ext_nranges - 1].min = wc; 432 lo[rl.maplower_ext_nranges - 1].max = wc; 433 lo[rl.maplower_ext_nranges - 1].map = ctn->tolower; 434 last_lo = ctn; 435 } 436 437 if (ctn->toupper == 0) { 438 last_up = NULL; 439 } else if ((last_up != NULL) && 440 (last_up->toupper + 1 == ctn->toupper)) { 441 up[rl.mapupper_ext_nranges-1].max = wc; 442 last_up = ctn; 443 } else { 444 rl.mapupper_ext_nranges++; 445 up = realloc(up, 446 sizeof (*up) * rl.mapupper_ext_nranges); 447 up[rl.mapupper_ext_nranges - 1].min = wc; 448 up[rl.mapupper_ext_nranges - 1].max = wc; 449 up[rl.mapupper_ext_nranges - 1].map = ctn->toupper; 450 last_up = ctn; 451 } 452 } 453 454 if ((wr_category(&rl, sizeof (rl), f) < 0) || 455 (wr_category(ct, sizeof (*ct) * rl.runetype_ext_nranges, f) < 0) || 456 (wr_category(lo, sizeof (*lo) * rl.maplower_ext_nranges, f) < 0) || 457 (wr_category(up, sizeof (*up) * rl.mapupper_ext_nranges, f) < 0)) { 458 return; 459 } 460 461 close_category(f); 462 } 463