1 /* $NetBSD: ucgendat.c,v 1.1.1.3 2010/12/12 15:21:57 adam Exp $ */ 2 3 /* OpenLDAP: pkg/ldap/libraries/liblunicode/ucdata/ucgendat.c,v 1.39.2.5 2010/04/13 20:23:04 kurt Exp */ 4 /* This work is part of OpenLDAP Software <http://www.openldap.org/>. 5 * 6 * Copyright 1998-2010 The OpenLDAP Foundation. 7 * All rights reserved. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted only as authorized by the OpenLDAP 11 * Public License. 12 * 13 * A copy of this license is available in file LICENSE in the 14 * top-level directory of the distribution or, alternatively, at 15 * <http://www.OpenLDAP.org/license.html>. 16 */ 17 /* Copyright 2001 Computing Research Labs, New Mexico State University 18 * 19 * Permission is hereby granted, free of charge, to any person obtaining a 20 * copy of this software and associated documentation files (the "Software"), 21 * to deal in the Software without restriction, including without limitation 22 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 23 * and/or sell copies of the Software, and to permit persons to whom the 24 * Software is furnished to do so, subject to the following conditions: 25 * 26 * The above copyright notice and this permission notice shall be included in 27 * all copies or substantial portions of the Software. 28 * 29 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 30 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 31 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 32 * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY 33 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT 34 * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR 35 * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 36 */ 37 /* Id: ucgendat.c,v 1.4 2001/01/02 18:46:20 mleisher Exp" */ 38 39 #include "portable.h" 40 #include "ldap_config.h" 41 42 #include <stdio.h> 43 #include <ac/ctype.h> 44 #include <ac/stdlib.h> 45 #include <ac/string.h> 46 #include <ac/unistd.h> 47 48 #include <ac/bytes.h> 49 50 #include <lutil.h> 51 52 #ifndef HARDCODE_DATA 53 #define HARDCODE_DATA 1 54 #endif 55 56 #undef ishdigit 57 #define ishdigit(cc) (((cc) >= '0' && (cc) <= '9') ||\ 58 ((cc) >= 'A' && (cc) <= 'F') ||\ 59 ((cc) >= 'a' && (cc) <= 'f')) 60 61 /* 62 * A header written to the output file with the byte-order-mark and the number 63 * of property nodes. 64 */ 65 static ac_uint2 hdr[2] = {0xfeff, 0}; 66 67 #define NUMPROPS 50 68 #define NEEDPROPS (NUMPROPS + (4 - (NUMPROPS & 3))) 69 70 typedef struct { 71 char *name; 72 int len; 73 } _prop_t; 74 75 /* 76 * List of properties expected to be found in the Unicode Character Database 77 * including some implementation specific properties. 78 * 79 * The implementation specific properties are: 80 * Cm = Composed (can be decomposed) 81 * Nb = Non-breaking 82 * Sy = Symmetric (has left and right forms) 83 * Hd = Hex digit 84 * Qm = Quote marks 85 * Mr = Mirroring 86 * Ss = Space, other 87 * Cp = Defined character 88 */ 89 static _prop_t props[NUMPROPS] = { 90 {"Mn", 2}, {"Mc", 2}, {"Me", 2}, {"Nd", 2}, {"Nl", 2}, {"No", 2}, 91 {"Zs", 2}, {"Zl", 2}, {"Zp", 2}, {"Cc", 2}, {"Cf", 2}, {"Cs", 2}, 92 {"Co", 2}, {"Cn", 2}, {"Lu", 2}, {"Ll", 2}, {"Lt", 2}, {"Lm", 2}, 93 {"Lo", 2}, {"Pc", 2}, {"Pd", 2}, {"Ps", 2}, {"Pe", 2}, {"Po", 2}, 94 {"Sm", 2}, {"Sc", 2}, {"Sk", 2}, {"So", 2}, {"L", 1}, {"R", 1}, 95 {"EN", 2}, {"ES", 2}, {"ET", 2}, {"AN", 2}, {"CS", 2}, {"B", 1}, 96 {"S", 1}, {"WS", 2}, {"ON", 2}, 97 {"Cm", 2}, {"Nb", 2}, {"Sy", 2}, {"Hd", 2}, {"Qm", 2}, {"Mr", 2}, 98 {"Ss", 2}, {"Cp", 2}, {"Pi", 2}, {"Pf", 2}, {"AL", 2} 99 }; 100 101 typedef struct { 102 ac_uint4 *ranges; 103 ac_uint2 used; 104 ac_uint2 size; 105 } _ranges_t; 106 107 static _ranges_t proptbl[NUMPROPS]; 108 109 /* 110 * Make sure this array is sized to be on a 4-byte boundary at compile time. 111 */ 112 static ac_uint2 propcnt[NEEDPROPS]; 113 114 /* 115 * Array used to collect a decomposition before adding it to the decomposition 116 * table. 117 */ 118 static ac_uint4 dectmp[64]; 119 static ac_uint4 dectmp_size; 120 121 typedef struct { 122 ac_uint4 code; 123 ac_uint2 size; 124 ac_uint2 used; 125 ac_uint4 *decomp; 126 } _decomp_t; 127 128 /* 129 * List of decomposition. Created and expanded in order as the characters are 130 * encountered. First list contains canonical mappings, second also includes 131 * compatibility mappings. 132 */ 133 static _decomp_t *decomps; 134 static ac_uint4 decomps_used; 135 static ac_uint4 decomps_size; 136 137 static _decomp_t *kdecomps; 138 static ac_uint4 kdecomps_used; 139 static ac_uint4 kdecomps_size; 140 141 /* 142 * Composition exclusion table stuff. 143 */ 144 #define COMPEX_SET(c) (compexs[(c) >> 5] |= (1 << ((c) & 31))) 145 #define COMPEX_TEST(c) (compexs[(c) >> 5] & (1 << ((c) & 31))) 146 static ac_uint4 compexs[8192]; 147 148 /* 149 * Struct for holding a composition pair, and array of composition pairs 150 */ 151 typedef struct { 152 ac_uint4 comp; 153 ac_uint4 count; 154 ac_uint4 code1; 155 ac_uint4 code2; 156 } _comp_t; 157 158 static _comp_t *comps; 159 static ac_uint4 comps_used; 160 161 /* 162 * Types and lists for handling lists of case mappings. 163 */ 164 typedef struct { 165 ac_uint4 key; 166 ac_uint4 other1; 167 ac_uint4 other2; 168 } _case_t; 169 170 static _case_t *upper; 171 static _case_t *lower; 172 static _case_t *title; 173 static ac_uint4 upper_used; 174 static ac_uint4 upper_size; 175 static ac_uint4 lower_used; 176 static ac_uint4 lower_size; 177 static ac_uint4 title_used; 178 static ac_uint4 title_size; 179 180 /* 181 * Array used to collect case mappings before adding them to a list. 182 */ 183 static ac_uint4 cases[3]; 184 185 /* 186 * An array to hold ranges for combining classes. 187 */ 188 static ac_uint4 *ccl; 189 static ac_uint4 ccl_used; 190 static ac_uint4 ccl_size; 191 192 /* 193 * Structures for handling numbers. 194 */ 195 typedef struct { 196 ac_uint4 code; 197 ac_uint4 idx; 198 } _codeidx_t; 199 200 typedef struct { 201 short numerator; 202 short denominator; 203 } _num_t; 204 205 /* 206 * Arrays to hold the mapping of codes to numbers. 207 */ 208 static _codeidx_t *ncodes; 209 static ac_uint4 ncodes_used; 210 static ac_uint4 ncodes_size; 211 212 static _num_t *nums; 213 static ac_uint4 nums_used; 214 static ac_uint4 nums_size; 215 216 /* 217 * Array for holding numbers. 218 */ 219 static _num_t *nums; 220 static ac_uint4 nums_used; 221 static ac_uint4 nums_size; 222 223 static void 224 add_range(ac_uint4 start, ac_uint4 end, char *p1, char *p2) 225 { 226 int i, j, k, len; 227 _ranges_t *rlp; 228 char *name; 229 230 for (k = 0; k < 2; k++) { 231 if (k == 0) { 232 name = p1; 233 len = 2; 234 } else { 235 if (p2 == 0) 236 break; 237 238 name = p2; 239 len = 1; 240 } 241 242 for (i = 0; i < NUMPROPS; i++) { 243 if (props[i].len == len && memcmp(props[i].name, name, len) == 0) 244 break; 245 } 246 247 if (i == NUMPROPS) 248 continue; 249 250 rlp = &proptbl[i]; 251 252 /* 253 * Resize the range list if necessary. 254 */ 255 if (rlp->used == rlp->size) { 256 if (rlp->size == 0) 257 rlp->ranges = (ac_uint4 *) 258 malloc(sizeof(ac_uint4) << 3); 259 else 260 rlp->ranges = (ac_uint4 *) 261 realloc((char *) rlp->ranges, 262 sizeof(ac_uint4) * (rlp->size + 8)); 263 rlp->size += 8; 264 } 265 266 /* 267 * If this is the first code for this property list, just add it 268 * and return. 269 */ 270 if (rlp->used == 0) { 271 rlp->ranges[0] = start; 272 rlp->ranges[1] = end; 273 rlp->used += 2; 274 continue; 275 } 276 277 /* 278 * Optimize the case of adding the range to the end. 279 */ 280 j = rlp->used - 1; 281 if (start > rlp->ranges[j]) { 282 j = rlp->used; 283 rlp->ranges[j++] = start; 284 rlp->ranges[j++] = end; 285 rlp->used = j; 286 continue; 287 } 288 289 /* 290 * Need to locate the insertion point. 291 */ 292 for (i = 0; 293 i < rlp->used && start > rlp->ranges[i + 1] + 1; i += 2) ; 294 295 /* 296 * If the start value lies in the current range, then simply set the 297 * new end point of the range to the end value passed as a parameter. 298 */ 299 if (rlp->ranges[i] <= start && start <= rlp->ranges[i + 1] + 1) { 300 rlp->ranges[i + 1] = end; 301 return; 302 } 303 304 /* 305 * Shift following values up by two. 306 */ 307 for (j = rlp->used; j > i; j -= 2) { 308 rlp->ranges[j] = rlp->ranges[j - 2]; 309 rlp->ranges[j + 1] = rlp->ranges[j - 1]; 310 } 311 312 /* 313 * Add the new range at the insertion point. 314 */ 315 rlp->ranges[i] = start; 316 rlp->ranges[i + 1] = end; 317 rlp->used += 2; 318 } 319 } 320 321 static void 322 ordered_range_insert(ac_uint4 c, char *name, int len) 323 { 324 int i, j; 325 ac_uint4 s, e; 326 _ranges_t *rlp; 327 328 if (len == 0) 329 return; 330 331 /* 332 * Deal with directionality codes introduced in Unicode 3.0. 333 */ 334 if ((len == 2 && memcmp(name, "BN", 2) == 0) || 335 (len == 3 && 336 (memcmp(name, "NSM", 3) == 0 || memcmp(name, "PDF", 3) == 0 || 337 memcmp(name, "LRE", 3) == 0 || memcmp(name, "LRO", 3) == 0 || 338 memcmp(name, "RLE", 3) == 0 || memcmp(name, "RLO", 3) == 0))) { 339 /* 340 * Mark all of these as Other Neutral to preserve compatibility with 341 * older versions. 342 */ 343 len = 2; 344 name = "ON"; 345 } 346 347 for (i = 0; i < NUMPROPS; i++) { 348 if (props[i].len == len && memcmp(props[i].name, name, len) == 0) 349 break; 350 } 351 352 if (i == NUMPROPS) 353 return; 354 355 /* 356 * Have a match, so insert the code in order. 357 */ 358 rlp = &proptbl[i]; 359 360 /* 361 * Resize the range list if necessary. 362 */ 363 if (rlp->used == rlp->size) { 364 if (rlp->size == 0) 365 rlp->ranges = (ac_uint4 *) 366 malloc(sizeof(ac_uint4) << 3); 367 else 368 rlp->ranges = (ac_uint4 *) 369 realloc((char *) rlp->ranges, 370 sizeof(ac_uint4) * (rlp->size + 8)); 371 rlp->size += 8; 372 } 373 374 /* 375 * If this is the first code for this property list, just add it 376 * and return. 377 */ 378 if (rlp->used == 0) { 379 rlp->ranges[0] = rlp->ranges[1] = c; 380 rlp->used += 2; 381 return; 382 } 383 384 /* 385 * Optimize the cases of extending the last range and adding new ranges to 386 * the end. 387 */ 388 j = rlp->used - 1; 389 e = rlp->ranges[j]; 390 s = rlp->ranges[j - 1]; 391 392 if (c == e + 1) { 393 /* 394 * Extend the last range. 395 */ 396 rlp->ranges[j] = c; 397 return; 398 } 399 400 if (c > e + 1) { 401 /* 402 * Start another range on the end. 403 */ 404 j = rlp->used; 405 rlp->ranges[j] = rlp->ranges[j + 1] = c; 406 rlp->used += 2; 407 return; 408 } 409 410 if (c >= s) 411 /* 412 * The code is a duplicate of a code in the last range, so just return. 413 */ 414 return; 415 416 /* 417 * The code should be inserted somewhere before the last range in the 418 * list. Locate the insertion point. 419 */ 420 for (i = 0; 421 i < rlp->used && c > rlp->ranges[i + 1] + 1; i += 2) ; 422 423 s = rlp->ranges[i]; 424 e = rlp->ranges[i + 1]; 425 426 if (c == e + 1) 427 /* 428 * Simply extend the current range. 429 */ 430 rlp->ranges[i + 1] = c; 431 else if (c < s) { 432 /* 433 * Add a new entry before the current location. Shift all entries 434 * before the current one up by one to make room. 435 */ 436 for (j = rlp->used; j > i; j -= 2) { 437 rlp->ranges[j] = rlp->ranges[j - 2]; 438 rlp->ranges[j + 1] = rlp->ranges[j - 1]; 439 } 440 rlp->ranges[i] = rlp->ranges[i + 1] = c; 441 442 rlp->used += 2; 443 } 444 } 445 446 static void 447 add_decomp(ac_uint4 code, short compat) 448 { 449 ac_uint4 i, j, size; 450 _decomp_t **pdecomps; 451 ac_uint4 *pdecomps_used; 452 ac_uint4 *pdecomps_size; 453 454 if (compat) { 455 pdecomps = &kdecomps; 456 pdecomps_used = &kdecomps_used; 457 pdecomps_size = &kdecomps_size; 458 } else { 459 pdecomps = &decomps; 460 pdecomps_used = &decomps_used; 461 pdecomps_size = &decomps_size; 462 } 463 464 /* 465 * Add the code to the composite property. 466 */ 467 if (!compat) { 468 ordered_range_insert(code, "Cm", 2); 469 } 470 471 /* 472 * Locate the insertion point for the code. 473 */ 474 for (i = 0; i < *pdecomps_used && code > (*pdecomps)[i].code; i++) ; 475 476 /* 477 * Allocate space for a new decomposition. 478 */ 479 if (*pdecomps_used == *pdecomps_size) { 480 if (*pdecomps_size == 0) 481 *pdecomps = (_decomp_t *) malloc(sizeof(_decomp_t) << 3); 482 else 483 *pdecomps = (_decomp_t *) 484 realloc((char *) *pdecomps, 485 sizeof(_decomp_t) * (*pdecomps_size + 8)); 486 (void) memset((char *) (*pdecomps + *pdecomps_size), '\0', 487 sizeof(_decomp_t) << 3); 488 *pdecomps_size += 8; 489 } 490 491 if (i < *pdecomps_used && code != (*pdecomps)[i].code) { 492 /* 493 * Shift the decomps up by one if the codes don't match. 494 */ 495 for (j = *pdecomps_used; j > i; j--) 496 (void) AC_MEMCPY((char *) &(*pdecomps)[j], (char *) &(*pdecomps)[j - 1], 497 sizeof(_decomp_t)); 498 } 499 500 /* 501 * Insert or replace a decomposition. 502 */ 503 size = dectmp_size + (4 - (dectmp_size & 3)); 504 if ((*pdecomps)[i].size < size) { 505 if ((*pdecomps)[i].size == 0) 506 (*pdecomps)[i].decomp = (ac_uint4 *) 507 malloc(sizeof(ac_uint4) * size); 508 else 509 (*pdecomps)[i].decomp = (ac_uint4 *) 510 realloc((char *) (*pdecomps)[i].decomp, 511 sizeof(ac_uint4) * size); 512 (*pdecomps)[i].size = size; 513 } 514 515 if ((*pdecomps)[i].code != code) 516 (*pdecomps_used)++; 517 518 (*pdecomps)[i].code = code; 519 (*pdecomps)[i].used = dectmp_size; 520 (void) AC_MEMCPY((char *) (*pdecomps)[i].decomp, (char *) dectmp, 521 sizeof(ac_uint4) * dectmp_size); 522 523 /* 524 * NOTICE: This needs changing later so it is more general than simply 525 * pairs. This calculation is done here to simplify allocation elsewhere. 526 */ 527 if (!compat && dectmp_size == 2) 528 comps_used++; 529 } 530 531 static void 532 add_title(ac_uint4 code) 533 { 534 ac_uint4 i, j; 535 536 /* 537 * Always map the code to itself. 538 */ 539 cases[2] = code; 540 541 if (title_used == title_size) { 542 if (title_size == 0) 543 title = (_case_t *) malloc(sizeof(_case_t) << 3); 544 else 545 title = (_case_t *) realloc((char *) title, 546 sizeof(_case_t) * (title_size + 8)); 547 title_size += 8; 548 } 549 550 /* 551 * Locate the insertion point. 552 */ 553 for (i = 0; i < title_used && code > title[i].key; i++) ; 554 555 if (i < title_used) { 556 /* 557 * Shift the array up by one. 558 */ 559 for (j = title_used; j > i; j--) 560 (void) AC_MEMCPY((char *) &title[j], (char *) &title[j - 1], 561 sizeof(_case_t)); 562 } 563 564 title[i].key = cases[2]; /* Title */ 565 title[i].other1 = cases[0]; /* Upper */ 566 title[i].other2 = cases[1]; /* Lower */ 567 568 title_used++; 569 } 570 571 static void 572 add_upper(ac_uint4 code) 573 { 574 ac_uint4 i, j; 575 576 /* 577 * Always map the code to itself. 578 */ 579 cases[0] = code; 580 581 /* 582 * If the title case character is not present, then make it the same as 583 * the upper case. 584 */ 585 if (cases[2] == 0) 586 cases[2] = code; 587 588 if (upper_used == upper_size) { 589 if (upper_size == 0) 590 upper = (_case_t *) malloc(sizeof(_case_t) << 3); 591 else 592 upper = (_case_t *) realloc((char *) upper, 593 sizeof(_case_t) * (upper_size + 8)); 594 upper_size += 8; 595 } 596 597 /* 598 * Locate the insertion point. 599 */ 600 for (i = 0; i < upper_used && code > upper[i].key; i++) ; 601 602 if (i < upper_used) { 603 /* 604 * Shift the array up by one. 605 */ 606 for (j = upper_used; j > i; j--) 607 (void) AC_MEMCPY((char *) &upper[j], (char *) &upper[j - 1], 608 sizeof(_case_t)); 609 } 610 611 upper[i].key = cases[0]; /* Upper */ 612 upper[i].other1 = cases[1]; /* Lower */ 613 upper[i].other2 = cases[2]; /* Title */ 614 615 upper_used++; 616 } 617 618 static void 619 add_lower(ac_uint4 code) 620 { 621 ac_uint4 i, j; 622 623 /* 624 * Always map the code to itself. 625 */ 626 cases[1] = code; 627 628 /* 629 * If the title case character is empty, then make it the same as the 630 * upper case. 631 */ 632 if (cases[2] == 0) 633 cases[2] = cases[0]; 634 635 if (lower_used == lower_size) { 636 if (lower_size == 0) 637 lower = (_case_t *) malloc(sizeof(_case_t) << 3); 638 else 639 lower = (_case_t *) realloc((char *) lower, 640 sizeof(_case_t) * (lower_size + 8)); 641 lower_size += 8; 642 } 643 644 /* 645 * Locate the insertion point. 646 */ 647 for (i = 0; i < lower_used && code > lower[i].key; i++) ; 648 649 if (i < lower_used) { 650 /* 651 * Shift the array up by one. 652 */ 653 for (j = lower_used; j > i; j--) 654 (void) AC_MEMCPY((char *) &lower[j], (char *) &lower[j - 1], 655 sizeof(_case_t)); 656 } 657 658 lower[i].key = cases[1]; /* Lower */ 659 lower[i].other1 = cases[0]; /* Upper */ 660 lower[i].other2 = cases[2]; /* Title */ 661 662 lower_used++; 663 } 664 665 static void 666 ordered_ccl_insert(ac_uint4 c, ac_uint4 ccl_code) 667 { 668 ac_uint4 i, j; 669 670 if (ccl_used == ccl_size) { 671 if (ccl_size == 0) 672 ccl = (ac_uint4 *) malloc(sizeof(ac_uint4) * 24); 673 else 674 ccl = (ac_uint4 *) 675 realloc((char *) ccl, sizeof(ac_uint4) * (ccl_size + 24)); 676 ccl_size += 24; 677 } 678 679 /* 680 * Optimize adding the first item. 681 */ 682 if (ccl_used == 0) { 683 ccl[0] = ccl[1] = c; 684 ccl[2] = ccl_code; 685 ccl_used += 3; 686 return; 687 } 688 689 /* 690 * Handle the special case of extending the range on the end. This 691 * requires that the combining class codes are the same. 692 */ 693 if (ccl_code == ccl[ccl_used - 1] && c == ccl[ccl_used - 2] + 1) { 694 ccl[ccl_used - 2] = c; 695 return; 696 } 697 698 /* 699 * Handle the special case of adding another range on the end. 700 */ 701 if (c > ccl[ccl_used - 2] + 1 || 702 (c == ccl[ccl_used - 2] + 1 && ccl_code != ccl[ccl_used - 1])) { 703 ccl[ccl_used++] = c; 704 ccl[ccl_used++] = c; 705 ccl[ccl_used++] = ccl_code; 706 return; 707 } 708 709 /* 710 * Locate either the insertion point or range for the code. 711 */ 712 for (i = 0; i < ccl_used && c > ccl[i + 1] + 1; i += 3) ; 713 714 if (ccl_code == ccl[i + 2] && c == ccl[i + 1] + 1) { 715 /* 716 * Extend an existing range. 717 */ 718 ccl[i + 1] = c; 719 return; 720 } else if (c < ccl[i]) { 721 /* 722 * Start a new range before the current location. 723 */ 724 for (j = ccl_used; j > i; j -= 3) { 725 ccl[j] = ccl[j - 3]; 726 ccl[j - 1] = ccl[j - 4]; 727 ccl[j - 2] = ccl[j - 5]; 728 } 729 ccl[i] = ccl[i + 1] = c; 730 ccl[i + 2] = ccl_code; 731 } 732 } 733 734 /* 735 * Adds a number if it does not already exist and returns an index value 736 * multiplied by 2. 737 */ 738 static ac_uint4 739 make_number(short num, short denom) 740 { 741 ac_uint4 n; 742 743 /* 744 * Determine if the number already exists. 745 */ 746 for (n = 0; n < nums_used; n++) { 747 if (nums[n].numerator == num && nums[n].denominator == denom) 748 return n << 1; 749 } 750 751 if (nums_used == nums_size) { 752 if (nums_size == 0) 753 nums = (_num_t *) malloc(sizeof(_num_t) << 3); 754 else 755 nums = (_num_t *) realloc((char *) nums, 756 sizeof(_num_t) * (nums_size + 8)); 757 nums_size += 8; 758 } 759 760 n = nums_used++; 761 nums[n].numerator = num; 762 nums[n].denominator = denom; 763 764 return n << 1; 765 } 766 767 static void 768 add_number(ac_uint4 code, short num, short denom) 769 { 770 ac_uint4 i, j; 771 772 /* 773 * Insert the code in order. 774 */ 775 for (i = 0; i < ncodes_used && code > ncodes[i].code; i++) ; 776 777 /* 778 * Handle the case of the codes matching and simply replace the number 779 * that was there before. 780 */ 781 if (i < ncodes_used && code == ncodes[i].code) { 782 ncodes[i].idx = make_number(num, denom); 783 return; 784 } 785 786 /* 787 * Resize the array if necessary. 788 */ 789 if (ncodes_used == ncodes_size) { 790 if (ncodes_size == 0) 791 ncodes = (_codeidx_t *) malloc(sizeof(_codeidx_t) << 3); 792 else 793 ncodes = (_codeidx_t *) 794 realloc((char *) ncodes, sizeof(_codeidx_t) * (ncodes_size + 8)); 795 796 ncodes_size += 8; 797 } 798 799 /* 800 * Shift things around to insert the code if necessary. 801 */ 802 if (i < ncodes_used) { 803 for (j = ncodes_used; j > i; j--) { 804 ncodes[j].code = ncodes[j - 1].code; 805 ncodes[j].idx = ncodes[j - 1].idx; 806 } 807 } 808 ncodes[i].code = code; 809 ncodes[i].idx = make_number(num, denom); 810 811 ncodes_used++; 812 } 813 814 /* 815 * This routine assumes that the line is a valid Unicode Character Database 816 * entry. 817 */ 818 static void 819 read_cdata(FILE *in) 820 { 821 ac_uint4 i, lineno, skip, code, ccl_code; 822 short wnum, neg, number[2], compat; 823 char line[512], *s, *e; 824 825 lineno = skip = 0; 826 while (fgets(line, sizeof(line), in)) { 827 if( (s=strchr(line, '\n')) ) *s = '\0'; 828 lineno++; 829 830 /* 831 * Skip blank lines and lines that start with a '#'. 832 */ 833 if (line[0] == 0 || line[0] == '#') 834 continue; 835 836 /* 837 * If lines need to be skipped, do it here. 838 */ 839 if (skip) { 840 skip--; 841 continue; 842 } 843 844 /* 845 * Collect the code. The code can be up to 6 hex digits in length to 846 * allow surrogates to be specified. 847 */ 848 for (s = line, i = code = 0; *s != ';' && i < 6; i++, s++) { 849 code <<= 4; 850 if (*s >= '0' && *s <= '9') 851 code += *s - '0'; 852 else if (*s >= 'A' && *s <= 'F') 853 code += (*s - 'A') + 10; 854 else if (*s >= 'a' && *s <= 'f') 855 code += (*s - 'a') + 10; 856 } 857 858 /* 859 * Handle the following special cases: 860 * 1. 4E00-9FA5 CJK Ideographs. 861 * 2. AC00-D7A3 Hangul Syllables. 862 * 3. D800-DFFF Surrogates. 863 * 4. E000-F8FF Private Use Area. 864 * 5. F900-FA2D Han compatibility. 865 * ...Plus additional ranges in newer Unicode versions... 866 */ 867 switch (code) { 868 case 0x3400: 869 /* CJK Ideograph Extension A */ 870 add_range(0x3400, 0x4db5, "Lo", "L"); 871 872 add_range(0x3400, 0x4db5, "Cp", 0); 873 874 skip = 1; 875 break; 876 case 0x4e00: 877 /* 878 * The Han ideographs. 879 */ 880 add_range(0x4e00, 0x9fff, "Lo", "L"); 881 882 /* 883 * Add the characters to the defined category. 884 */ 885 add_range(0x4e00, 0x9fa5, "Cp", 0); 886 887 skip = 1; 888 break; 889 case 0xac00: 890 /* 891 * The Hangul syllables. 892 */ 893 add_range(0xac00, 0xd7a3, "Lo", "L"); 894 895 /* 896 * Add the characters to the defined category. 897 */ 898 add_range(0xac00, 0xd7a3, "Cp", 0); 899 900 skip = 1; 901 break; 902 case 0xd800: 903 /* 904 * Make a range of all surrogates and assume some default 905 * properties. 906 */ 907 add_range(0x010000, 0x10ffff, "Cs", "L"); 908 skip = 5; 909 break; 910 case 0xe000: 911 /* 912 * The Private Use area. Add with a default set of properties. 913 */ 914 add_range(0xe000, 0xf8ff, "Co", "L"); 915 skip = 1; 916 break; 917 case 0xf900: 918 /* 919 * The CJK compatibility area. 920 */ 921 add_range(0xf900, 0xfaff, "Lo", "L"); 922 923 /* 924 * Add the characters to the defined category. 925 */ 926 add_range(0xf900, 0xfaff, "Cp", 0); 927 928 skip = 1; 929 break; 930 case 0x20000: 931 /* CJK Ideograph Extension B */ 932 add_range(0x20000, 0x2a6d6, "Lo", "L"); 933 934 add_range(0x20000, 0x2a6d6, "Cp", 0); 935 936 skip = 1; 937 break; 938 case 0xf0000: 939 /* Plane 15 private use */ 940 add_range(0xf0000, 0xffffd, "Co", "L"); 941 skip = 1; 942 break; 943 944 case 0x100000: 945 /* Plane 16 private use */ 946 add_range(0x100000, 0x10fffd, "Co", "L"); 947 skip = 1; 948 break; 949 } 950 951 if (skip) 952 continue; 953 954 /* 955 * Add the code to the defined category. 956 */ 957 ordered_range_insert(code, "Cp", 2); 958 959 /* 960 * Locate the first character property field. 961 */ 962 for (i = 0; *s != 0 && i < 2; s++) { 963 if (*s == ';') 964 i++; 965 } 966 for (e = s; *e && *e != ';'; e++) ; 967 968 ordered_range_insert(code, s, e - s); 969 970 /* 971 * Locate the combining class code. 972 */ 973 for (s = e; *s != 0 && i < 3; s++) { 974 if (*s == ';') 975 i++; 976 } 977 978 /* 979 * Convert the combining class code from decimal. 980 */ 981 for (ccl_code = 0, e = s; *e && *e != ';'; e++) 982 ccl_code = (ccl_code * 10) + (*e - '0'); 983 984 /* 985 * Add the code if it not 0. 986 */ 987 if (ccl_code != 0) 988 ordered_ccl_insert(code, ccl_code); 989 990 /* 991 * Locate the second character property field. 992 */ 993 for (s = e; *s != 0 && i < 4; s++) { 994 if (*s == ';') 995 i++; 996 } 997 for (e = s; *e && *e != ';'; e++) ; 998 999 ordered_range_insert(code, s, e - s); 1000 1001 /* 1002 * Check for a decomposition. 1003 */ 1004 s = ++e; 1005 if (*s != ';') { 1006 compat = *s == '<'; 1007 if (compat) { 1008 /* 1009 * Skip compatibility formatting tag. 1010 */ 1011 while (*s++ != '>'); 1012 } 1013 /* 1014 * Collect the codes of the decomposition. 1015 */ 1016 for (dectmp_size = 0; *s != ';'; ) { 1017 /* 1018 * Skip all leading non-hex digits. 1019 */ 1020 while (!ishdigit(*s)) 1021 s++; 1022 1023 for (dectmp[dectmp_size] = 0; ishdigit(*s); s++) { 1024 dectmp[dectmp_size] <<= 4; 1025 if (*s >= '0' && *s <= '9') 1026 dectmp[dectmp_size] += *s - '0'; 1027 else if (*s >= 'A' && *s <= 'F') 1028 dectmp[dectmp_size] += (*s - 'A') + 10; 1029 else if (*s >= 'a' && *s <= 'f') 1030 dectmp[dectmp_size] += (*s - 'a') + 10; 1031 } 1032 dectmp_size++; 1033 } 1034 1035 /* 1036 * If there are any codes in the temporary decomposition array, 1037 * then add the character with its decomposition. 1038 */ 1039 if (dectmp_size > 0) { 1040 if (!compat) { 1041 add_decomp(code, 0); 1042 } 1043 add_decomp(code, 1); 1044 } 1045 } 1046 1047 /* 1048 * Skip to the number field. 1049 */ 1050 for (i = 0; i < 3 && *s; s++) { 1051 if (*s == ';') 1052 i++; 1053 } 1054 1055 /* 1056 * Scan the number in. 1057 */ 1058 number[0] = number[1] = 0; 1059 for (e = s, neg = wnum = 0; *e && *e != ';'; e++) { 1060 if (*e == '-') { 1061 neg = 1; 1062 continue; 1063 } 1064 1065 if (*e == '/') { 1066 /* 1067 * Move the the denominator of the fraction. 1068 */ 1069 if (neg) 1070 number[wnum] *= -1; 1071 neg = 0; 1072 e++; 1073 wnum++; 1074 } 1075 number[wnum] = (number[wnum] * 10) + (*e - '0'); 1076 } 1077 1078 if (e > s) { 1079 /* 1080 * Adjust the denominator in case of integers and add the number. 1081 */ 1082 if (wnum == 0) 1083 number[1] = 1; 1084 1085 add_number(code, number[0], number[1]); 1086 } 1087 1088 /* 1089 * Skip to the start of the possible case mappings. 1090 */ 1091 for (s = e, i = 0; i < 4 && *s; s++) { 1092 if (*s == ';') 1093 i++; 1094 } 1095 1096 /* 1097 * Collect the case mappings. 1098 */ 1099 cases[0] = cases[1] = cases[2] = 0; 1100 for (i = 0; i < 3; i++) { 1101 while (ishdigit(*s)) { 1102 cases[i] <<= 4; 1103 if (*s >= '0' && *s <= '9') 1104 cases[i] += *s - '0'; 1105 else if (*s >= 'A' && *s <= 'F') 1106 cases[i] += (*s - 'A') + 10; 1107 else if (*s >= 'a' && *s <= 'f') 1108 cases[i] += (*s - 'a') + 10; 1109 s++; 1110 } 1111 if (*s == ';') 1112 s++; 1113 } 1114 if (cases[0] && cases[1]) 1115 /* 1116 * Add the upper and lower mappings for a title case character. 1117 */ 1118 add_title(code); 1119 else if (cases[1]) 1120 /* 1121 * Add the lower and title case mappings for the upper case 1122 * character. 1123 */ 1124 add_upper(code); 1125 else if (cases[0]) 1126 /* 1127 * Add the upper and title case mappings for the lower case 1128 * character. 1129 */ 1130 add_lower(code); 1131 } 1132 } 1133 1134 static _decomp_t * 1135 find_decomp(ac_uint4 code, short compat) 1136 { 1137 long l, r, m; 1138 _decomp_t *decs; 1139 1140 l = 0; 1141 r = (compat ? kdecomps_used : decomps_used) - 1; 1142 decs = compat ? kdecomps : decomps; 1143 while (l <= r) { 1144 m = (l + r) >> 1; 1145 if (code > decs[m].code) 1146 l = m + 1; 1147 else if (code < decs[m].code) 1148 r = m - 1; 1149 else 1150 return &decs[m]; 1151 } 1152 return 0; 1153 } 1154 1155 static void 1156 decomp_it(_decomp_t *d, short compat) 1157 { 1158 ac_uint4 i; 1159 _decomp_t *dp; 1160 1161 for (i = 0; i < d->used; i++) { 1162 if ((dp = find_decomp(d->decomp[i], compat)) != 0) 1163 decomp_it(dp, compat); 1164 else 1165 dectmp[dectmp_size++] = d->decomp[i]; 1166 } 1167 } 1168 1169 /* 1170 * Expand all decompositions by recursively decomposing each character 1171 * in the decomposition. 1172 */ 1173 static void 1174 expand_decomp(void) 1175 { 1176 ac_uint4 i; 1177 1178 for (i = 0; i < decomps_used; i++) { 1179 dectmp_size = 0; 1180 decomp_it(&decomps[i], 0); 1181 if (dectmp_size > 0) 1182 add_decomp(decomps[i].code, 0); 1183 } 1184 1185 for (i = 0; i < kdecomps_used; i++) { 1186 dectmp_size = 0; 1187 decomp_it(&kdecomps[i], 1); 1188 if (dectmp_size > 0) 1189 add_decomp(kdecomps[i].code, 1); 1190 } 1191 } 1192 1193 static int 1194 cmpcomps(const void *v_comp1, const void *v_comp2) 1195 { 1196 const _comp_t *comp1 = v_comp1, *comp2 = v_comp2; 1197 long diff = comp1->code1 - comp2->code1; 1198 1199 if (!diff) 1200 diff = comp1->code2 - comp2->code2; 1201 return (int) diff; 1202 } 1203 1204 /* 1205 * Load composition exclusion data 1206 */ 1207 static void 1208 read_compexdata(FILE *in) 1209 { 1210 ac_uint2 i; 1211 ac_uint4 code; 1212 char line[512], *s; 1213 1214 (void) memset((char *) compexs, 0, sizeof(compexs)); 1215 1216 while (fgets(line, sizeof(line), in)) { 1217 if( (s=strchr(line, '\n')) ) *s = '\0'; 1218 /* 1219 * Skip blank lines and lines that start with a '#'. 1220 */ 1221 if (line[0] == 0 || line[0] == '#') 1222 continue; 1223 1224 /* 1225 * Collect the code. Assume max 6 digits 1226 */ 1227 1228 for (s = line, i = code = 0; *s != '#' && i < 6; i++, s++) { 1229 if (isspace((unsigned char)*s)) break; 1230 code <<= 4; 1231 if (*s >= '0' && *s <= '9') 1232 code += *s - '0'; 1233 else if (*s >= 'A' && *s <= 'F') 1234 code += (*s - 'A') + 10; 1235 else if (*s >= 'a' && *s <= 'f') 1236 code += (*s - 'a') + 10; 1237 } 1238 COMPEX_SET(code); 1239 } 1240 } 1241 1242 /* 1243 * Creates array of compositions from decomposition array 1244 */ 1245 static void 1246 create_comps(void) 1247 { 1248 ac_uint4 i, cu; 1249 1250 comps = (_comp_t *) malloc(comps_used * sizeof(_comp_t)); 1251 1252 for (i = cu = 0; i < decomps_used; i++) { 1253 if (decomps[i].used != 2 || COMPEX_TEST(decomps[i].code)) 1254 continue; 1255 comps[cu].comp = decomps[i].code; 1256 comps[cu].count = 2; 1257 comps[cu].code1 = decomps[i].decomp[0]; 1258 comps[cu].code2 = decomps[i].decomp[1]; 1259 cu++; 1260 } 1261 comps_used = cu; 1262 qsort(comps, comps_used, sizeof(_comp_t), cmpcomps); 1263 } 1264 1265 #if HARDCODE_DATA 1266 static void 1267 write_case(FILE *out, _case_t *tab, int num, int first) 1268 { 1269 int i; 1270 1271 for (i=0; i<num; i++) { 1272 if (first) first = 0; 1273 else fprintf(out, ","); 1274 fprintf(out, "\n\t0x%08lx, 0x%08lx, 0x%08lx", 1275 (unsigned long) tab[i].key, (unsigned long) tab[i].other1, 1276 (unsigned long) tab[i].other2); 1277 } 1278 } 1279 1280 #define PREF "static const " 1281 1282 #endif 1283 1284 static void 1285 write_cdata(char *opath) 1286 { 1287 FILE *out; 1288 ac_uint4 bytes; 1289 ac_uint4 i, idx, nprops; 1290 #if !(HARDCODE_DATA) 1291 ac_uint2 casecnt[2]; 1292 #endif 1293 char path[BUFSIZ]; 1294 #if HARDCODE_DATA 1295 int j, k; 1296 1297 /***************************************************************** 1298 * 1299 * Generate the ctype data. 1300 * 1301 *****************************************************************/ 1302 1303 /* 1304 * Open the output file. 1305 */ 1306 snprintf(path, sizeof path, "%s" LDAP_DIRSEP "uctable.h", opath); 1307 if ((out = fopen(path, "w")) == 0) 1308 return; 1309 #else 1310 /* 1311 * Open the ctype.dat file. 1312 */ 1313 snprintf(path, sizeof path, "%s" LDAP_DIRSEP "ctype.dat", opath); 1314 if ((out = fopen(path, "wb")) == 0) 1315 return; 1316 #endif 1317 1318 /* 1319 * Collect the offsets for the properties. The offsets array is 1320 * on a 4-byte boundary to keep things efficient for architectures 1321 * that need such a thing. 1322 */ 1323 for (i = idx = 0; i < NUMPROPS; i++) { 1324 propcnt[i] = (proptbl[i].used != 0) ? idx : 0xffff; 1325 idx += proptbl[i].used; 1326 } 1327 1328 /* 1329 * Add the sentinel index which is used by the binary search as the upper 1330 * bound for a search. 1331 */ 1332 propcnt[i] = idx; 1333 1334 /* 1335 * Record the actual number of property lists. This may be different than 1336 * the number of offsets actually written because of aligning on a 4-byte 1337 * boundary. 1338 */ 1339 hdr[1] = NUMPROPS; 1340 1341 /* 1342 * Calculate the byte count needed and pad the property counts array to a 1343 * 4-byte boundary. 1344 */ 1345 if ((bytes = sizeof(ac_uint2) * (NUMPROPS + 1)) & 3) 1346 bytes += 4 - (bytes & 3); 1347 nprops = bytes / sizeof(ac_uint2); 1348 bytes += sizeof(ac_uint4) * idx; 1349 1350 #if HARDCODE_DATA 1351 fprintf(out, PREF "ac_uint4 _ucprop_size = %d;\n\n", NUMPROPS); 1352 1353 fprintf(out, PREF "ac_uint2 _ucprop_offsets[] = {"); 1354 1355 for (i = 0; i<nprops; i++) { 1356 if (i) fprintf(out, ","); 1357 if (!(i&7)) fprintf(out, "\n\t"); 1358 else fprintf(out, " "); 1359 fprintf(out, "0x%04x", propcnt[i]); 1360 } 1361 fprintf(out, "\n};\n\n"); 1362 1363 fprintf(out, PREF "ac_uint4 _ucprop_ranges[] = {"); 1364 1365 k = 0; 1366 for (i = 0; i < NUMPROPS; i++) { 1367 if (proptbl[i].used > 0) { 1368 for (j=0; j<proptbl[i].used; j++) { 1369 if (k) fprintf(out, ","); 1370 if (!(k&3)) fprintf(out,"\n\t"); 1371 else fprintf(out, " "); 1372 k++; 1373 fprintf(out, "0x%08lx", (unsigned long) proptbl[i].ranges[j]); 1374 } 1375 } 1376 } 1377 fprintf(out, "\n};\n\n"); 1378 #else 1379 /* 1380 * Write the header. 1381 */ 1382 fwrite((char *) hdr, sizeof(ac_uint2), 2, out); 1383 1384 /* 1385 * Write the byte count. 1386 */ 1387 fwrite((char *) &bytes, sizeof(ac_uint4), 1, out); 1388 1389 /* 1390 * Write the property list counts. 1391 */ 1392 fwrite((char *) propcnt, sizeof(ac_uint2), nprops, out); 1393 1394 /* 1395 * Write the property lists. 1396 */ 1397 for (i = 0; i < NUMPROPS; i++) { 1398 if (proptbl[i].used > 0) 1399 fwrite((char *) proptbl[i].ranges, sizeof(ac_uint4), 1400 proptbl[i].used, out); 1401 } 1402 1403 fclose(out); 1404 #endif 1405 1406 /***************************************************************** 1407 * 1408 * Generate the case mapping data. 1409 * 1410 *****************************************************************/ 1411 1412 #if HARDCODE_DATA 1413 fprintf(out, PREF "ac_uint4 _uccase_size = %ld;\n\n", 1414 (long) (upper_used + lower_used + title_used)); 1415 1416 fprintf(out, PREF "ac_uint2 _uccase_len[2] = {%ld, %ld};\n\n", 1417 (long) upper_used, (long) lower_used); 1418 fprintf(out, PREF "ac_uint4 _uccase_map[] = {"); 1419 1420 if (upper_used > 0) 1421 /* 1422 * Write the upper case table. 1423 */ 1424 write_case(out, upper, upper_used, 1); 1425 1426 if (lower_used > 0) 1427 /* 1428 * Write the lower case table. 1429 */ 1430 write_case(out, lower, lower_used, !upper_used); 1431 1432 if (title_used > 0) 1433 /* 1434 * Write the title case table. 1435 */ 1436 write_case(out, title, title_used, !(upper_used||lower_used)); 1437 1438 if (!(upper_used || lower_used || title_used)) 1439 fprintf(out, "\t0"); 1440 1441 fprintf(out, "\n};\n\n"); 1442 #else 1443 /* 1444 * Open the case.dat file. 1445 */ 1446 snprintf(path, sizeof path, "%s" LDAP_DIRSEP "case.dat", opath); 1447 if ((out = fopen(path, "wb")) == 0) 1448 return; 1449 1450 /* 1451 * Write the case mapping tables. 1452 */ 1453 hdr[1] = upper_used + lower_used + title_used; 1454 casecnt[0] = upper_used; 1455 casecnt[1] = lower_used; 1456 1457 /* 1458 * Write the header. 1459 */ 1460 fwrite((char *) hdr, sizeof(ac_uint2), 2, out); 1461 1462 /* 1463 * Write the upper and lower case table sizes. 1464 */ 1465 fwrite((char *) casecnt, sizeof(ac_uint2), 2, out); 1466 1467 if (upper_used > 0) 1468 /* 1469 * Write the upper case table. 1470 */ 1471 fwrite((char *) upper, sizeof(_case_t), upper_used, out); 1472 1473 if (lower_used > 0) 1474 /* 1475 * Write the lower case table. 1476 */ 1477 fwrite((char *) lower, sizeof(_case_t), lower_used, out); 1478 1479 if (title_used > 0) 1480 /* 1481 * Write the title case table. 1482 */ 1483 fwrite((char *) title, sizeof(_case_t), title_used, out); 1484 1485 fclose(out); 1486 #endif 1487 1488 /***************************************************************** 1489 * 1490 * Generate the composition data. 1491 * 1492 *****************************************************************/ 1493 1494 /* 1495 * Create compositions from decomposition data 1496 */ 1497 create_comps(); 1498 1499 #if HARDCODE_DATA 1500 fprintf(out, PREF "ac_uint4 _uccomp_size = %ld;\n\n", 1501 comps_used * 4L); 1502 1503 fprintf(out, PREF "ac_uint4 _uccomp_data[] = {"); 1504 1505 /* 1506 * Now, if comps exist, write them out. 1507 */ 1508 if (comps_used > 0) { 1509 for (i=0; i<comps_used; i++) { 1510 if (i) fprintf(out, ","); 1511 fprintf(out, "\n\t0x%08lx, 0x%08lx, 0x%08lx, 0x%08lx", 1512 (unsigned long) comps[i].comp, (unsigned long) comps[i].count, 1513 (unsigned long) comps[i].code1, (unsigned long) comps[i].code2); 1514 } 1515 } else { 1516 fprintf(out, "\t0"); 1517 } 1518 fprintf(out, "\n};\n\n"); 1519 #else 1520 /* 1521 * Open the comp.dat file. 1522 */ 1523 snprintf(path, sizeof path, "%s" LDAP_DIRSEP "comp.dat", opath); 1524 if ((out = fopen(path, "wb")) == 0) 1525 return; 1526 1527 /* 1528 * Write the header. 1529 */ 1530 hdr[1] = (ac_uint2) comps_used * 4; 1531 fwrite((char *) hdr, sizeof(ac_uint2), 2, out); 1532 1533 /* 1534 * Write out the byte count to maintain header size. 1535 */ 1536 bytes = comps_used * sizeof(_comp_t); 1537 fwrite((char *) &bytes, sizeof(ac_uint4), 1, out); 1538 1539 /* 1540 * Now, if comps exist, write them out. 1541 */ 1542 if (comps_used > 0) 1543 fwrite((char *) comps, sizeof(_comp_t), comps_used, out); 1544 1545 fclose(out); 1546 #endif 1547 1548 /***************************************************************** 1549 * 1550 * Generate the decomposition data. 1551 * 1552 *****************************************************************/ 1553 1554 /* 1555 * Fully expand all decompositions before generating the output file. 1556 */ 1557 expand_decomp(); 1558 1559 #if HARDCODE_DATA 1560 fprintf(out, PREF "ac_uint4 _ucdcmp_size = %ld;\n\n", 1561 decomps_used * 2L); 1562 1563 fprintf(out, PREF "ac_uint4 _ucdcmp_nodes[] = {"); 1564 1565 if (decomps_used) { 1566 /* 1567 * Write the list of decomp nodes. 1568 */ 1569 for (i = idx = 0; i < decomps_used; i++) { 1570 fprintf(out, "\n\t0x%08lx, 0x%08lx,", 1571 (unsigned long) decomps[i].code, (unsigned long) idx); 1572 idx += decomps[i].used; 1573 } 1574 1575 /* 1576 * Write the sentinel index as the last decomp node. 1577 */ 1578 fprintf(out, "\n\t0x%08lx\n};\n\n", (unsigned long) idx); 1579 1580 fprintf(out, PREF "ac_uint4 _ucdcmp_decomp[] = {"); 1581 /* 1582 * Write the decompositions themselves. 1583 */ 1584 k = 0; 1585 for (i = 0; i < decomps_used; i++) 1586 for (j=0; j<decomps[i].used; j++) { 1587 if (k) fprintf(out, ","); 1588 if (!(k&3)) fprintf(out,"\n\t"); 1589 else fprintf(out, " "); 1590 k++; 1591 fprintf(out, "0x%08lx", (unsigned long) decomps[i].decomp[j]); 1592 } 1593 fprintf(out, "\n};\n\n"); 1594 } 1595 #else 1596 /* 1597 * Open the decomp.dat file. 1598 */ 1599 snprintf(path, sizeof path, "%s" LDAP_DIRSEP "decomp.dat", opath); 1600 if ((out = fopen(path, "wb")) == 0) 1601 return; 1602 1603 hdr[1] = decomps_used; 1604 1605 /* 1606 * Write the header. 1607 */ 1608 fwrite((char *) hdr, sizeof(ac_uint2), 2, out); 1609 1610 /* 1611 * Write a temporary byte count which will be calculated as the 1612 * decompositions are written out. 1613 */ 1614 bytes = 0; 1615 fwrite((char *) &bytes, sizeof(ac_uint4), 1, out); 1616 1617 if (decomps_used) { 1618 /* 1619 * Write the list of decomp nodes. 1620 */ 1621 for (i = idx = 0; i < decomps_used; i++) { 1622 fwrite((char *) &decomps[i].code, sizeof(ac_uint4), 1, out); 1623 fwrite((char *) &idx, sizeof(ac_uint4), 1, out); 1624 idx += decomps[i].used; 1625 } 1626 1627 /* 1628 * Write the sentinel index as the last decomp node. 1629 */ 1630 fwrite((char *) &idx, sizeof(ac_uint4), 1, out); 1631 1632 /* 1633 * Write the decompositions themselves. 1634 */ 1635 for (i = 0; i < decomps_used; i++) 1636 fwrite((char *) decomps[i].decomp, sizeof(ac_uint4), 1637 decomps[i].used, out); 1638 1639 /* 1640 * Seek back to the beginning and write the byte count. 1641 */ 1642 bytes = (sizeof(ac_uint4) * idx) + 1643 (sizeof(ac_uint4) * ((hdr[1] << 1) + 1)); 1644 fseek(out, sizeof(ac_uint2) << 1, 0L); 1645 fwrite((char *) &bytes, sizeof(ac_uint4), 1, out); 1646 1647 fclose(out); 1648 } 1649 #endif 1650 1651 #ifdef HARDCODE_DATA 1652 fprintf(out, PREF "ac_uint4 _uckdcmp_size = %ld;\n\n", 1653 kdecomps_used * 2L); 1654 1655 fprintf(out, PREF "ac_uint4 _uckdcmp_nodes[] = {"); 1656 1657 if (kdecomps_used) { 1658 /* 1659 * Write the list of kdecomp nodes. 1660 */ 1661 for (i = idx = 0; i < kdecomps_used; i++) { 1662 fprintf(out, "\n\t0x%08lx, 0x%08lx,", 1663 (unsigned long) kdecomps[i].code, (unsigned long) idx); 1664 idx += kdecomps[i].used; 1665 } 1666 1667 /* 1668 * Write the sentinel index as the last decomp node. 1669 */ 1670 fprintf(out, "\n\t0x%08lx\n};\n\n", (unsigned long) idx); 1671 1672 fprintf(out, PREF "ac_uint4 _uckdcmp_decomp[] = {"); 1673 1674 /* 1675 * Write the decompositions themselves. 1676 */ 1677 k = 0; 1678 for (i = 0; i < kdecomps_used; i++) 1679 for (j=0; j<kdecomps[i].used; j++) { 1680 if (k) fprintf(out, ","); 1681 if (!(k&3)) fprintf(out,"\n\t"); 1682 else fprintf(out, " "); 1683 k++; 1684 fprintf(out, "0x%08lx", (unsigned long) kdecomps[i].decomp[j]); 1685 } 1686 fprintf(out, "\n};\n\n"); 1687 } 1688 #else 1689 /* 1690 * Open the kdecomp.dat file. 1691 */ 1692 snprintf(path, sizeof path, "%s" LDAP_DIRSEP "kdecomp.dat", opath); 1693 if ((out = fopen(path, "wb")) == 0) 1694 return; 1695 1696 hdr[1] = kdecomps_used; 1697 1698 /* 1699 * Write the header. 1700 */ 1701 fwrite((char *) hdr, sizeof(ac_uint2), 2, out); 1702 1703 /* 1704 * Write a temporary byte count which will be calculated as the 1705 * decompositions are written out. 1706 */ 1707 bytes = 0; 1708 fwrite((char *) &bytes, sizeof(ac_uint4), 1, out); 1709 1710 if (kdecomps_used) { 1711 /* 1712 * Write the list of kdecomp nodes. 1713 */ 1714 for (i = idx = 0; i < kdecomps_used; i++) { 1715 fwrite((char *) &kdecomps[i].code, sizeof(ac_uint4), 1, out); 1716 fwrite((char *) &idx, sizeof(ac_uint4), 1, out); 1717 idx += kdecomps[i].used; 1718 } 1719 1720 /* 1721 * Write the sentinel index as the last decomp node. 1722 */ 1723 fwrite((char *) &idx, sizeof(ac_uint4), 1, out); 1724 1725 /* 1726 * Write the decompositions themselves. 1727 */ 1728 for (i = 0; i < kdecomps_used; i++) 1729 fwrite((char *) kdecomps[i].decomp, sizeof(ac_uint4), 1730 kdecomps[i].used, out); 1731 1732 /* 1733 * Seek back to the beginning and write the byte count. 1734 */ 1735 bytes = (sizeof(ac_uint4) * idx) + 1736 (sizeof(ac_uint4) * ((hdr[1] << 1) + 1)); 1737 fseek(out, sizeof(ac_uint2) << 1, 0L); 1738 fwrite((char *) &bytes, sizeof(ac_uint4), 1, out); 1739 1740 fclose(out); 1741 } 1742 #endif 1743 1744 /***************************************************************** 1745 * 1746 * Generate the combining class data. 1747 * 1748 *****************************************************************/ 1749 #ifdef HARDCODE_DATA 1750 fprintf(out, PREF "ac_uint4 _uccmcl_size = %ld;\n\n", (long) ccl_used); 1751 1752 fprintf(out, PREF "ac_uint4 _uccmcl_nodes[] = {"); 1753 1754 if (ccl_used > 0) { 1755 /* 1756 * Write the combining class ranges out. 1757 */ 1758 for (i = 0; i<ccl_used; i++) { 1759 if (i) fprintf(out, ","); 1760 if (!(i&3)) fprintf(out, "\n\t"); 1761 else fprintf(out, " "); 1762 fprintf(out, "0x%08lx", (unsigned long) ccl[i]); 1763 } 1764 } else { 1765 fprintf(out, "\t0"); 1766 } 1767 fprintf(out, "\n};\n\n"); 1768 #else 1769 /* 1770 * Open the cmbcl.dat file. 1771 */ 1772 snprintf(path, sizeof path, "%s" LDAP_DIRSEP "cmbcl.dat", opath); 1773 if ((out = fopen(path, "wb")) == 0) 1774 return; 1775 1776 /* 1777 * Set the number of ranges used. Each range has a combining class which 1778 * means each entry is a 3-tuple. 1779 */ 1780 hdr[1] = ccl_used / 3; 1781 1782 /* 1783 * Write the header. 1784 */ 1785 fwrite((char *) hdr, sizeof(ac_uint2), 2, out); 1786 1787 /* 1788 * Write out the byte count to maintain header size. 1789 */ 1790 bytes = ccl_used * sizeof(ac_uint4); 1791 fwrite((char *) &bytes, sizeof(ac_uint4), 1, out); 1792 1793 if (ccl_used > 0) 1794 /* 1795 * Write the combining class ranges out. 1796 */ 1797 fwrite((char *) ccl, sizeof(ac_uint4), ccl_used, out); 1798 1799 fclose(out); 1800 #endif 1801 1802 /***************************************************************** 1803 * 1804 * Generate the number data. 1805 * 1806 *****************************************************************/ 1807 1808 #if HARDCODE_DATA 1809 fprintf(out, PREF "ac_uint4 _ucnum_size = %lu;\n\n", 1810 (unsigned long)ncodes_used<<1); 1811 1812 fprintf(out, PREF "ac_uint4 _ucnum_nodes[] = {"); 1813 1814 /* 1815 * Now, if number mappings exist, write them out. 1816 */ 1817 if (ncodes_used > 0) { 1818 for (i = 0; i<ncodes_used; i++) { 1819 if (i) fprintf(out, ","); 1820 if (!(i&1)) fprintf(out, "\n\t"); 1821 else fprintf(out, " "); 1822 fprintf(out, "0x%08lx, 0x%08lx", 1823 (unsigned long) ncodes[i].code, (unsigned long) ncodes[i].idx); 1824 } 1825 fprintf(out, "\n};\n\n"); 1826 1827 fprintf(out, PREF "short _ucnum_vals[] = {"); 1828 for (i = 0; i<nums_used; i++) { 1829 if (i) fprintf(out, ","); 1830 if (!(i&3)) fprintf(out, "\n\t"); 1831 else fprintf(out, " "); 1832 if (nums[i].numerator < 0) { 1833 fprintf(out, "%6d, 0x%04x", 1834 nums[i].numerator, nums[i].denominator); 1835 } else { 1836 fprintf(out, "0x%04x, 0x%04x", 1837 nums[i].numerator, nums[i].denominator); 1838 } 1839 } 1840 fprintf(out, "\n};\n\n"); 1841 } 1842 #else 1843 /* 1844 * Open the num.dat file. 1845 */ 1846 snprintf(path, sizeof path, "%s" LDAP_DIRSEP "num.dat", opath); 1847 if ((out = fopen(path, "wb")) == 0) 1848 return; 1849 1850 /* 1851 * The count part of the header will be the total number of codes that 1852 * have numbers. 1853 */ 1854 hdr[1] = (ac_uint2) (ncodes_used << 1); 1855 bytes = (ncodes_used * sizeof(_codeidx_t)) + (nums_used * sizeof(_num_t)); 1856 1857 /* 1858 * Write the header. 1859 */ 1860 fwrite((char *) hdr, sizeof(ac_uint2), 2, out); 1861 1862 /* 1863 * Write out the byte count to maintain header size. 1864 */ 1865 fwrite((char *) &bytes, sizeof(ac_uint4), 1, out); 1866 1867 /* 1868 * Now, if number mappings exist, write them out. 1869 */ 1870 if (ncodes_used > 0) { 1871 fwrite((char *) ncodes, sizeof(_codeidx_t), ncodes_used, out); 1872 fwrite((char *) nums, sizeof(_num_t), nums_used, out); 1873 } 1874 #endif 1875 1876 fclose(out); 1877 } 1878 1879 static void 1880 usage(char *prog) 1881 { 1882 fprintf(stderr, 1883 "Usage: %s [-o output-directory|-x composition-exclusions]", prog); 1884 fprintf(stderr, " datafile1 datafile2 ...\n\n"); 1885 fprintf(stderr, 1886 "-o output-directory\n\t\tWrite the output files to a different"); 1887 fprintf(stderr, " directory (default: .).\n"); 1888 fprintf(stderr, 1889 "-x composition-exclusion\n\t\tFile of composition codes"); 1890 fprintf(stderr, " that should be excluded.\n"); 1891 exit(1); 1892 } 1893 1894 int 1895 main(int argc, char *argv[]) 1896 { 1897 FILE *in; 1898 char *prog, *opath; 1899 1900 prog = lutil_progname( "ucgendat", argc, argv ); 1901 1902 opath = 0; 1903 in = stdin; 1904 1905 argc--; 1906 argv++; 1907 1908 while (argc > 0) { 1909 if (argv[0][0] == '-') { 1910 switch (argv[0][1]) { 1911 case 'o': 1912 argc--; 1913 argv++; 1914 opath = argv[0]; 1915 break; 1916 case 'x': 1917 argc--; 1918 argv++; 1919 if ((in = fopen(argv[0], "r")) == 0) 1920 fprintf(stderr, 1921 "%s: unable to open composition exclusion file %s\n", 1922 prog, argv[0]); 1923 else { 1924 read_compexdata(in); 1925 fclose(in); 1926 in = 0; 1927 } 1928 break; 1929 default: 1930 usage(prog); 1931 } 1932 } else { 1933 if (in != stdin && in != NULL) 1934 fclose(in); 1935 if ((in = fopen(argv[0], "r")) == 0) 1936 fprintf(stderr, "%s: unable to open ctype file %s\n", 1937 prog, argv[0]); 1938 else { 1939 read_cdata(in); 1940 fclose(in); 1941 in = 0; 1942 } 1943 } 1944 argc--; 1945 argv++; 1946 } 1947 1948 if (opath == 0) 1949 opath = "."; 1950 write_cdata(opath); 1951 1952 return 0; 1953 } 1954