#!/usr/bin/env slsh
% Builds lookup tables from the Unicode character database.
% Each record of the unicode database contains 15 fields.

% Print a usage message on stderr and terminate with exit status 1.
private define usage ()
{
   () = fprintf (stderr, "Usage: %s UnicodeData.txt EastAsianWidth.txt\n", __argv[0]);
   exit (1);
}

% Exactly two file arguments are required.
if (__argc != 3)
  usage ();

private variable Unicode_Data_File = __argv[1];
private variable East_Asian_File = __argv[2];

% Create the character definition table: a structure with one array of
% NUM elements per database field, indexed by code point.
%
%   num     : number of code points the table must hold
%   returns : the structure; code_point holds 0..num-1, six of the string
%             fields are filled with "", and the remaining arrays are
%             left with their default contents.
private define make_char_def_table (num)
{
   variable tbl = struct
     {
        code_point,
        char_name,
        general_cat,
        combining_class,
        bidirectional_cat,
        char_decomp_map,
        decimal_digit_val,
        digit_val,
        numeric_val,
        is_mirrored,
        unicode1_name,
        iso10646_comment,
        uppercase_mapping,
        lowercase_mapping,
        titlecase_mapping,
        east_asian_prop,
     };
   variable name;

   % The code point of each slot is simply its index.
   tbl.code_point = Int32_Type[num];
   tbl.code_point[*] = [0:num-1];

   % String fields that start out as empty strings.
   foreach name (["char_name", "general_cat", "combining_class",
                  "bidirectional_cat", "char_decomp_map",
                  "east_asian_prop"])
     {
        variable blanks = String_Type[num];
        blanks[*] = "";
        set_struct_field (tbl, name, blanks);
     }

   % String fields that are allocated but not pre-filled.
   foreach name (["decimal_digit_val", "digit_val", "numeric_val",
                  "unicode1_name", "iso10646_comment"])
     set_struct_field (tbl, name, String_Type[num]);

   tbl.is_mirrored = Char_Type[num];

   % Case mappings are numeric.
   foreach name (["lowercase_mapping", "uppercase_mapping",
                  "titlecase_mapping"])
     set_struct_field (tbl, name, Int32_Type[num]);

   return tbl;
}

% Propagate the properties of the first code point of each range to the
% code points strictly between the start and the stop of that range.
%
%   starts, stops : parallel arrays holding the first and last code point
%                   of every range
%   s             : character definition table; every field except
%                   code_point is updated in place
private define fixup_ranges (starts, stops, s)
{
   variable last_range = length (starts) - 1;
   variable name, r;

   foreach name (get_struct_field_names (s))
     {
        % code_point already holds the proper value for every slot.
        if (name == "code_point")
          continue;

        variable values = get_struct_field (s, name);
        _for r (0, last_range, 1)
          {
             variable first = starts[r];
             variable last = stops[r];

             values[[first+1:last-1]] = values[first];
          }
     }
}
% Convert a string of hexadecimal digits (without a 0x prefix) to an
% integer.  Surrounding whitespace is ignored.  The "0x0" prefix makes an
% empty string evaluate to 0.
private define hexstr_to_int (s)
{
   return integer (strcat ("0x0", strtrim (s)));
}

% Read the semicolon separated records of FILE (UnicodeData.txt) into a
% character definition table.
%
%   file    : name of the file to read
%   returns : the table created by make_char_def_table, sized from the
%             code point found on the last record of the file
%
% Records whose general category is "Cs" (surrogates) are skipped.
% <xxx, First> / <xxx, Last> pairs are expanded via fixup_ranges.  Case
% mappings that are absent from the file are set to the code point
% itself.  An error is generated if the file cannot be read, has no
% records, contains a record with fewer than 15 fields, or has unbalanced
% First/Last markers.
private define read_file (file)
{
   variable fp = fopen (file, "r");
   if (fp == NULL)
     verror ("Unable to open %s for reading", file);

   variable lines = fgetslines (fp);
   () = fclose (fp);

   if (typeof (lines) != Array_Type)
     verror ("Unable to read %s", file);

   % Blank lines carry no record.  They must not be used to size the
   % table, nor be indexed as if they had 15 fields.
   if (length (lines))
     {
        variable trimmed = array_map (String_Type, &strtrim, lines);
        lines = lines[where (array_map (Int_Type, &strlen, trimmed))];
     }
   if (length (lines) == 0)
     verror ("%s contains no records", file);

   % Get the code point of the last line since it determines the number of
   % code points
   variable num = 1+hexstr_to_int (strchop (lines[-1], ';', 0)[0]);

   variable s = make_char_def_table (num);
   variable is_range = Char_Type[num];
   variable i, j, line;

   foreach line (lines)
     {
        variable fields = strchop (line, ';', 0);

        if (length (fields) < 15)
          verror ("%s: malformed record: %s", file, strtrim (line));

        if (fields[2] == "Cs")
          continue;                    %  surrogate

        i = hexstr_to_int (fields[0]);

        variable field = fields[1];
        s.char_name[i] = field;
        % A range is specified if the field is of the form <xxx, First>
        % or <xxx, Last>
        if (field[0] == '<')
          {
             if (string_match (field, ", First>$", 1))
               is_range[i] = 1;
             else if (string_match (field, ", Last>$", 1))
               is_range[i] = -1;
          }

        s.general_cat[i] = fields[2];
        s.combining_class[i] = fields[3];
        s.bidirectional_cat[i] = fields[4];
        s.char_decomp_map[i] = fields[5];
        s.decimal_digit_val[i] = fields[6];
        s.digit_val[i] = fields[7];
        s.numeric_val[i] = fields[8];
        s.is_mirrored[i] = (fields[9] == "Y");
        s.unicode1_name[i] = fields[10];
        s.iso10646_comment[i] = fields[11];
        s.uppercase_mapping[i] = hexstr_to_int (fields[12]);
        s.lowercase_mapping[i] = hexstr_to_int (fields[13]);
        s.titlecase_mapping[i] = hexstr_to_int (fields[14]);
     }

   i = where (is_range == 1);
   if (length (i))
     {
        j = where (is_range == -1);
        if (length (i) != length (j))
          verror ("First and Last ranges do not match");

        fixup_ranges (i, j, s);
     }

   % A mapping of 0 means that the file gave none: map to itself.
   i = where (s.lowercase_mapping == 0);
   s.lowercase_mapping[i] = s.code_point[i];
   i = where (s.uppercase_mapping == 0);
   s.uppercase_mapping[i] = s.code_point[i];
   i = where (s.titlecase_mapping == 0);
   s.titlecase_mapping[i] = s.code_point[i];

   return s;
}

% Fill s.east_asian_prop from FILE (EastAsianWidth.txt).
%
%   s    : character definition table, updated in place
%   file : name of the file to read
%
% Empty lines and lines whose first non-blank character is '#' are
% ignored.  Every other line is split on ';' and ' ': the first token is
% either a single hexadecimal code point or a range written START..STOP,
% and the second token is the property assigned to it.
private define read_east_asian_file (s, file)
{
   variable fp = fopen (file, "r");
   if (fp == NULL)
     verror ("Unable to open %s for reading", file);

   variable line;
   foreach line (fp) using ("line")
     {
        % Trim first so that an indented comment is also recognized.
        line = strtrim (line);
        !if (strlen (line))
          continue;
        if (line[0] == '#')
          continue;

        variable fields = strtok (line, "; ");
        variable code = fields[0];
        variable prop = fields[1];

        if (is_substr (code, ".."))
          {
             variable limits = strtok (code, ".");
             variable code_start = hexstr_to_int (limits[0]);
             variable code_stop = hexstr_to_int (limits[1]);

             % One slice assignment covers the whole range.
             s.east_asian_prop[[code_start:code_stop]] = prop;
             continue;
          }

        s.east_asian_prop[hexstr_to_int (code)] = prop;
     }

   () = fclose (fp);
}

% Character class bits.  They are written out as the SLCHARCLASS_xxx
% macros by write_char_classes.
private variable LOWER  = 0x0001;
private variable UPPER  = 0x0002;
private variable ALPHA  = 0x0004;
private variable XDIGIT = 0x0008;
private variable SPACE  = 0x0010;
private variable BLANK  = 0x0020;
private variable CNTRL  = 0x0040;
private variable PRINT  = 0x0080;

private variable DIGIT  = 0x0100;
private variable GRAPH  = 0x0200;
private variable ALNUM  = 0x0400;
private variable PUNCT  = 0x0800;
private variable ASCII  = 0x1000;

% C type and printf format used for the classification table entries.
private variable Classification_C_Table_Type = "_pSLuint16_Type";
private variable Classification_C_Table_Format = "0x%04X";

% Create FILE for writing, write the "automatically created" banner, and
% return the open file pointer.  An error is generated if the file cannot
% be created.
private define init_file (file)
{
   variable fp = fopen (file, "w");
   if (fp == NULL)
     verror ("Unable to open %s for writing", file);

   () = fprintf (fp, "/* This file was automatically created by %s */\n", __argv[0]);

   return fp;
}

% Warn on stderr when some value of WHAT does not fit in the C type named
% by DATATYPE.
%
%   datatype   : name of the C type ("bit" is never checked)
%   s          : character definition table (used to report the code point)
%   what       : array of values, indexed by code point
%   table_name : name used in the warning
%
% An unknown DATATYPE is reported on stderr and otherwise ignored.
private define check_data_type (datatype, s, what, table_name)
{
   variable min_val, max_val;

   switch (datatype)
     {
      case "char":
        min_val = -128; max_val = 127;
     }
     {
      case "unsigned char":
        min_val = 0; max_val = 255;
     }
     {
      case "_pSLint16_Type":
        min_val = -32768; max_val = 32767;
     }
     {
      case "_pSLuint16_Type":
        min_val = 0; max_val = 0xFFFF;
     }
     {
      case "_pSLint32_Type":
        min_val = -2147483648;
        max_val = 0x7FFFFFFF;
     }
     {
      case "_pSLuint32_Type":
        min_val = 0; max_val = 0xFFFFFFFFUL;
     }
     {
      case "bit":
        return;
     }
     {
        () = fprintf (stderr, "check_data_type: %s not supported\n", datatype);
        return;
     }

   variable i = wherenot (min_val <= what <= max_val);
   if (length (i) == 0)
     return;

   % Only the first offending character is reported.
   () = fprintf (stderr, "***WARNING: table for %s needs a larger type for char 0x%04X\n", table_name, s.code_point[i[0]]);
}

% Size in bytes assumed for one element of the C type named DATATYPE when
% estimating the size of a table: 1 for the char types, 2 for the 16 bit
% types, 4 otherwise.
private define estimated_type_size (datatype)
{
   if (is_substr (datatype, "char"))
     return 1;
   % The 16 bit types used by this script are spelled _pSL[u]int16_Type,
   % so looking for "short" alone never matches them.
   if (is_substr (datatype, "short"))
     return 2;
   if (is_substr (datatype, "16"))
     return 2;
   return 4;
}

% Write WHAT as a two level C lookup table together with the macro used
% to access it.
%
%   fp         : an open file pointer, or the name of a file to create
%                via init_file.  The file is closed before returning.
%   s          : character definition table
%   what       : array of values indexed by code point
%   datatype   : C type of the table elements, or "bit" to pack the
%                values 1, 2, 4 or 8 bits wide into unsigned chars
%   table_name : base name, e.g. "Width" yields _pSLwc_Width_Table
%   format     : printf format for a single table element
%   shift_bits : log2 of the number of entries per sub-table
%   greater_than_max_value : value that the lookup macro yields for
%                characters beyond the tabulated range
%
% Identical sub-tables are emitted only once.  The all-zero sub-table is
% always unique table 0, and in bit mode it is represented by a NULL
% pointer instead of being emitted.  A message giving the estimated size
% of the table is displayed.
private define write_toxxx_table (fp, s, what, datatype,
                                  table_name, format, shift_bits,
                                  greater_than_max_value)
{
   variable ch = s.code_point;
   variable use_bitmap = 0;
   variable i, j, k, b;
   variable bits_per_value;
   % Values for the non-bitmap case: one value per element, no extra shift.
   variable num_values_per_8bits = 1;
   variable shift_bits_offset = 0;

   check_data_type (datatype, s, what, table_name);

   if (datatype == "bit")
     {
        variable max_what = max(what);
        bits_per_value = -1;
        % shift_bits_offset ends up as log2 of the number of values
        % packed in a byte: 3, 2, 1, 0 for 1, 2, 4, 8 bits per value.
        shift_bits_offset = 4;
        foreach i ([1,2,4,8])
          {
             shift_bits_offset--;
             if (max_what >= (1 shl i))
               continue;

             bits_per_value = i;
             break;
          }

        if (bits_per_value == -1)
          verror ("bit data type cannot represent this object\n");

        datatype = "unsigned char";
        use_bitmap = 1;
     }

   if (use_bitmap)
     {
        num_values_per_8bits = 8/bits_per_value;
        % From here on ch indexes bytes of the bitmap, not characters.
        ch = ch/num_values_per_8bits;
     }

   % Take advantage of the sparseness of the table.  To this end, write
   % N tables with nentries per table.
   variable nentries = (1 shl shift_bits);
   variable ntables = max(ch)/nentries + 1;

   variable data = Int_Type[ntables * nentries];

   if (use_bitmap)
     {
        % Number of bytes needed, rounded up.
        i = length(what)/num_values_per_8bits;
        if (i * num_values_per_8bits < length(what))
          i++;

        if (greater_than_max_value)
          {
             % Fill out the last byte with the value used for characters
             % beyond the tabulated range.
             vmessage ("Padding table: num_values_per_8bits = %d", num_values_per_8bits);
             variable new_what = @Array_Type(_typeof(what), [i*num_values_per_8bits]);
             new_what[[0:length(what)-1]] = what;
             new_what[[length(what):]] = greater_than_max_value;
             what = new_what;
          }
        variable bitmap = UChar_Type[i];

        % Value number b of each byte occupies bits
        % b*bits_per_value .. (b+1)*bits_per_value-1 of that byte.
        variable bit = 0;
        _for b (0, num_values_per_8bits-1, 1)
          {
             variable values = what[[b::num_values_per_8bits]];
             _for k (0, bits_per_value-1, 1)
               {
                  i = where (values & (1 shl k));
                  bitmap[i] |= (1 shl bit);
                  bit++;
               }
          }
        what = bitmap;
     }

   data[[0:max(ch)]] = what;

   variable unique_tables = Array_Type[ntables];
   variable tables = Int_Type[ntables];

   % Unique table 0 is always the all-zero table.
   variable num_unique = 0;
   unique_tables[0] = [1:nentries]*0;
   num_unique = 1;

   _for i (0, ntables-1, 1)
     {
        variable table = data[nentries*i + [0:nentries-1]];

        j = 0;
        while (j < num_unique)
          {
             if (0 == length (where (unique_tables[j] != table)))
               break;
             j++;
          }

        tables[i] = j;
        if (j == num_unique)
          {
             unique_tables[num_unique] = table;
             num_unique++;
          }
     }

   % How many tables do we really need?  Trailing all-zero tables are
   % dropped.  If every table is zero, keep a single one rather than
   % indexing an empty array.
   i = where (tables != 0);
   if (length (i))
     ntables = 1 + i[-1];
   else
     ntables = 1;

   if (typeof (fp) == String_Type)
     fp = init_file (fp);

   variable bitmap_multiplier = num_values_per_8bits;

   variable table_lookup_name = sprintf ("SL_%s_LOOKUP", strup(table_name));
   variable max_char_name = sprintf ("SL_%s_MAX_CHAR", strup(table_name));
   variable assign_lookup_name = sprintf ("SL_%s_ALOOKUP", strup(table_name));

   table_name = sprintf ("_pSLwc_%s_Table", table_name);

   () = fprintf (fp, "#define %s 0x%Xul\n\n", max_char_name,
                 bitmap_multiplier * ntables * nentries);

   if (use_bitmap == 0)
     {
        () = fprintf (fp, "#define %s(x) \\\n", table_lookup_name);
        () = fprintf (fp, " (((unsigned)(x)>=%s)?%d:(%s[(unsigned)(x)>>%d][(unsigned)(x)&0x%X]))\n\n",
                      max_char_name, greater_than_max_value, table_name, shift_bits, nentries-1);
     }
   else if (num_values_per_8bits == 8) %  boolean (0 or 1)
     {
        ()=fprintf(fp, "#define %s(y,x) \\\n", assign_lookup_name);
        ()=fprintf(fp, "{ \\\n");
        ()=fprintf(fp, " const %s *_t; \\\n", datatype);
        ()=fprintf(fp, " (y) = (((unsigned)(x) < %s) \\\n", max_char_name);
        ()=fprintf(fp, " && (NULL != (_t = %s[(unsigned)(x)>>%d])) \\\n",
                   table_name, shift_bits_offset + shift_bits);
        ()=fprintf(fp, " && (_t[(unsigned)((x)>>%d)&0x%X] & (%d << ((x)&%d)))); \\\n",
                   shift_bits_offset, nentries - 1, int(2^bits_per_value-1), num_values_per_8bits-1);
        ()=fprintf(fp, "}\n");
     }
   else                                %  bit mapped with num_values_per_8bits = 1,2, or 4
     {
        ()=fprintf(fp, "#define %s(y,x) \\\n", assign_lookup_name);
        ()=fprintf(fp, "{ \\\n");
        ()=fprintf(fp, " const %s *_t; \\\n", datatype);
        ()=fprintf(fp, " (y) = (((unsigned)(x) < %s) \\\n", max_char_name);
        ()=fprintf(fp, " && (NULL != (_t = %s[(unsigned)(x)>>%d])) \\\n",
                   table_name, shift_bits_offset + shift_bits);
        ()=fprintf(fp, " ? ((_t[(unsigned)((x)>>%d)&0x%X]>>(%d*((x)&%d)))&%d) : %d); \\\n",
                   shift_bits_offset, nentries - 1, bits_per_value,
                   num_values_per_8bits-1, int(2^bits_per_value-1),
                   greater_than_max_value);
        ()=fprintf(fp, "}\n");
     }

   () = fprintf (fp, "extern const %s *%s[%d];\n\n", datatype, table_name, ntables);

   () = fprintf (fp, "#ifdef DEFINE%s\n", strup (table_name));

   % Eight values are written per output line.
   format = [format, format, format, format, format, format, format, format];
   format = strcat (" /*0x%02X-0x%02X*/ ", strjoin (format, ", "));

   _for i (0, num_unique-1, 1)
     {
        % In bit mode the all-zero table is represented by NULL.
        if ((i == 0) and use_bitmap)
          continue;

        () = fprintf (fp, "static const %s Table_%02d[%d] =\n{\n",
                      datatype, i, nentries);

        table = unique_tables[i];
        _for j (0, nentries-1, 8)
          {
             if (j)
               () = fputs (",\n", fp);
             () = fprintf (fp, format,
                           j, (j+7),
                           table[j], table[j+1], table[j+2], table[j+3],
                           table[j+4], table[j+5], table[j+6], table[j+7]);
          }
        () = fputs ("\n};\n\n", fp);
     }

   () = fprintf (fp, "const %s *%s[%d] =\n{", datatype, table_name, ntables);
   i = 0;
   while (i < ntables)
     {
        if (i) () = fputs (",", fp);

        % Six pointers per output line.
        !if (i mod 6)
          () = fputs ("\n", fp);

        if (use_bitmap and (tables[i] == 0))
          () = fprintf (fp, " NULL");
        else
          () = fprintf (fp, " Table_%02d", tables[i]);

        i++;
     }
   () = fputs ("\n};\n", fp);
   () = fprintf (fp, "#endif /* DEFINE%s */\n", strup(table_name));

   () = fclose (fp);

   variable size = estimated_type_size (datatype);

   % The estimate counts 4 bytes per pointer.  In bit mode the all-zero
   % table is not emitted.
   if (use_bitmap == 0)
     {
        vmessage ("Estimated table size: %d bytes",
                  4*ntables + size*nentries*num_unique);
     }
   else
     {
        vmessage ("Estimated table size: %d bytes",
                  4*ntables + size*nentries*(num_unique-1));
     }
}

% Compute the character class bits of every code point.
%
%   s       : character definition table
%   returns : UShort_Type array, indexed by code point, of the class
%             bits (LOWER, UPPER, ...) or-ed together
%
% The order of the assignments matters: later classes are derived from
% the bits set by earlier ones.
private define make_char_classes (s)
{
   variable i;
   variable code_point = s.code_point;
   variable gcat0 = int (s.general_cat);
   variable char_classes = UShort_Type[length(code_point)];
#iftrue
   % LOWER
   i = where (((code_point == s.lowercase_mapping)
               and (code_point != s.uppercase_mapping)));
   char_classes[i] |= LOWER;

   % UPPER
   i = where (((code_point == s.uppercase_mapping)
               or (code_point == s.titlecase_mapping))
              and (code_point != s.lowercase_mapping));
   char_classes[i] |= UPPER;
#endif
   % LOWER
   i = where ((s.general_cat == "Ll") and (0 == (char_classes & UPPER)));
   char_classes[i] |= LOWER;

   % UPPER
   i = where ((s.general_cat == "Lu") and (0 == (char_classes & LOWER)));
   char_classes[i] |= UPPER;

   % ALPHA
   i = where ((char_classes & (UPPER|LOWER)) or (gcat0 == 'L'));
   char_classes[i] |= ALPHA;

   % XDIGIT
   i = where (((code_point >= '0') and (code_point <= '9'))
              or ((code_point >= 'A') and (code_point <= 'F'))
              or ((code_point >= 'a') and (code_point <= 'f')));
   char_classes[i] |= XDIGIT;

   % SPACE, BLANK
   char_classes[' '] |= SPACE|BLANK;
   char_classes['\t'] |= SPACE|BLANK;
   char_classes['\n'] |= SPACE;
   char_classes['\r'] |= SPACE;
   char_classes['\f'] |= SPACE;
   char_classes['\v'] |= SPACE;
   % char_classes [where (s.bidirectional_cat == "WS")] |= SPACE;
   i = where ((gcat0 == 'Z')
              and not array_map (Int_Type, &is_substr, s.char_decomp_map, "<noBreak>"));
   char_classes [i] |= SPACE;

   % CNTRL
   char_classes[where(s.char_name == "<control>")] |= CNTRL;

   % PRINT
   char_classes[where((s.char_name != "") and not (char_classes & CNTRL))]
     |= PRINT;

   % DIGIT
   i = where ((char_classes & XDIGIT) and not (char_classes & ALPHA));
   char_classes[i] |= DIGIT;

   % GRAPH
   char_classes[where ((char_classes & PRINT) and not (char_classes & SPACE))]
     |= GRAPH;

   % ALNUM
   char_classes[where (char_classes & (ALPHA|DIGIT))] |= ALNUM;

   % PUNCT
   char_classes[where ((char_classes & GRAPH) and not (char_classes & ALNUM))]
     |= PUNCT;

   % ASCII
   char_classes[where (code_point < 0x80)] |= ASCII;
   return char_classes;
}

% Create FILE, write the SLCHARCLASS_xxx macro definitions to it, and
% append the classification lookup table for CHAR_CLASSES.
%
%   file         : name of the file to create
%   s            : character definition table
%   char_classes : array returned by make_char_classes
private define write_char_classes (file, s, char_classes)
{
   variable fp = init_file (file);
   () = fprintf (fp, "#define SLCHARCLASS_LOWER\t0x%04X\n", LOWER);
   () = fprintf (fp, "#define SLCHARCLASS_UPPER\t0x%04X\n", UPPER);
   () = fprintf (fp, "#define SLCHARCLASS_ALPHA\t0x%04X\n", ALPHA);
   () = fprintf (fp, "#define SLCHARCLASS_XDIGIT\t0x%04X\n", XDIGIT);
   () = fprintf (fp, "#define SLCHARCLASS_SPACE\t0x%04X\n", SPACE);
   () = fprintf (fp, "#define SLCHARCLASS_BLANK\t0x%04X\n", BLANK);
   () = fprintf (fp, "#define SLCHARCLASS_CNTRL\t0x%04X\n", CNTRL);
   () = fprintf (fp, "#define SLCHARCLASS_PRINT\t0x%04X\n", PRINT);
   () = fprintf (fp, "#define SLCHARCLASS_DIGIT\t0x%04X\n", DIGIT);
   () = fprintf (fp, "#define SLCHARCLASS_GRAPH\t0x%04X\n", GRAPH);
   () = fprintf (fp, "#define SLCHARCLASS_ALNUM\t0x%04X\n", ALNUM);
   () = fprintf (fp, "#define SLCHARCLASS_PUNCT\t0x%04X\n", PUNCT);
   () = fprintf (fp, "#define SLCHARCLASS_ASCII\t0x%04X\n", ASCII);
   () = fprintf (fp, "\n\n");
   % write_toxxx_table closes fp.
   write_toxxx_table (fp, s, char_classes, Classification_C_Table_Type,
                      "Classification", Classification_C_Table_Format, 8, 0);
}

% Read the two database files given on the command line and write
% slwcwidth.h, slcombin.h, sllower.h, slupper.h and slischar.h.
private define main ()
{
   variable s = read_file (Unicode_Data_File);
   read_east_asian_file (s, East_Asian_File);

   variable char_classes = make_char_classes (s);
   variable ch = s.code_point;
   variable is_combining = ((s.general_cat == "Mn") or (s.general_cat == "Me"));
   % Note: "Mc" (combining, yet spacing) is omitted here since I do
   % not know what that means.

   % Apparently Hangul (Conjoining Jamo) characters 0x1160 - 0x11FF
   % _behave_ like combining characters, but are not flagged as such in
   % the database.
   is_combining[[0x1160:0x11FF]] = 1;
#ifnfalse
   variable width = UChar_Type[length(ch)];
   width[*] = 1;
   width[where (s.east_asian_prop == "W")] = 2;
   width[where (s.east_asian_prop == "F")] = 2;
   width[where (s.east_asian_prop == "A")] = 3;   %  ambiguous

   width[where (s.general_cat == "Cf")] = 0;
   width[0xAD] = 3;                    %  SOFT-HYPHEN -- mark it as ambiguous

   width[where(is_combining)] = 0;
   %width[where (s.bidirectional_cat == "NSM")] = 0;
   width[where (0 == array_map (Int_Type, &strncmp, s.char_name,
                                "ZERO WIDTH", 10))]
     = 0;

   width[[0x80:0x9F]] = 4;             %  displayed as <xx> by SLsmg

   write_toxxx_table ("slwcwidth.h", s, width, "bit",
                      "Width", "0x%02X", 8, 1);
#endif
   write_toxxx_table ("slcombin.h", s,
                      is_combining,
                      "bit", "Combining", "0x%02X", 6, 0);
   % The case tables store the offset to the mapped character.
   write_toxxx_table ("sllower.h", s, s.lowercase_mapping-ch, "_pSLint32_Type",
                      "Tolower", "% 5d", 7, 0);
   write_toxxx_table ("slupper.h", s, s.uppercase_mapping-ch, "_pSLint32_Type",
                      "Toupper", "% 5d", 7, 0);
   write_char_classes ("slischar.h", s, char_classes);
}

main();