1 /* tag: Tom Lord Tue Dec 4 14:41:49 2001 (unidata-generate.c)
2 */
3 /* unidata-generate.c -
4 *
5 ****************************************************************
6 * Copyright (C) 2000 Tom Lord
7 *
8 * See the file "COPYING" for further information about
9 * the copyright and warranty status of this work.
10 */
11
12
13 #include "hackerlab/arrays/pow2-array-compact.h"
14 #include "hackerlab/arrays/pow2-array-print.h"
15 #include "hackerlab/bitsets/bits.h"
16 #include "hackerlab/bitsets/bits-print.h"
17 #include "hackerlab/bitsets/uni-bits.h"
18 #include "hackerlab/rx-posix/regexps.h"
19 #include "hackerlab/uni/unidata.h"
20 #include "hackerlab/unidata/db-macros.h"
21 #include "hackerlab/unidata/case-db-macros.h"
22 #include "hackerlab/cmd/main.h"
23
24
25
26 static t_uchar * program_name = "unidata-generate";
27 static t_uchar * usage = "[options] input-file";
28 static t_uchar * version_string = "1.0";
29
/* Table of command line options, written as an X-macro.
 *
 * It is expanded twice below: once with OPT_ENUM/OPT_IGN to declare
 * `enum options', and once with OPT_DESC/OPT_DESC to fill `opts'.
 * OP introduces an option; OP2 adds a further line of help text for
 * the option named by its first argument.
 *
 * NOTE(review): the argument order looks like (enum name, short flag,
 * long flag, takes-argument, help text) -- confirm against
 * hackerlab/cmd.
 */
#define OPTS(OP, OP2) \
  OP (opt_help_msg, "h", "help", 0, \
      "Display a help message and exit.") \
  OP (opt_long_help, "H", 0, 0, \
      "Display a verbose help message and exit.") \
  OP (opt_version, "V", "version", 0, \
      "Display a release identifier string") \
  OP2 (opt_version, 0, 0, 0, "and exit.") \
  OP (opt_verbose, "v", "verbose", 0, \
      "Display information about the unidata database on stderr.")
40
41 static t_uchar long_help[] = ("Generate C source code from teh unidata database.\n"
42 "This program is used during the build process of \n"
43 "the hackerlab C library.\n");
44
45
46 enum options
47 {
48 OPTS (OPT_ENUM, OPT_IGN)
49 };
50
51 struct opt_desc opts[] =
52 {
53 OPTS (OPT_DESC, OPT_DESC)
54 {-1, 0, 0, 0, 0}
55 };
56
57
58
59 struct unidata
60 {
61 t_unicode code_value;
62 t_uchar * character_name;
63 enum uni_general_category general_category;
64 t_uint canonical_combining_class;
65 enum uni_bidi_category bidi_category;
66 struct uni_decomposition_mapping character_decomposition_mapping;
67 int decimal_digit_value;
68 int digit_value;
69 struct uni_numeric_value numeric_value;
70 int mirrored;
71 t_uchar * unicode_1_name;
72 t_uchar * comment_10646;
73 t_unicode uppercase_mapping;
74 t_unicode lowercase_mapping;
75 t_unicode titlecase_mapping;
76 };
77
78
79
/* The fields of a unidata line, in order, each paired with the regexp
 * its text must match.
 *
 * This is an X-macro: users define UNIDATA_FIELD (for every field but
 * the last) and UNIDATA_FIELDX (for the last field) before expanding
 * it.
 */
#define UNIDATA_FIELDS \
  UNIDATA_FIELD (unidata_code_value, "[0-9a-fA-F]\\+") \
  UNIDATA_FIELD (unidata_character_name, "[^;]\\+") \
  UNIDATA_FIELD (unidata_general_category, "[A-Z][a-z]") \
  UNIDATA_FIELD (unidata_canonical_combining_class, "[0-9]\\+") \
  UNIDATA_FIELD (unidata_bidi_category, "[A-Z]\\{1,3\\}") \
  UNIDATA_FIELD (unidata_character_decomposition_mapping, "[^;]*") \
  UNIDATA_FIELD (unidata_decimal_digit_value, "[0-9]*") \
  UNIDATA_FIELD (unidata_digit_value, "[0-9]*") \
  UNIDATA_FIELD (unidata_numeric_value, "[0-9/]*") \
  UNIDATA_FIELD (unidata_mirrored, "[YN]") \
  UNIDATA_FIELD (unidata_unicode_1_name, "[^;]*") \
  UNIDATA_FIELD (unidata_comment_10646, "[^;]*") \
  UNIDATA_FIELD (unidata_uppercase_mapping, "[0-9a-fA-F]*") \
  UNIDATA_FIELD (unidata_lowercase_mapping, "[0-9a-fA-F]*") \
  UNIDATA_FIELDX (unidata_titlecase_mapping, "[0-9a-fA-F]*")
96
97 /* positions within pmatch data of fields:
98 */
99 enum unidata_field_positions
100 {
101 #undef UNIDATA_FIELD
102 #define UNIDATA_FIELD(A,B) A,
103 #undef UNIDATA_FIELDX
104 #define UNIDATA_FIELDX(A,B) A,
105
106 unidata_entire_line = 0,
107 UNIDATA_FIELDS
108
109 n_unidata_fields
110 };
111
112 /* regexp for unidata fields
113 */
114 static char unidata_regexp_source[] =
115 #undef UNIDATA_FIELD
116 #define UNIDATA_FIELD(A,B) "\\(" B "\\)" ";"
117 #undef UNIDATA_FIELDX
118 #define UNIDATA_FIELDX(A,B) "\\(" B "\\)"
119 "^" UNIDATA_FIELDS "$";
120
121 static regex_t *
unidata_regexp(void)122 unidata_regexp (void)
123 {
124 static int done = 0;
125 static regex_t answer;
126
127 if (done)
128 return &answer;
129
130 if (regcomp (&answer, unidata_regexp_source, REG_NEWLINE))
131 panic ("internal regcomp error for unidata_regexp_source");
132
133 done = 1;
134
135 return &answer;
136 }
137
138
139
140
141 /* Parsed format of a unidata line.
142 */
143
144
145 #undef UNI_DECOMPOSITION_TYPE
146 #define UNI_DECOMPOSITION_TYPE(NAME) \
147 "<" #NAME ">" "[[:cut %:]]\\|"
148
149 static char uni_decomposition_type_regexp_source[] =
150 "^" "[[:(" UNI_DECOMPOSITION_TYPES "[[:cut 2:]]" "):]]";
151
152 static regex_t *
uni_decomposition_type_regexp(void)153 uni_decomposition_type_regexp (void)
154 {
155 static int done = 0;
156 static regex_t answer;
157
158 if (done)
159 return &answer;
160
161 if (regcomp (&answer, uni_decomposition_type_regexp_source, 0))
162 panic ("internal regcomp error for uni_decomposition_type_regexp_source");
163
164 done = 1;
165
166 return &answer;
167 }
168
169
170
171
/* Regexp recognizing the two lines that bracket a range of characters:
 * the second field ends in " First>" (cut 1) or " Last>" (cut 2).
 * unidata_parse stores the final_tag of the match in *in_range.
 */
static char uni_range_first_regexp_source[] =
  "^[^;]*;[^;]*"
  "[[:("
    "[[:( First>;[[:cut 1:]]):]]"
    "\\|"
    "[[:( Last>;[[:cut 2:]]):]]"
  "):]]";
173
174 static regex_t *
uni_range_first_regexp(void)175 uni_range_first_regexp (void)
176 {
177 static int done = 0;
178 static regex_t answer;
179
180 if (done)
181 return &answer;
182
183 if (regcomp (&answer, uni_range_first_regexp_source, 0))
184 panic ("internal regcomp error for uni_range_first_regexp");
185
186 done = 1;
187
188 return &answer;
189 }
190
191
192
193
194 void
unidata_parse(struct unidata * unidata,int * in_range,int line_no,t_uchar * line,long len)195 unidata_parse (struct unidata * unidata, int * in_range, int line_no, t_uchar * line, long len)
196 {
197 int errn;
198 int match;
199 regmatch_t pmatch[n_unidata_fields];
200 regmatch_t * pmatch_p = pmatch;
201 t_uint n;
202 t_uchar * syntax_error;
203
204 if (len && (line[len - 1] == '\n'))
205 --len;
206 if (len && (line[len - 1] == '\r'))
207 --len;
208
209 match = regnexec (unidata_regexp (), (char *)line, len, n_unidata_fields, &pmatch_p, 0);
210
211 if (match)
212 {
213 syntax_error = "parsing entire line into fields";
214 syntax_exit:
215 safe_printfmt (2, "unicode database:%d: syntax error (%s)\n", line_no, syntax_error);
216 safe_printfmt (2, "\t%.*s\n", (int)len, line);
217 panic ("unrecoverable error parsing unicode database");
218 }
219
220
221 {
222 regmatch_t range_pmatch[1];
223 regmatch_t * range_pmatch_p = range_pmatch;
224
225 match = regnexec (uni_range_first_regexp (), line, len, 1, &range_pmatch_p, 0);
226 switch (match)
227 {
228 case 0:
229 *in_range = range_pmatch[0].final_tag;
230 break;
231 case REG_NOMATCH:
232 *in_range = 0;
233 break;
234 default:
235 safe_printfmt (2, "unicode database:%d:\n", line_no);
236 panic ("internal regexp error");
237 break;
238 }
239 }
240
241
242 if (cvt_hex_to_uint (&errn,
243 &n,
244 line + pmatch[unidata_code_value].rm_so,
245 pmatch[unidata_code_value].rm_eo - pmatch[unidata_code_value].rm_so))
246 {
247 syntax_error = "parsing code value";
248 goto syntax_exit;
249 }
250 else
251 unidata->code_value = (t_unicode)n;
252
253 unidata->character_name = str_save_n (lim_use_must_malloc,
254 line + pmatch[unidata_character_name].rm_so,
255 pmatch[unidata_character_name].rm_eo - pmatch[unidata_character_name].rm_so);
256
257 unidata->general_category = uni_general_category_lookup_n (line + pmatch[unidata_general_category].rm_so,
258 pmatch[unidata_general_category].rm_eo - pmatch[unidata_general_category].rm_so);
259
260 if (cvt_decimal_to_uint (&errn, &unidata->canonical_combining_class,
261 line + pmatch[unidata_canonical_combining_class].rm_so,
262 pmatch[unidata_canonical_combining_class].rm_eo - pmatch[unidata_canonical_combining_class].rm_so))
263 {
264 syntax_error = "parsing canonical combining class";
265 goto syntax_exit;
266 }
267
268 unidata->bidi_category = uni_bidi_category_lookup_n (line + pmatch[unidata_bidi_category].rm_so,
269 pmatch[unidata_bidi_category].rm_eo - pmatch[unidata_bidi_category].rm_so);
270
271 if (pmatch[unidata_character_decomposition_mapping].rm_eo == pmatch[unidata_character_decomposition_mapping].rm_so)
272 {
273 unidata->character_decomposition_mapping.type = uni_decomposition_none;
274 unidata->character_decomposition_mapping.decomposition = 0;
275 }
276 else
277 {
278 regmatch_t decomp_pmatch[1];
279 regmatch_t * decomp_pmatch_p = decomp_pmatch;
280 t_uchar * str;
281 size_t len;
282
283 match = regnexec (uni_decomposition_type_regexp (),
284 line + pmatch[unidata_character_decomposition_mapping].rm_so,
285 pmatch[unidata_character_decomposition_mapping].rm_eo - pmatch[unidata_character_decomposition_mapping].rm_so,
286 1,
287 &decomp_pmatch_p,
288 0);
289 if (match)
290 {
291 syntax_error = "parsing character decomposition mapping type";
292 goto syntax_exit;
293 }
294
295 unidata->character_decomposition_mapping.type = decomp_pmatch[0].final_tag - 1;
296 unidata->character_decomposition_mapping.decomposition = 0;
297
298 str = line + pmatch[unidata_character_decomposition_mapping].rm_so + decomp_pmatch[0].rm_eo;
299 len = (pmatch[unidata_character_decomposition_mapping].rm_eo - pmatch[unidata_character_decomposition_mapping].rm_so) - decomp_pmatch[0].rm_eo;
300
301 while (1)
302 {
303 t_uint d;
304 t_uchar * d_start;
305 size_t d_len;
306
307 while (len && char_is_space (*str))
308 {
309 ++str;
310 --len;
311 }
312
313 if (!len)
314 break;
315
316 d_start = str;
317 d_len = 0;
318 while (len && char_is_xdigit (*str))
319 {
320 ++d_len;
321 ++str;
322 --len;
323 }
324
325 if (cvt_hex_to_uint (&errn, &d, d_start, d_len))
326 {
327 syntax_error = "parsing decomposition value";
328 goto syntax_exit;
329 }
330
331 *(t_unicode *)ar_push ((void **)&unidata->character_decomposition_mapping.decomposition, lim_use_must_malloc, sizeof (t_unicode)) = (t_unicode)d;
332 }
333 }
334
335
336 if (pmatch[unidata_decimal_digit_value].rm_so == pmatch[unidata_decimal_digit_value].rm_eo)
337 unidata->decimal_digit_value = 10;
338 else if (cvt_decimal_to_int (&errn, &unidata->decimal_digit_value,
339 line + pmatch[unidata_decimal_digit_value].rm_so,
340 pmatch[unidata_decimal_digit_value].rm_eo - pmatch[unidata_decimal_digit_value].rm_so))
341 {
342 syntax_error = "parsing decimal digit value";
343 goto syntax_exit;
344 }
345
346 if (pmatch[unidata_digit_value].rm_so == pmatch[unidata_digit_value].rm_eo)
347 unidata->digit_value = -1;
348 else if (cvt_decimal_to_int (&errn, &unidata->digit_value,
349 line + pmatch[unidata_digit_value].rm_so,
350 pmatch[unidata_digit_value].rm_eo - pmatch[unidata_digit_value].rm_so))
351 {
352 syntax_error = "parsing digit value";
353 goto syntax_exit;
354 }
355
356
357 if (pmatch[unidata_numeric_value].rm_so == pmatch[unidata_numeric_value].rm_eo)
358 unidata->numeric_value.numerator = -1;
359 else
360 {
361 t_uchar * slash;
362 t_uchar * str;
363 size_t len;
364
365 str = line + pmatch[unidata_numeric_value].rm_so;
366 len = pmatch[unidata_numeric_value].rm_eo - pmatch[unidata_numeric_value].rm_so;
367 slash = str_chr_index_n (str, len, '/');
368 if (!slash)
369 {
370 unidata->numeric_value.denominator = 1;
371 if (cvt_decimal_to_int (&errn, &unidata->numeric_value.numerator, str, len))
372 {
373 syntax_error = "parsing numeric value";
374 goto syntax_exit;
375 }
376 }
377 else
378 {
379 if (cvt_decimal_to_uint (&errn, &unidata->numeric_value.numerator, str, slash - str))
380 {
381 syntax_error = "parsing numerator of numeric value";
382 goto syntax_exit;
383 }
384 if (cvt_decimal_to_int (&errn, &unidata->numeric_value.denominator, slash + 1, len - (slash - str) - 1))
385 {
386 syntax_error = "parsing denominator of numeric value";
387 goto syntax_exit;
388 }
389 }
390 }
391
392
393 switch (line[pmatch[unidata_mirrored].rm_so])
394 {
395 case 'Y':
396 unidata->mirrored = 1;
397 break;
398 case 'N':
399 unidata->mirrored = 0;
400 break;
401 default:
402 syntax_error = "parsing mirrored";
403 goto syntax_exit;
404 }
405
406
407 if (pmatch[unidata_unicode_1_name].rm_so == pmatch[unidata_unicode_1_name].rm_eo)
408 unidata->unicode_1_name = 0;
409 else
410 unidata->unicode_1_name = str_save_n (lim_use_must_malloc,
411 line + pmatch[unidata_unicode_1_name].rm_so,
412 pmatch[unidata_unicode_1_name].rm_eo - pmatch[unidata_unicode_1_name].rm_so);
413
414 if (pmatch[unidata_comment_10646].rm_so == pmatch[unidata_comment_10646].rm_eo)
415 unidata->comment_10646 = 0;
416 else
417 unidata->comment_10646 = str_save_n (lim_use_must_malloc,
418 line + pmatch[unidata_comment_10646].rm_so,
419 pmatch[unidata_comment_10646].rm_eo - pmatch[unidata_comment_10646].rm_so);
420
421 if (pmatch[unidata_uppercase_mapping].rm_so == pmatch[unidata_uppercase_mapping].rm_eo)
422 unidata->uppercase_mapping = 0;
423 else if (cvt_hex_to_uint (&errn, &n,
424 line + pmatch[unidata_uppercase_mapping].rm_so,
425 pmatch[unidata_uppercase_mapping].rm_eo - pmatch[unidata_uppercase_mapping].rm_so))
426 {
427 syntax_error = "parsing uppercase mapping";
428 goto syntax_exit;
429 }
430 else
431 unidata->uppercase_mapping = n;
432
433 if (pmatch[unidata_lowercase_mapping].rm_so == pmatch[unidata_lowercase_mapping].rm_eo)
434 unidata->lowercase_mapping = 0;
435 else if (cvt_hex_to_uint (&errn, &n,
436 line + pmatch[unidata_lowercase_mapping].rm_so,
437 pmatch[unidata_lowercase_mapping].rm_eo - pmatch[unidata_lowercase_mapping].rm_so))
438 {
439 syntax_error = "parsing lowercase mapping";
440 goto syntax_exit;
441 }
442 else
443 unidata->lowercase_mapping = n;
444
445 if (pmatch[unidata_titlecase_mapping].rm_so == pmatch[unidata_titlecase_mapping].rm_eo)
446 unidata->titlecase_mapping = 0;
447 else if (cvt_hex_to_uint (&errn, &n,
448 line + pmatch[unidata_titlecase_mapping].rm_so,
449 pmatch[unidata_titlecase_mapping].rm_eo - pmatch[unidata_titlecase_mapping].rm_so))
450 {
451 syntax_error = "parsing titlecase mapping";
452 goto syntax_exit;
453 }
454 else
455 unidata->titlecase_mapping = n;
456 }
457
458
459
460 void
unidata_free(struct unidata * ud)461 unidata_free (struct unidata * ud)
462 {
463 lim_free (lim_use_must_malloc, ud->character_name);
464 ar_free ((void **)&ud->character_decomposition_mapping.decomposition, lim_use_must_malloc);
465 lim_free (lim_use_must_malloc, ud->unicode_1_name);
466 lim_free (lim_use_must_malloc, ud->comment_10646);
467 }
468
469
470
471 int
unidata_next(struct unidata * data,int * in_range,int * line_no,int fd)472 unidata_next (struct unidata * data, int * in_range, int * line_no, int fd)
473 {
474 int errn;
475 t_uchar * line;
476 long len;
477
478 ++*line_no;
479
480 if (0 > vfdbuf_next_line (&errn, &line, &len, fd))
481 {
482 safe_printfmt (2, "unicode database (%d): %s\n", errn, errno_to_string (errn));
483 panic ("unrecoverable error parsing unicode database\n");
484 }
485
486 if (!line)
487 return 0;
488
489 unidata_parse (data, in_range, *line_no, line, (size_t)len);
490 return 1;
491 }
492
493
494 static void
print_t_uint16(int fd,void * elt)495 print_t_uint16 (int fd, void * elt)
496 {
497 safe_printfmt (fd, "%d", (int)(*(t_uint16 *)elt));
498 }
499
500 static void
print_t_case(int fd,void * elt)501 print_t_case (int fd, void * elt)
502 {
503 struct uni_case_mapping * mapping;
504 mapping = (struct uni_case_mapping *)elt;
505 safe_printfmt (fd, "{ 0x%04lX, 0x%04lX, 0x%04lX }", (unsigned long)mapping->upper, (unsigned long)mapping->lower, (unsigned long)mapping->title);
506 /* safe_printfmt (fd, "{ 0x%l04X, 0x%l04X }", (unsigned long)mapping->upper, (unsigned long)mapping->lower); */
507 /* safe_printfmt (fd, "{ 0x%l04X }", (unsigned long)mapping->upper); */
508 }
509
510 static void
print_t_uint8(int fd,void * elt)511 print_t_uint8 (int fd, void * elt)
512 {
513 safe_printfmt (fd, "%d", (int)(*(t_uint8 *)elt));
514 }
515
516 static void
print_t_int16(int fd,void * elt)517 print_t_int16 (int fd, void * elt)
518 {
519 safe_printfmt (fd, "%d", (int)(*(t_int16 *)elt));
520 }
521
522
523 int
main(int argc,char * argv[])524 main (int argc, char * argv[])
525 {
526 int errn;
527 t_uchar * input_file;
528 t_uchar * bits_file;
529 t_uchar * bits_h_file;
530 t_uchar * db_file;
531 t_uchar * db_h_file;
532 t_uchar * case_db_file;
533 t_uchar * case_db_h_file;
534 t_uchar * combine_db_file;
535 t_uchar * combine_db_h_file;
536 t_uchar * decomp_db_file;
537 t_uchar * decomp_db_h_file;
538 int input_fd;
539 int bits_fd;
540 int bits_h_fd;
541 int db_fd;
542 int db_h_fd;
543 int case_db_fd;
544 int case_db_h_fd;
545 int combine_db_fd;
546 int combine_db_h_fd;
547 int decomp_db_fd;
548 int decomp_db_h_fd;
549 bits * sets;
550 bits all_chars;
551 int x;
552 int line_no;
553 int has_decomp;
554 int max_decomp;
555 int total_decomp;
556 t_unicode worst_decomp = 0;
557 int non0_combine;
558 int uppers;
559 int lowers;
560 int titles;
561 int uppers_and_lowers;
562 int uppers_and_title;
563 int two_case;
564 int three_case;
565 int have_case[256];
566 int have_case2[512];
567 int numerics;
568 int non_dec_digits;
569 pow2_array_rules db_rules;
570 pow2_array db_array;
571 pow2_array_rules case_rules;
572 pow2_array case_db_array;
573 pow2_array_rules combine_rules;
574 pow2_array combine_db_array;
575 pow2_array_rules decomp_rules;
576 pow2_array decomp_db_array;
577 union { struct uni_decomposition_mapping * dmp; void *void_ptr; } decompositions;
578 int verbose;
579 int o;
580 struct opt_parsed * option;
581
582 verbose = 0;
583 option = 0;
584
585 while (1)
586 {
587 o = opt_standard (lim_use_must_malloc, &option, opts, &argc, argv, program_name, usage, version_string, long_help, opt_help_msg, opt_long_help, opt_version);
588 if (o == opt_none)
589 break;
590 switch (o)
591 {
592 default:
593 safe_printfmt (2, "unhandled option `%s'\n", option->opt_string);
594 panic ("internal error parsing arguments");
595
596 usage_error:
597 opt_usage (2, argv[0], program_name, usage, 1);
598 panic_exit ();
599
600 #if 0
601 bogus_arg:
602 safe_printfmt (2, "ill-formed argument for `%s' (`%s')\n", option->opt_string, option->arg_string);
603 goto usage_error;
604 #endif
605
606 case opt_verbose:
607 verbose = 1;
608 break;
609 }
610 }
611
612 rx_set_dfa_cache_threshold (2 * 2097152);
613
614 if (argc != 2)
615 goto usage_error;
616
617 input_file = argv[1];
618 bits_file = "bitsets.c";
619 bits_h_file = "bitsets.h";
620 db_file = "db.c";
621 db_h_file = "db.h";
622 case_db_file = "case-db.c";
623 case_db_h_file = "case-db.h";
624 combine_db_file = "combine-db.c";
625 combine_db_h_file = "combine-db.h";
626 decomp_db_file = "decomp-db.c";
627 decomp_db_h_file = "decomp-db.h";
628
629 input_fd = safe_open (input_file, O_RDONLY, 0);
630 bits_fd = safe_open (bits_file, O_WRONLY | O_CREAT | O_EXCL, 0644);
631 bits_h_fd = safe_open (bits_h_file, O_WRONLY | O_CREAT | O_EXCL, 0644);
632 db_fd = safe_open (db_file, O_WRONLY | O_CREAT | O_EXCL, 0644);
633 db_h_fd = safe_open (db_h_file, O_WRONLY | O_CREAT | O_EXCL, 0644);
634 case_db_fd = safe_open (case_db_file, O_WRONLY | O_CREAT | O_EXCL, 0644);
635 case_db_h_fd = safe_open (case_db_h_file, O_WRONLY | O_CREAT | O_EXCL, 0644);
636 combine_db_fd = safe_open (combine_db_file, O_WRONLY | O_CREAT | O_EXCL, 0644);
637 combine_db_h_fd = safe_open (combine_db_h_file, O_WRONLY | O_CREAT | O_EXCL, 0644);
638 decomp_db_fd = safe_open (decomp_db_file, O_WRONLY | O_CREAT | O_EXCL, 0644);
639 decomp_db_h_fd = safe_open (decomp_db_h_file, O_WRONLY | O_CREAT | O_EXCL, 0644);
640
641 if (vfdbuf_buffer_fd (&errn, input_fd, 0, O_RDONLY, 0))
642 panic ("unable to buffer input file");
643 if (vfdbuf_buffer_fd (&errn, bits_fd, 0, O_WRONLY, 0))
644 panic ("unable to buffer bitset output file");
645 if (vfdbuf_buffer_fd (&errn, bits_h_fd, 0, O_WRONLY, 0))
646 panic ("unable to buffer bitset header output file");
647 if (vfdbuf_buffer_fd (&errn, db_fd, 0, O_WRONLY, 0))
648 panic ("unable to buffer db output file");
649 if (vfdbuf_buffer_fd (&errn, db_h_fd, 0, O_WRONLY, 0))
650 panic ("unable to buffer db header output file");
651 if (vfdbuf_buffer_fd (&errn, case_db_fd, 0, O_WRONLY, 0))
652 panic ("unable to buffer db output file");
653 if (vfdbuf_buffer_fd (&errn, case_db_h_fd, 0, O_WRONLY, 0))
654 panic ("unable to buffer db header output file");
655 if (vfdbuf_buffer_fd (&errn, combine_db_fd, 0, O_WRONLY, 0))
656 panic ("unable to buffer db output file");
657 if (vfdbuf_buffer_fd (&errn, combine_db_h_fd, 0, O_WRONLY, 0))
658 panic ("unable to buffer db header output file");
659 if (vfdbuf_buffer_fd (&errn, decomp_db_fd, 0, O_WRONLY, 0))
660 panic ("unable to buffer db output file");
661 if (vfdbuf_buffer_fd (&errn, decomp_db_h_fd, 0, O_WRONLY, 0))
662 panic ("unable to buffer db header output file");
663
664 sets = (bits *)must_malloc (uni_n_categories * sizeof (bits));
665 for (x = 0; x < uni_n_categories; ++x)
666 sets[x] = bits_alloc (0, uni_bits_tree_rule);
667
668 all_chars = bits_alloc (0, uni_bits_tree_rule);
669
670 {
671 static t_uint16 db_default_page[1 << 11];
672 static struct uni_case_mapping case_default_page[16];
673 static t_uint8 combine_default_page[16];
674 static t_int16 decomp_default_page[16];
675
676 {
677 int i;
678 t_uint16 v;
679
680 v = unidata__assemble_db (0, 10, 0, uni_bidi_ON, uni_general_category_Cn);
681 for (i = 0; i < (sizeof (db_default_page) / sizeof (db_default_page[0])); ++i)
682 {
683 db_default_page[i] = v;
684 }
685 }
686
687 db_rules = make_pow2_array_rules (lim_use_must_malloc,
688 sizeof (t_uint16),
689 (void *)db_default_page,
690 11, (size_t)0x3ff,
691 0, (size_t)0x7ff);
692 case_rules = make_pow2_array_rules (lim_use_must_malloc,
693 sizeof (struct uni_case_mapping),
694 (void *)case_default_page,
695 16, (size_t)0x1f,
696 12, (size_t)0xf,
697 8, (size_t)0xf,
698 4, (size_t)0xf,
699 0, (size_t)0xf);
700 combine_rules = make_pow2_array_rules (lim_use_must_malloc,
701 sizeof (t_uint8),
702 (void *)combine_default_page,
703 16, (size_t)0x1f,
704 12, (size_t)0xf,
705 8, (size_t)0xf,
706 4, (size_t)0xf,
707 0, (size_t)0xf);
708 decomp_rules = make_pow2_array_rules (lim_use_must_malloc,
709 sizeof (t_int16),
710 (void *)decomp_default_page,
711 16, (size_t)0x1f,
712 12, (size_t)0xf,
713 8, (size_t)0xf,
714 4, (size_t)0xf,
715 0, (size_t)0xf);
716 }
717
718 decompositions.void_ptr = 0;
719 ar_push (&decompositions.void_ptr, lim_use_must_malloc, sizeof (struct uni_decomposition_mapping));
720 decompositions.dmp[0].type = uni_decomposition_none;
721 decompositions.dmp[0].decomposition = 0;
722
723 db_array = pow2_array_alloc (lim_use_must_malloc, db_rules);
724 case_db_array = pow2_array_alloc (lim_use_must_malloc, case_rules);
725 combine_db_array = pow2_array_alloc (lim_use_must_malloc, combine_rules);
726 decomp_db_array = pow2_array_alloc (lim_use_must_malloc, decomp_rules);
727
728 line_no = 0;
729 has_decomp = 0;
730 max_decomp = 0;
731 total_decomp = 0;
732 non0_combine = 0;
733 uppers = 0;
734 uppers_and_title = 0;
735 uppers_and_lowers = 0;
736 lowers = 0;
737 titles = 0;
738 for (x = 0; x < 256; ++x)
739 have_case[x] = 0;
740 for (x = 0; x < 512; ++x)
741 have_case2[x] = 0;
742 two_case = 0;
743 three_case = 0;
744 numerics = 0;
745 non_dec_digits = 0;
746
747 while (1)
748 {
749 struct unidata data;
750 int in_range;
751 struct unidata data_2;
752
753 if (!unidata_next (&data, &in_range, &line_no, input_fd))
754 break; /* eof */
755
756 if (verbose && !(line_no % 500))
757 safe_printfmt (2, "line %d\n", line_no);
758
759 if (data.general_category == uni_general_category_Cn)
760 {
761 safe_printfmt (2, "Character U+%X is an unassigned character in unidata.txt!", data.code_value);
762 panic ("unidata.txt is broken");
763 }
764
765 {
766 t_uint16 dbv;
767
768 dbv = unidata__assemble_db (1, data.decimal_digit_value, data.mirrored, data.bidi_category, data.general_category);
769 *(t_uint16 *)pow2_array_ref (db_array, data.code_value) = dbv;
770 }
771
772 if (data.digit_value >= 0)
773 ++non_dec_digits;
774
775 if (data.numeric_value.numerator >= 0)
776 {
777 ++numerics;
778 }
779
780 if (data.character_decomposition_mapping.decomposition)
781 {
782 size_t size;
783 ++has_decomp;
784 size = ar_size ((void *)data.character_decomposition_mapping.decomposition, lim_use_must_malloc, sizeof (*data.character_decomposition_mapping.decomposition));
785 total_decomp += (int)size;
786 if (size > max_decomp)
787 {
788 max_decomp = (int) size;
789 worst_decomp = data.code_value;
790 }
791 }
792
793 if (data.uppercase_mapping)
794 {
795 ++uppers;
796 if (data.lowercase_mapping)
797 ++uppers_and_lowers;
798 if (data.titlecase_mapping)
799 ++uppers_and_title;
800 }
801
802 if (data.lowercase_mapping)
803 ++lowers;
804
805 if (data.titlecase_mapping)
806 ++titles;
807
808 {
809 int q;
810
811 q = !!data.uppercase_mapping + !!data.lowercase_mapping + !!data.titlecase_mapping;
812 if (q == 2)
813 ++two_case;
814 else if (q == 3)
815 ++three_case;
816 }
817
818 if (data.uppercase_mapping || data.lowercase_mapping || data.titlecase_mapping)
819 {
820 have_case[0xff & (data.code_value >> 8)] = 1;
821 have_case2[0x1ff & (data.code_value >> 7)] = 1;
822 }
823
824 if (data.uppercase_mapping || data.lowercase_mapping || data.titlecase_mapping)
825 {
826 struct uni_case_mapping * mapping;
827
828 mapping = (struct uni_case_mapping *)pow2_array_ref (case_db_array, data.code_value);
829 mapping->upper = data.uppercase_mapping;
830 mapping->lower = data.lowercase_mapping;
831 mapping->title = data.titlecase_mapping;
832 }
833
834 if (data.canonical_combining_class)
835 {
836 ++non0_combine;
837 *(t_uint8 *)pow2_array_ref (combine_db_array, data.code_value) = data.canonical_combining_class;
838 }
839
840 if (data.character_decomposition_mapping.type != uni_decomposition_none)
841 {
842 struct uni_decomposition_mapping * decomp;
843 t_int16 index;
844
845 if ((1 << 16) <= ar_size (decompositions.void_ptr, lim_use_must_malloc, sizeof (*decompositions.dmp)))
846 panic ("too many characters have decomposition mappings -- unidata-generate needs to be modified\n");
847 index = (t_int16)ar_size (decompositions.void_ptr, lim_use_must_malloc, sizeof (*decompositions.dmp));
848 decomp = (struct uni_decomposition_mapping *)ar_push (&decompositions.void_ptr, lim_use_must_malloc, sizeof (*decompositions.dmp));
849 decomp->type = data.character_decomposition_mapping.type;
850 decomp->decomposition = (t_unicode *)ar_copy ((void *)data.character_decomposition_mapping.decomposition,
851 lim_use_must_malloc,
852 sizeof (t_unicode));
853 *(t_int16 *)pow2_array_ref (decomp_db_array, data.code_value) = index;
854 }
855
856 bits_adjoin (sets[data.general_category], data.code_value);
857
858 if ((data.general_category != uni_general_category_Cs) && (data.general_category != uni_general_category_Co))
859 bits_adjoin (all_chars, data.code_value);
860
861 if (in_range && (in_range != 1))
862 {
863 safe_printfmt (2, "unicode database:%d: found end of range (\"..., Last>;\") without start of range\n", line_no);
864 panic ("unrecoverable error parsing unicode database");
865 }
866 else if (in_range)
867 {
868 if (!unidata_next (&data_2, &in_range, &line_no, input_fd))
869 {
870 safe_printfmt (2, "unicode database:%d: end of line encountered looking for range end\n", line_no);
871 panic ("unrecoverable error parsing unicode database");
872 }
873
874 /* safe_printfmt (2, "line %d (range end)\n", line_no); */
875
876 if (in_range != 2)
877 {
878 safe_printfmt (2, "unicode database:%d: missing end of range (\"..., Last>;\")\n", line_no);
879 panic ("unrecoverable error parsing unicode database");
880 }
881
882 bits_fill_range (sets[data.general_category], data.code_value, data_2.code_value + 1);
883
884 if ((data.general_category != uni_general_category_Cs) && (data.general_category != uni_general_category_Co))
885 {
886 bits_fill_range (all_chars, data.code_value, data_2.code_value + 1);
887 }
888
889 {
890 t_uint16 dbv;
891 int q;
892
893 dbv = unidata__assemble_db (1, data.decimal_digit_value, data.mirrored, data.bidi_category, data.general_category);
894 for (q = data.code_value; q <= data_2.code_value; ++q)
895 *(t_uint16 *)pow2_array_ref (db_array, q) = dbv;
896 if (data.uppercase_mapping || data.lowercase_mapping || data.titlecase_mapping)
897 {
898 struct uni_case_mapping * mapping;
899
900 mapping = (struct uni_case_mapping *)pow2_array_ref (case_db_array, data.code_value);
901 mapping->upper = data.uppercase_mapping;
902 mapping->lower = data.lowercase_mapping;
903 mapping->title = data.titlecase_mapping;
904 for (q = data.code_value; q <= data_2.code_value; ++q)
905 *(struct uni_case_mapping *)pow2_array_ref (case_db_array, q) = *mapping;
906 }
907 if (data.canonical_combining_class)
908 {
909 for (q = data.code_value; q <= data_2.code_value; ++q)
910 {
911 ++non0_combine;
912 *(t_uint8 *)pow2_array_ref (combine_db_array, q) = data.canonical_combining_class;
913 }
914 }
915
916 if (data.character_decomposition_mapping.type != uni_decomposition_none)
917 {
918 t_int16 index;
919
920 index = (t_int16)(ar_size (decompositions.void_ptr, lim_use_must_malloc, sizeof (*decompositions.dmp)) - 1);
921 for (q = data.code_value; q <= data_2.code_value; ++q)
922 *(t_int16 *)pow2_array_ref (decomp_db_array, q) = index;
923 }
924 }
925 }
926 }
927
928 /* "The Private Use character outside of the BMP (U+F0000..U+FFFFD,
929 * U+100000..U+10FFFD) are not listed. These correspond to surrogate
930 * pairs where the first surrogate is in the High Surrogate Private
931 * Use section." - The UnicodeData File Format Version 3.0.0
932 */
933 bits_fill_range (sets[uni_general_category_Co], 0xf0000, 0xffffe);
934 bits_fill_range (all_chars, 0xf0000, 0xffffe);
935 bits_fill_range (sets[uni_general_category_Co], 0x100000, 0x10fff2);
936 bits_fill_range (all_chars, 0x100000, 0x10fff2);
937
938 /* These should appear to be unassigned characters in the database.
939 *
940 * If you encounter a file with private-use characters you don't
941 * recognize, that's an error.
942 *
943 * If you have an application that uses private use characters,
944 * you should make a modified unidata.txt assigning them appropriate
945 * categories (not Co).
946 */
947
948
949 {
950 enum uni_general_category cat;
951
952 for (cat = uni_first_synthetic_category; cat < uni_n_categories; ++cat)
953 {
954 int first_char;
955 int x;
956 bits it;
957
958 first_char = uni_general_category_names[cat].name[0];
959
960 it = bits_alloc (0, uni_bits_tree_rule);
961
962 for (x = 0; uni_general_category_names[x].name; ++x)
963 {
964 if ( (uni_general_category_names[x].name[0] == first_char)
965 && (sets[x]))
966 {
967 bits_union (it, sets[x]);
968 }
969 }
970
971 sets[cat] = it;
972 }
973
974 safe_printfmt (bits_fd, "/* This file automatically generated by unidata-generate */\n\n");
975 safe_printfmt (bits_fd, "#include \"bitsets.h\"\n");
976 safe_printfmt (bits_fd, "\n\n");
977
978 safe_printfmt (bits_h_fd, "/* This file automatically generated by unidata-generate */\n\n");
979 safe_printfmt (bits_h_fd, "#include \"hackerlab/bitsets/bits.h\"\n");
980 safe_printfmt (bits_h_fd, "\n\n");
981
982 for (x = 0; x < uni_n_categories; ++x)
983 {
984 t_uchar * name;
985 t_uchar * stub;
986
987 name = str_alloc_cat (lim_use_must_malloc, "unidata_bitset_", uni_general_category_names[x].name);
988 stub = str_alloc_cat (lim_use_must_malloc, name, "_");
989 bits_compact (sets[x]);
990
991 bits_print (bits_fd, sets[x], name, stub, 0, 0, 0);
992 safe_printfmt (bits_fd, "\n\f\n");
993
994 bits_print (bits_h_fd, sets[x], name, stub, 0, 1, 0);
995 safe_printfmt (bits_h_fd, "\n\f\n");
996 }
997 }
998
999 bits_compact (all_chars);
1000
1001 bits_print (bits_fd, all_chars, "unidata_bitset_universal", "unidata_bitset_universal_", 0, 0, 0);
1002 safe_printfmt (bits_fd, "\n\n");
1003
1004 bits_print (bits_h_fd, all_chars, "unidata_bitset_universal", "unidata_bitset_universal_", 0, 1, 0);
1005 safe_printfmt (bits_h_fd, "\n\n");
1006
1007 if (verbose)
1008 {
1009 safe_printfmt (2, "%d characters have a decomposition mapping\n", has_decomp);
1010 safe_printfmt (2, "%d characters in the widest decomp mapping\n", max_decomp);
1011 safe_printfmt (2, "U+%X is the code value of the widest decomp mapping\n", worst_decomp);
1012 safe_printfmt (2, "%d characters (total) in decomp mappings\n", total_decomp);
1013 safe_printfmt (2, "%d have a non-0 canonical combining class\n", non0_combine);
1014 safe_printfmt (2, "%d have uppercase mappings\n", uppers);
1015 safe_printfmt (2, "%d have lowercase mappings\n", lowers);
1016 safe_printfmt (2, "%d have titlecase mappings\n", titles);
1017 safe_printfmt (2, "%d have upper and lower mappings\n", uppers_and_lowers);
1018 safe_printfmt (2, "%d have upper and title mappings\n", uppers_and_title);
1019 safe_printfmt (2, "%d have lower and title mappings\n", two_case - (uppers_and_lowers + uppers_and_title));
1020 }
1021
1022 if (verbose)
1023 {
1024 {
1025 int case_pages;
1026 int case_half_pages;
1027
1028 case_pages = 0;
1029 case_half_pages = 0;
1030
1031 for (x = 0; x < 256; ++x)
1032 if (have_case[x])
1033 ++case_pages;
1034
1035 for (x = 0; x < 512; ++x)
1036 if (have_case2[x])
1037 ++case_half_pages;
1038
1039 safe_printfmt (2, "%d pages (256 characters/page) have case mappings\n", case_pages);
1040 safe_printfmt (2, "%d half pages (128 characters/page) have case mappings\n", case_half_pages);
1041 safe_printfmt (2, "%d characters have exactly two case mappings\n", two_case);
1042 safe_printfmt (2, "%d characters have exactly three case mappings\n", three_case);
1043 }
1044
1045 safe_printfmt (2, "%d characters have a numeric value\n", numerics);
1046 safe_printfmt (2, "%d characters are non-decimal digits\n", non_dec_digits);
1047 }
1048
1049 safe_printfmt (db_fd, "/* This file automatically generated by unidata-generate */\n\n");
1050 safe_printfmt (db_fd, "#include \"db.h\"\n");
1051 safe_printfmt (db_fd, "\n\n");
1052
1053 pow2_array_compact (db_array, 0, 0, 0);
1054 pow2_array_print (db_fd, db_array, "unidata__db", "unidata__db", 0, 0, 0, "t_uint16", print_t_uint16);
1055 safe_printfmt (db_fd, "\n\n");
1056
1057 safe_printfmt (db_h_fd, "/* This file automatically generated by unidata-generate */\n\n");
1058 safe_printfmt (db_h_fd, "#include \"hackerlab/arrays/pow2-array.h\"\n");
1059 safe_printfmt (db_h_fd, "\n\n");
1060
1061 pow2_array_print (db_h_fd, db_array, "unidata__db", "unidata__db", 1, "unidata__db_ref", 0, "t_uint16", 0);
1062 safe_printfmt (db_h_fd, "\n\n");
1063
1064 safe_printfmt (case_db_fd, "/* This file automatically generated by unidata-generate */\n\n");
1065 safe_printfmt (case_db_fd, "#include \"case-db.h\"\n");
1066 safe_printfmt (case_db_fd, "\n\n");
1067
1068 pow2_array_compact (case_db_array, 0, 0, 0);
1069 pow2_array_print (case_db_fd, case_db_array, "unidata__case_db", "unidata__case_db", 0, 0, 0, "struct uni_case_mapping", print_t_case);
1070 safe_printfmt (case_db_fd, "\n\n");
1071
1072 safe_printfmt (case_db_h_fd, "/* This file automatically generated by unidata-generate */\n\n");
1073 safe_printfmt (case_db_h_fd, "#include \"hackerlab/arrays/pow2-array.h\"\n");
1074 safe_printfmt (case_db_h_fd, "#include \"hackerlab/unidata/case-db-macros.h\"\n");
1075 safe_printfmt (case_db_h_fd, "\n\n");
1076
1077 pow2_array_print (case_db_h_fd, case_db_array, "unidata__case_db", "unidata__case_db", 1, "unidata__case_db_ref", 0, "struct uni_case_mapping", 0);
1078 safe_printfmt (case_db_h_fd, "\n\n");
1079
1080 safe_printfmt (combine_db_fd, "/* This file automatically generated by unidata-generate */\n\n");
1081 safe_printfmt (combine_db_fd, "#include \"combine-db.h\"\n");
1082 safe_printfmt (combine_db_fd, "\n\n");
1083
1084 pow2_array_compact (combine_db_array, 0, 0, 0);
1085 pow2_array_print (combine_db_fd, combine_db_array, "unidata__combine_db", "unidata__combine_db", 0, 0, 0, "t_uint8", print_t_uint8);
1086 safe_printfmt (combine_db_fd, "\n\n");
1087
1088 safe_printfmt (combine_db_h_fd, "/* This file automatically generated by unidata-generate */\n\n");
1089 safe_printfmt (combine_db_h_fd, "#include \"hackerlab/arrays/pow2-array.h\"\n");
1090 safe_printfmt (combine_db_h_fd, "#include \"hackerlab/unidata/combine-db-macros.h\"\n");
1091 safe_printfmt (combine_db_h_fd, "\n\n");
1092
1093 pow2_array_print (combine_db_h_fd, combine_db_array, "unidata__combine_db", "unidata__combine_db", 1, "unidata__combine_db_ref", 0, "t_uint8", 0);
1094 safe_printfmt (combine_db_h_fd, "\n\n");
1095
1096 safe_printfmt (decomp_db_fd, "/* This file automatically generated by unidata-generate */\n\n");
1097 safe_printfmt (decomp_db_fd, "#include \"decomp-db.h\"\n");
1098 safe_printfmt (decomp_db_fd, "\n\n");
1099
1100 pow2_array_compact (decomp_db_array, 0, 0, 0);
1101 pow2_array_print (decomp_db_fd, decomp_db_array, "unidata__decomp_db", "unidata__decomp_db", 0, 0, 0, "t_int16", print_t_int16);
1102 safe_printfmt (decomp_db_fd, "\n\n");
1103 {
1104 size_t d;
1105 size_t n_d;
1106 size_t off;
1107
1108 n_d = ar_size (decompositions.void_ptr, lim_use_must_malloc, sizeof (*decompositions.dmp));
1109
1110 safe_printfmt (decomp_db_fd, "static t_unicode unidata_decomposition_data[] =\n");
1111 safe_printfmt (decomp_db_fd, "{\n");
1112 for (d = 0; d < n_d; ++d)
1113 {
1114 size_t c;
1115 size_t n_c;
1116
1117 n_c = ar_size (decompositions.dmp[d].decomposition, lim_use_must_malloc, sizeof (*decompositions.dmp[d].decomposition));
1118 for (c = 0; c < n_c; ++c)
1119 {
1120 safe_printfmt (decomp_db_fd, " 0x%04X,\n", decompositions.dmp[d].decomposition[c]);
1121 }
1122 safe_printfmt (decomp_db_fd, " 0x0,\n");
1123 }
1124 safe_printfmt (decomp_db_fd, "};\n\n");
1125
1126 safe_printfmt (decomp_db_fd, "struct uni_decomposition_mapping unidata_decomposition_table[] =\n");
1127 safe_printfmt (decomp_db_fd, "{\n");
1128 off = 0;
1129 for (d = 0; d < n_d; ++d)
1130 {
1131 safe_printfmt (decomp_db_fd, " { %d, unidata_decomposition_data + %lu },\n", decompositions.dmp[d].type, (unsigned long)off);
1132 off += ar_size ((void *)decompositions.dmp[d].decomposition, lim_use_must_malloc, sizeof (*decompositions.dmp[d].decomposition)) + 1;
1133 }
1134 safe_printfmt (decomp_db_fd, "};\n\n");
1135 }
1136
1137 safe_printfmt (decomp_db_h_fd, "/* This file automatically generated by unidata-generate */\n\n");
1138 safe_printfmt (decomp_db_h_fd, "#include \"hackerlab/arrays/pow2-array.h\"\n");
1139 safe_printfmt (decomp_db_h_fd, "#include \"hackerlab/unidata/decomp-db-macros.h\"\n");
1140 safe_printfmt (decomp_db_h_fd, "\n\n");
1141
1142 pow2_array_print (decomp_db_h_fd, decomp_db_array, "unidata__decomp_db", "unidata__decomp_db", 1, "unidata__decomp_db_ref", 0, "t_int16", 0);
1143 safe_printfmt (decomp_db_h_fd, "\n\n");
1144 safe_printfmt (decomp_db_h_fd, "extern struct uni_decomposition_mapping unidata_decomposition_table[];\n\n\n");
1145
1146 safe_close (input_fd);
1147 safe_close (bits_fd);
1148 safe_close (bits_h_fd);
1149 safe_close (db_fd);
1150 safe_close (db_h_fd);
1151 safe_close (case_db_fd);
1152 safe_close (case_db_h_fd);
1153 safe_close (combine_db_fd);
1154 safe_close (combine_db_h_fd);
1155 safe_close (decomp_db_fd);
1156 safe_close (decomp_db_h_fd);
1157
1158 return 0;
1159 }
1160
1161