1 /* Generate Unicode conforming character classification tables from a
2 UnicodeData file.
3 Copyright (C) 2000-2002 Free Software Foundation, Inc.
4 Written by Bruno Haible <bruno@clisp.org>, 2000-2002.
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
19
20 /* See also: gen-ctype.c in libunistring, gen-unicode-ctype.c in glibc. */
21
22 /* Usage example:
23 $ gen-ctype /usr/local/share/Unidata/UnicodeData.txt 3.1.0
24 */
25
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <stdbool.h>
29 #include <stdint.h>
30 #include <string.h>
31 #include <time.h>
32
33 /* ========================================================================= */
34
35 /* Reading UnicodeData.txt. */
36 /* See UnicodeData-3.1.0.html. */
37
/* One record of UnicodeData.txt, i.e. the properties of a single
   character.  Textual fields keep the text found in the file; the three
   case mappings are stored as numbers.  */
struct unicode_attribute
{
  const char *name;           /* Character name */
  const char *category;       /* General category */
  const char *combining;      /* Canonical combining classes */
  const char *bidi;           /* Bidirectional category */
  const char *decomposition;  /* Character decomposition mapping */
  const char *decdigit;       /* Decimal digit value */
  const char *digit;          /* Digit value */
  const char *numeric;        /* Numeric value */
  bool mirrored;              /* Mirrored flag */
  const char *oldname;        /* Old Unicode 1.0 name */
  const char *comment;        /* Comment */
  unsigned int upper;         /* Uppercase mapping */
  unsigned int lower;         /* Lowercase mapping */
  unsigned int title;         /* Titlecase mapping */
};

/* A missing string field is represented by "", a missing character
   (case mapping) by NONE, which is the all-ones unsigned int.  */
#define NONE (~(unsigned int) 0)

/* The whole contents of the UnicodeData.txt file, one slot for each of
   the 0x110000 possible index values.  */
struct unicode_attribute unicode_attributes [0x110000];
63
64 /* Stores in unicode_attributes[i] the values from the given fields. */
65 static void
fill_attribute(unsigned int i,const char * field1,const char * field2,const char * field3,const char * field4,const char * field5,const char * field6,const char * field7,const char * field8,const char * field9,const char * field10,const char * field11,const char * field12,const char * field13,const char * field14)66 fill_attribute (unsigned int i,
67 const char *field1, const char *field2,
68 const char *field3, const char *field4,
69 const char *field5, const char *field6,
70 const char *field7, const char *field8,
71 const char *field9, const char *field10,
72 const char *field11, const char *field12,
73 const char *field13, const char *field14)
74 {
75 struct unicode_attribute * uni;
76
77 if (i >= 0x110000)
78 {
79 fprintf (stderr, "index too large\n");
80 exit (1);
81 }
82 if (strcmp (field2, "Cs") == 0)
83 /* Surrogates are UTF-16 artefacts, not real characters. Ignore them. */
84 return;
85 uni = &unicode_attributes[i];
86 /* Copy the strings. */
87 uni->name = strdup (field1);
88 uni->category = (field2[0] == '\0' ? "" : strdup (field2));
89 uni->combining = (field3[0] == '\0' ? "" : strdup (field3));
90 uni->bidi = (field4[0] == '\0' ? "" : strdup (field4));
91 uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5));
92 uni->decdigit = (field6[0] == '\0' ? "" : strdup (field6));
93 uni->digit = (field7[0] == '\0' ? "" : strdup (field7));
94 uni->numeric = (field8[0] == '\0' ? "" : strdup (field8));
95 uni->mirrored = (field9[0] == 'Y');
96 uni->oldname = (field10[0] == '\0' ? "" : strdup (field10));
97 uni->comment = (field11[0] == '\0' ? "" : strdup (field11));
98 uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16));
99 uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16));
100 uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16));
101 }
102
/* Maximum length of a field in the UnicodeData.txt file.  */
#define FIELDLEN 120

/* Reads the next field from STREAM into BUFFER, which has room for
   FIELDLEN bytes.  Characters are consumed up to and including DELIM;
   DELIM itself is not stored.  Carriage returns are dropped.
   Returns 1 when DELIM was found (BUFFER is then NUL-terminated), or 0
   when the end of the stream was reached first (BUFFER is then left
   unterminated).  Exits with a diagnostic if the field is too long.  */
static int
getfield (FILE *stream, char *buffer, int delim)
{
  char *out = buffer;
  int stored = 0;

  for (;;)
    {
      int ch = getc (stream);

      if (ch == EOF)
        return 0;
      if (ch == delim)
        break;

      /* The original unicode.org UnicodeData.txt file happens to have
         CR/LF line terminators.  Silently convert to LF.  */
      if (ch == '\r')
        continue;

      /* Append ch, keeping room for the terminating NUL.  */
      stored++;
      if (stored >= FIELDLEN - 1)
        {
          fprintf (stderr, "field too long\n");
          exit (1);
        }
      *out++ = ch;
    }

  *out = '\0';
  return 1;
}
137
138 /* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt
139 file. */
140 static void
fill_attributes(const char * unicodedata_filename)141 fill_attributes (const char *unicodedata_filename)
142 {
143 unsigned int i, j;
144 FILE *stream;
145 char field0[FIELDLEN];
146 char field1[FIELDLEN];
147 char field2[FIELDLEN];
148 char field3[FIELDLEN];
149 char field4[FIELDLEN];
150 char field5[FIELDLEN];
151 char field6[FIELDLEN];
152 char field7[FIELDLEN];
153 char field8[FIELDLEN];
154 char field9[FIELDLEN];
155 char field10[FIELDLEN];
156 char field11[FIELDLEN];
157 char field12[FIELDLEN];
158 char field13[FIELDLEN];
159 char field14[FIELDLEN];
160 int lineno = 0;
161
162 for (i = 0; i < 0x110000; i++)
163 unicode_attributes[i].name = NULL;
164
165 stream = fopen (unicodedata_filename, "r");
166 if (stream == NULL)
167 {
168 fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
169 exit (1);
170 }
171
172 for (;;)
173 {
174 int n;
175
176 lineno++;
177 n = getfield (stream, field0, ';');
178 n += getfield (stream, field1, ';');
179 n += getfield (stream, field2, ';');
180 n += getfield (stream, field3, ';');
181 n += getfield (stream, field4, ';');
182 n += getfield (stream, field5, ';');
183 n += getfield (stream, field6, ';');
184 n += getfield (stream, field7, ';');
185 n += getfield (stream, field8, ';');
186 n += getfield (stream, field9, ';');
187 n += getfield (stream, field10, ';');
188 n += getfield (stream, field11, ';');
189 n += getfield (stream, field12, ';');
190 n += getfield (stream, field13, ';');
191 n += getfield (stream, field14, '\n');
192 if (n == 0)
193 break;
194 if (n != 15)
195 {
196 fprintf (stderr, "short line in'%s':%d\n",
197 unicodedata_filename, lineno);
198 exit (1);
199 }
200 i = strtoul (field0, NULL, 16);
201 if (field1[0] == '<'
202 && strlen (field1) >= 9
203 && !strcmp (field1 + strlen(field1) - 8, ", First>"))
204 {
205 /* Deal with a range. */
206 lineno++;
207 n = getfield (stream, field0, ';');
208 n += getfield (stream, field1, ';');
209 n += getfield (stream, field2, ';');
210 n += getfield (stream, field3, ';');
211 n += getfield (stream, field4, ';');
212 n += getfield (stream, field5, ';');
213 n += getfield (stream, field6, ';');
214 n += getfield (stream, field7, ';');
215 n += getfield (stream, field8, ';');
216 n += getfield (stream, field9, ';');
217 n += getfield (stream, field10, ';');
218 n += getfield (stream, field11, ';');
219 n += getfield (stream, field12, ';');
220 n += getfield (stream, field13, ';');
221 n += getfield (stream, field14, '\n');
222 if (n != 15)
223 {
224 fprintf (stderr, "missing end range in '%s':%d\n",
225 unicodedata_filename, lineno);
226 exit (1);
227 }
228 if (!(field1[0] == '<'
229 && strlen (field1) >= 8
230 && !strcmp (field1 + strlen (field1) - 7, ", Last>")))
231 {
232 fprintf (stderr, "missing end range in '%s':%d\n",
233 unicodedata_filename, lineno);
234 exit (1);
235 }
236 field1[strlen (field1) - 7] = '\0';
237 j = strtoul (field0, NULL, 16);
238 for (; i <= j; i++)
239 fill_attribute (i, field1+1, field2, field3, field4, field5,
240 field6, field7, field8, field9, field10,
241 field11, field12, field13, field14);
242 }
243 else
244 {
245 /* Single character line */
246 fill_attribute (i, field1, field2, field3, field4, field5,
247 field6, field7, field8, field9, field10,
248 field11, field12, field13, field14);
249 }
250 }
251 if (ferror (stream) || fclose (stream))
252 {
253 fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
254 exit (1);
255 }
256 }
257
258 /* ========================================================================= */
259
260 /* Character mappings. */
261
262 static unsigned int
to_upper(unsigned int ch)263 to_upper (unsigned int ch)
264 {
265 if (unicode_attributes[ch].name != NULL
266 && unicode_attributes[ch].upper != NONE)
267 return unicode_attributes[ch].upper;
268 else
269 return ch;
270 }
271
272 static unsigned int
to_lower(unsigned int ch)273 to_lower (unsigned int ch)
274 {
275 if (unicode_attributes[ch].name != NULL
276 && unicode_attributes[ch].lower != NONE)
277 return unicode_attributes[ch].lower;
278 else
279 return ch;
280 }
281
/* Common Lisp only wants the bijective lower/upper case conversions.
   Returns true if CH is one half of a pair (upper, lower) such that
   to_upper and to_lower both map each member of the pair onto that
   pair, and the two members differ.  */
static bool
is_CL_both_case (unsigned int ch)
{
  unsigned int up = to_upper (ch);
  unsigned int down = to_lower (ch);

  /* CH must itself be one of the two members of the pair.  */
  if (up != ch && down != ch)
    return false;

  /* A character without case has up == down == ch.  */
  if (up == down)
    return false;

  /* Both mappings must be stable on both members.  */
  return to_upper (up) == up
         && to_upper (down) == up
         && to_lower (up) == down
         && to_lower (down) == down;
}
295
/* Consistency check of the case mappings: walks over all code points
   below 0x110000 and prints a message on standard output for every
   character that is neither its own upper case nor its own lower case
   mapping, and for inconsistent upper/lower pairs.
   NOTE(review): the pair diagnostics are only attempted when
   is_CL_both_case (ch) is true, which already requires the very
   conditions they test, so they look unreachable as written -- confirm
   whether a negated test was intended.  */
static void
check_bothcase (void)
{
  unsigned int ch;

  for (ch = 0; ch < 0x110000; ch++)
    {
      unsigned int ch1 = to_upper (ch);
      unsigned int ch2 = to_lower (ch);

      if (!(ch1 == ch || ch2 == ch))
        printf ("character 0x%04x is neither upper nor lower case\n", ch);
      else if (is_CL_both_case (ch))
        {
          if (ch == ch1)
            {
              /* ch is upper case */
              if (to_lower (ch2) != ch2)
                printf ("character 0x%04x has a lower case mapping 0x%04x which is not lower case\n", ch, ch2);
              else if (to_upper (ch2) != ch)
                printf ("character 0x%04x has a lower case mapping 0x%04x whose upper case mapping isn't the first character\n", ch, ch2);
            }
          else if (ch == ch2)
            {
              /* ch is lower case */
              if (to_upper (ch1) != ch1)
                printf ("character 0x%04x has an upper case mapping 0x%04x which is not upper case\n", ch, ch1);
              else if (to_lower (ch1) != ch)
                printf ("character 0x%04x has an upper case mapping 0x%04x whose lower case mapping isn't the first character\n", ch, ch1);
            }
        }
    }
}
327
/* Upper case mapping restricted to the characters accepted by
   is_CL_both_case; every other character maps to itself.  */
static unsigned int
CL_to_upper (unsigned int ch)
{
  if (!is_CL_both_case (ch))
    return ch;
  return to_upper (ch);
}
333
/* Lower case mapping restricted to the characters accepted by
   is_CL_both_case; every other character maps to itself.  */
static unsigned int
CL_to_lower (unsigned int ch)
{
  if (!is_CL_both_case (ch))
    return ch;
  return to_lower (ch);
}
339
/* Writes to FILENAME a C source fragment describing the mapping MAP.
   MAPNAME is the prefix used in the generated identifiers and comments
   ("up" or "down" in this program), VERSION the Unicode version quoted
   in the file header.
   The 0x110000 code points are split into 0x1100 pages of 256 entries.
   For every page on which MAP changes at least one character, a table of
   256 16-bit values is emitted, each holding the difference
   MAP (ch) - ch modulo 0x10000.  A final table of pointers lists, up to
   the last such page, either the page's table or "nop_page".
   Exits with a diagnostic if the file cannot be written or if a
   difference does not fit in 15 bits plus sign.  */
static void
output_casemapping (const char *filename,
                    const char *mapname, unsigned int (*map) (unsigned int),
                    const char *version)
{
  FILE *stream;
  bool pages[0x1100];
  unsigned int max_nonempty_page = 0;
  unsigned int p;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (stream, "/*\n");
  fprintf (stream, " * %s\n", filename);
  fprintf (stream, " *\n");
  fprintf (stream, " * Common Lisp %scase table.\n", mapname);
  fprintf (stream, " * Generated automatically by the gen-ctype utility for Unicode %s.\n", version);
  fprintf (stream, " */\n");
  fprintf (stream, "\n");

  /* Determine which pages contain at least one non-identity mapping.  */
  for (p = 0; p < 0x1100; p++)
    {
      unsigned int i1;

      pages[p] = false;
      for (i1 = 0; i1 < 0x100; i1++)
        {
          unsigned int ch = (p << 8) + i1;

          if (map (ch) != ch)
            {
              pages[p] = true;
              max_nonempty_page = p;
              break;
            }
        }
    }

  /* Emit one table per non-empty page, 8 entries per output line.  */
  for (p = 0; p < 0x1100; p++)
    {
      unsigned int i1, i2;

      if (!pages[p])
        continue;

      fprintf (stream, "static const uint16 %s_case_table_page%02x[256] = {\n", mapname, p);
      for (i1 = 0; i1 < 32; i1++)
        {
          fprintf (stream, " ");
          for (i2 = 0; i2 < 8; i2++)
            {
              unsigned int ch = 256*p + 8*i1 + i2;
              unsigned int ch2 = map (ch);
              int j = ch2 - ch;

              if (j > 0x7fff || j < -0x7fff)
                {
                  fprintf (stderr, "%scase(0x%04x) differs from 0x%04x by more than 15 bits\n", mapname, ch, ch);
                  exit (1);
                }
              /* Store the difference as a 16-bit two's complement value.  */
              fprintf (stream, "0x%04x%s ", j & 0xffffU, (8*i1+i2<255 ? "," : " "));
            }
          fprintf (stream, "/* 0x%02x-0x%02x */\n", 8*i1, 8*i1+7);
        }
      fprintf (stream, "};\n");
      fprintf (stream, "\n");
    }

  /* Emit the page index, 4 entries per output line.  The array size is
     an unsigned int, hence %u rather than %d.  */
  fprintf (stream, "static const uint16 * const %s_case_table[%u] = {\n", mapname, max_nonempty_page+1);
  for (p = 0; p <= max_nonempty_page; p++)
    {
      if ((p % 4) == 0)
        fprintf (stream, " ");
      if (pages[p])
        fprintf (stream, "%s_case_table_page%02x", mapname, p);
      else
        fprintf (stream, "nop_page");
      fprintf (stream, "%s ", (p<max_nonempty_page ? "," : " "));
      if ((p % 4) == 3 || p == max_nonempty_page)
        fprintf (stream, "/* 0x%02x-0x%02x */\n", 4*(p/4), p);
    }
  fprintf (stream, "};\n");
  fprintf (stream, "\n");

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
434
435 /* ========================================================================= */
436
437 /* Character class properties. */
438
439 static bool
is_graphic(unsigned int ch)440 is_graphic (unsigned int ch)
441 {
442 return (unicode_attributes[ch].name != NULL
443 && strcmp (unicode_attributes[ch].name, "<control>"));
444 }
445
446 static bool
is_alpha(unsigned int ch)447 is_alpha (unsigned int ch)
448 {
449 return (unicode_attributes[ch].name != NULL
450 && ((unicode_attributes[ch].category[0] == 'L'
451 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
452 <U0E2F>, <U0E46> should belong to is_punct. */
453 && (ch != 0x0E2F) && (ch != 0x0E46))
454 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
455 <U0E31>, <U0E34>..<U0E3A>, <U0E47>..<U0E4E> are is_alpha. */
456 || (ch == 0x0E31)
457 || (ch >= 0x0E34 && ch <= 0x0E3A)
458 || (ch >= 0x0E47 && ch <= 0x0E4E)
459 #if 0
460 /* Avoid warning for <U0345>. */
461 || (ch == 0x0345)
462 /* Avoid warnings for <U2160>..<U217F>. */
463 || (unicode_attributes[ch].category[0] == 'N'
464 && unicode_attributes[ch].category[1] == 'l')
465 /* Avoid warnings for <U24B6>..<U24E9>. */
466 || (unicode_attributes[ch].category[0] == 'S'
467 && unicode_attributes[ch].category[1] == 'o'
468 && strstr (unicode_attributes[ch].name, " LETTER ")
469 != NULL)
470 #endif
471 ) );
472 }
473
474 static bool
is_numeric(unsigned int ch)475 is_numeric (unsigned int ch)
476 {
477 return (unicode_attributes[ch].name != NULL
478 && unicode_attributes[ch].category[0] == 'N'
479 && unicode_attributes[ch].category[1] == 'd');
480 }
481
/* Writes to FILENAME a C source fragment holding a 2-bit attribute for
   each of the 0x110000 code points; VERSION is the Unicode version
   quoted in the file header.
   The attribute is ALPHABETIC, NUMERIC, GRAPHIC_NON_ALPHANUMERIC or
   NON_GRAPHIC, chosen in that order of precedence from is_alpha,
   is_numeric and is_graphic; contradictory combinations are reported on
   standard error.  Four attributes are packed into one byte, and the
   bytes are grouped into 0x440 pages of 256 bytes.  Pages with identical
   contents are emitted only once; the final pointer table refers to the
   first page with the same contents and stops at the last page that
   contains a nonzero byte.
   Exits with a diagnostic if the file cannot be written.  */
static void
output_attributes (const char *filename, const char *version)
{
  FILE *stream;
  enum
    {
      NON_GRAPHIC = 0,
      GRAPHIC_NON_ALPHANUMERIC = 1,
      NUMERIC = 2,
      ALPHABETIC = 3
    };
  /* These two buffers total about 1.4 MB, too much for the stack on
     platforms with a small default stack size, so give them static
     storage.  Both are completely overwritten below before being read.  */
  static unsigned char attribute[0x110000];
  static unsigned char attribute_packed[0x44000];
  unsigned int pages[0x440];
  unsigned int max_nonempty_page;

  /* Compute the attribute table.  */
  {
    unsigned int ch;
    for (ch = 0; ch < 0x110000; ch++)
      {
        bool graphic = is_graphic (ch);
        bool alpha = is_alpha (ch);
        bool numeric = is_numeric (ch);
        if (alpha && numeric)
          fprintf (stderr, "Character 0x%04x is both alpha and numeric\n", ch);
        if (alpha && !graphic)
          fprintf (stderr, "Character 0x%04x is alpha but not graphic\n", ch);
        if (numeric && !graphic)
          fprintf (stderr, "Character 0x%04x is numeric but not graphic\n", ch);
        attribute[ch] = (alpha ? ALPHABETIC :
                         numeric ? NUMERIC :
                         graphic ? GRAPHIC_NON_ALPHANUMERIC :
                         NON_GRAPHIC
                        );
      }
  }

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (stream, "/*\n");
  fprintf (stream, " * %s\n", filename);
  fprintf (stream, " *\n");
  fprintf (stream, " * Common Lisp character attribute table.\n");
  fprintf (stream, " * Generated automatically by the gen-ctype utility for Unicode %s.\n", version);
  fprintf (stream, " */\n");
  fprintf (stream, "\n");

  /* Pack the table, 4 entries into each byte, lowest bits first.  */
  {
    unsigned int i1, i2;
    for (i1 = 0; i1 < 0x44000; i1++)
      {
        unsigned char b = 0;
        for (i2 = 0; i2 < 4; i2++)
          b |= attribute[4*i1+i2] << (2*i2);
        attribute_packed[i1] = b;
      }
  }

  /* Remove duplicate pages, to save space: pages[i] becomes the index of
     the first page whose 256 bytes equal those of page i (i itself if
     there is no earlier one).  */
  {
    unsigned int i, j;
    for (i = 0; i < 0x440; i++)
      {
        for (j = 0; j < i; j++)
          if (memcmp (&attribute_packed[j<<8], &attribute_packed[i<<8], 1<<8) == 0)
            break;
        pages[i] = j;
      }
  }

  /* Find the last page that contains a nonzero byte.  */
  max_nonempty_page = 0;
  {
    unsigned int i1, i2;
    for (i1 = 0; i1 < 0x440; i1++)
      for (i2 = 0; i2 < 0x100; i2++)
        if (attribute_packed[(i1<<8)+i2] != 0)
          {
            max_nonempty_page = i1;
            break;
          }
  }

  /* Emit every page that is the first of its kind, 8 bytes per line.
     A page covers 1024 code points, so 4*i1 names it by the code point
     of its first entry divided by 256.  */
  {
    unsigned int i1, i2a, i2b;
    for (i1 = 0; i1 < 0x440; i1++)
      if (pages[i1] == i1)
        {
          fprintf (stream, "static const uintB unicode_attribute_table_page%02x[256] = {\n", 4*i1);
          for (i2a = 0; i2a < 32; i2a++)
            {
              fprintf (stream, " ");
              for (i2b = 0; i2b < 8; i2b++)
                {
                  unsigned int i2 = 8*i2a + i2b;
                  fprintf (stream, "0x%02x%s ", attribute_packed[(i1<<8)+i2], (i2<255 ? "," : " "));
                }
              fprintf (stream, "/* 0x%04x-0x%04x */\n", 4*((i1<<8)+(8*i2a)), 4*((i1<<8)+(8*i2a+7))+3);
            }
          fprintf (stream, "};\n");
          fprintf (stream, "\n");
        }
  }

  /* Emit the page index, 2 entries per line.  The array size is an
     unsigned int, hence %u rather than %d.  */
  {
    unsigned int i1;
    fprintf (stream, "const uintB * const unicode_attribute_table[%u] = {\n", max_nonempty_page+1);
    for (i1 = 0; i1 <= max_nonempty_page; i1++)
      {
        if ((i1 % 2) == 0)
          fprintf (stream, " ");
        fprintf (stream, " unicode_attribute_table_page%02x%s", 4*pages[i1], (i1<max_nonempty_page ? "," : ""));
        if ((i1 % 2) == 1 || i1 == max_nonempty_page)
          fprintf (stream, "\n");
      }
    fprintf (stream, "};\n");
    fprintf (stream, "\n");
  }

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
612
613 /* ========================================================================= */
614
615 int
main(int argc,char * argv[])616 main (int argc, char * argv[])
617 {
618 const char *unicodedata_filename;
619 const char *version;
620
621 if (argc != 3)
622 {
623 fprintf (stderr, "Usage: %s UnicodeData.txt version\n",
624 argv[0]);
625 exit (1);
626 }
627
628 unicodedata_filename = argv[1];
629 version = argv[2];
630
631 fill_attributes (unicodedata_filename);
632
633 output_casemapping ("uni_upcase.c", "up", CL_to_upper, version);
634 output_casemapping ("uni_downcase.c", "down", CL_to_lower, version);
635 output_attributes ("uni_attribute.c", version);
636
637 return 0;
638 }
639