1 /* Generate Unicode conforming character classification tables from a
2    UnicodeData file.
3    Copyright (C) 2000-2002 Free Software Foundation, Inc.
4    Written by Bruno Haible <bruno@clisp.org>, 2000-2002.
5 
6    This program is free software; you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 2, or (at your option)
9    any later version.
10 
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15 
16    You should have received a copy of the GNU General Public License
17    along with this program; if not, write to the Free Software Foundation,
18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
19 
20 /* See also: gen-ctype.c in libunistring, gen-unicode-ctype.c in glibc.  */
21 
22 /* Usage example:
23      $ gen-ctype /usr/local/share/Unidata/UnicodeData.txt 3.1.0
24  */
25 
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <stdbool.h>
29 #include <stdint.h>
30 #include <string.h>
31 #include <time.h>
32 
33 /* ========================================================================= */
34 
35 /* Reading UnicodeData.txt.  */
36 /* See UnicodeData-3.1.0.html.  */
37 
/* The attributes of one code point, as given by one line of the
   UnicodeData.txt file.  The NAME member is NULL for code points that
   have no entry in the file.  */
struct unicode_attribute
{
  const char *name;           /* Field 1: character name */
  const char *category;       /* Field 2: general category, e.g. "Lu" */
  const char *combining;      /* Field 3: canonical combining class */
  const char *bidi;           /* Field 4: bidirectional category */
  const char *decomposition;  /* Field 5: character decomposition mapping */
  const char *decdigit;       /* Field 6: decimal digit value */
  const char *digit;          /* Field 7: digit value */
  const char *numeric;        /* Field 8: numeric value */
  bool mirrored;              /* Field 9: true if the field starts with 'Y' */
  const char *oldname;        /* Field 10: old Unicode 1.0 name */
  const char *comment;        /* Field 11: comment */
  unsigned int upper;         /* Field 12: uppercase mapping */
  unsigned int lower;         /* Field 13: lowercase mapping */
  unsigned int title;         /* Field 14: titlecase mapping */
};
56 
57 /* Missing fields are represented with "" for strings, and NONE for
58    characters.  */
59 #define NONE (~(unsigned int)0)
60 
61 /* The entire contents of the UnicodeData.txt file.  */
62 struct unicode_attribute unicode_attributes [0x110000];
63 
64 /* Stores in unicode_attributes[i] the values from the given fields.  */
65 static void
fill_attribute(unsigned int i,const char * field1,const char * field2,const char * field3,const char * field4,const char * field5,const char * field6,const char * field7,const char * field8,const char * field9,const char * field10,const char * field11,const char * field12,const char * field13,const char * field14)66 fill_attribute (unsigned int i,
67 		const char *field1, const char *field2,
68 		const char *field3, const char *field4,
69 		const char *field5, const char *field6,
70 		const char *field7, const char *field8,
71 		const char *field9, const char *field10,
72 		const char *field11, const char *field12,
73 		const char *field13, const char *field14)
74 {
75   struct unicode_attribute * uni;
76 
77   if (i >= 0x110000)
78     {
79       fprintf (stderr, "index too large\n");
80       exit (1);
81     }
82   if (strcmp (field2, "Cs") == 0)
83     /* Surrogates are UTF-16 artefacts, not real characters. Ignore them.  */
84     return;
85   uni = &unicode_attributes[i];
86   /* Copy the strings.  */
87   uni->name          = strdup (field1);
88   uni->category      = (field2[0] == '\0' ? "" : strdup (field2));
89   uni->combining     = (field3[0] == '\0' ? "" : strdup (field3));
90   uni->bidi          = (field4[0] == '\0' ? "" : strdup (field4));
91   uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5));
92   uni->decdigit      = (field6[0] == '\0' ? "" : strdup (field6));
93   uni->digit         = (field7[0] == '\0' ? "" : strdup (field7));
94   uni->numeric       = (field8[0] == '\0' ? "" : strdup (field8));
95   uni->mirrored      = (field9[0] == 'Y');
96   uni->oldname       = (field10[0] == '\0' ? "" : strdup (field10));
97   uni->comment       = (field11[0] == '\0' ? "" : strdup (field11));
98   uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16));
99   uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16));
100   uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16));
101 }
102 
/* Size of the buffers that receive one field of the UnicodeData.txt
   file.  */
#define FIELDLEN 120

/* Reads the next field from STREAM into BUFFER, which has room for
   FIELDLEN bytes.  The field extends up to, but does not include, the
   next occurrence of DELIM; the delimiter itself is consumed.
   Returns 1 when a field was read and NUL-terminated, or 0 when the end
   of the stream (or a read error) was met before DELIM.  In the latter
   case BUFFER is not NUL-terminated.
   Terminates the program when the field does not fit.  */
static int
getfield (FILE *stream, char *buffer, int delim)
{
  char *out = buffer;
  int stored = 0;

  for (;;)
    {
      int ch = getc (stream);

      if (ch == EOF)
	return 0;
      if (ch == delim)
	break;

      /* The original unicode.org UnicodeData.txt file happens to have
	 CR/LF line terminators.  Drop the CR so that only LF remains.  */
      if (ch == '\r')
	continue;

      /* Append ch to the buffer.  */
      stored++;
      if (stored >= FIELDLEN - 1)
	{
	  fprintf (stderr, "field too long\n");
	  exit (1);
	}
      *out++ = (char) ch;
    }

  *out = '\0';
  return 1;
}
137 
138 /* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt
139    file.  */
140 static void
fill_attributes(const char * unicodedata_filename)141 fill_attributes (const char *unicodedata_filename)
142 {
143   unsigned int i, j;
144   FILE *stream;
145   char field0[FIELDLEN];
146   char field1[FIELDLEN];
147   char field2[FIELDLEN];
148   char field3[FIELDLEN];
149   char field4[FIELDLEN];
150   char field5[FIELDLEN];
151   char field6[FIELDLEN];
152   char field7[FIELDLEN];
153   char field8[FIELDLEN];
154   char field9[FIELDLEN];
155   char field10[FIELDLEN];
156   char field11[FIELDLEN];
157   char field12[FIELDLEN];
158   char field13[FIELDLEN];
159   char field14[FIELDLEN];
160   int lineno = 0;
161 
162   for (i = 0; i < 0x110000; i++)
163     unicode_attributes[i].name = NULL;
164 
165   stream = fopen (unicodedata_filename, "r");
166   if (stream == NULL)
167     {
168       fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
169       exit (1);
170     }
171 
172   for (;;)
173     {
174       int n;
175 
176       lineno++;
177       n = getfield (stream, field0, ';');
178       n += getfield (stream, field1, ';');
179       n += getfield (stream, field2, ';');
180       n += getfield (stream, field3, ';');
181       n += getfield (stream, field4, ';');
182       n += getfield (stream, field5, ';');
183       n += getfield (stream, field6, ';');
184       n += getfield (stream, field7, ';');
185       n += getfield (stream, field8, ';');
186       n += getfield (stream, field9, ';');
187       n += getfield (stream, field10, ';');
188       n += getfield (stream, field11, ';');
189       n += getfield (stream, field12, ';');
190       n += getfield (stream, field13, ';');
191       n += getfield (stream, field14, '\n');
192       if (n == 0)
193 	break;
194       if (n != 15)
195 	{
196 	  fprintf (stderr, "short line in'%s':%d\n",
197 		   unicodedata_filename, lineno);
198 	  exit (1);
199 	}
200       i = strtoul (field0, NULL, 16);
201       if (field1[0] == '<'
202 	  && strlen (field1) >= 9
203 	  && !strcmp (field1 + strlen(field1) - 8, ", First>"))
204 	{
205 	  /* Deal with a range. */
206 	  lineno++;
207 	  n = getfield (stream, field0, ';');
208 	  n += getfield (stream, field1, ';');
209 	  n += getfield (stream, field2, ';');
210 	  n += getfield (stream, field3, ';');
211 	  n += getfield (stream, field4, ';');
212 	  n += getfield (stream, field5, ';');
213 	  n += getfield (stream, field6, ';');
214 	  n += getfield (stream, field7, ';');
215 	  n += getfield (stream, field8, ';');
216 	  n += getfield (stream, field9, ';');
217 	  n += getfield (stream, field10, ';');
218 	  n += getfield (stream, field11, ';');
219 	  n += getfield (stream, field12, ';');
220 	  n += getfield (stream, field13, ';');
221 	  n += getfield (stream, field14, '\n');
222 	  if (n != 15)
223 	    {
224 	      fprintf (stderr, "missing end range in '%s':%d\n",
225 		       unicodedata_filename, lineno);
226 	      exit (1);
227 	    }
228 	  if (!(field1[0] == '<'
229 		&& strlen (field1) >= 8
230 		&& !strcmp (field1 + strlen (field1) - 7, ", Last>")))
231 	    {
232 	      fprintf (stderr, "missing end range in '%s':%d\n",
233 		       unicodedata_filename, lineno);
234 	      exit (1);
235 	    }
236 	  field1[strlen (field1) - 7] = '\0';
237 	  j = strtoul (field0, NULL, 16);
238 	  for (; i <= j; i++)
239 	    fill_attribute (i, field1+1, field2, field3, field4, field5,
240 			       field6, field7, field8, field9, field10,
241 			       field11, field12, field13, field14);
242 	}
243       else
244 	{
245 	  /* Single character line */
246 	  fill_attribute (i, field1, field2, field3, field4, field5,
247 			     field6, field7, field8, field9, field10,
248 			     field11, field12, field13, field14);
249 	}
250     }
251   if (ferror (stream) || fclose (stream))
252     {
253       fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
254       exit (1);
255     }
256 }
257 
258 /* ========================================================================= */
259 
260 /* Character mappings.  */
261 
262 static unsigned int
to_upper(unsigned int ch)263 to_upper (unsigned int ch)
264 {
265   if (unicode_attributes[ch].name != NULL
266       && unicode_attributes[ch].upper != NONE)
267     return unicode_attributes[ch].upper;
268   else
269     return ch;
270 }
271 
272 static unsigned int
to_lower(unsigned int ch)273 to_lower (unsigned int ch)
274 {
275   if (unicode_attributes[ch].name != NULL
276       && unicode_attributes[ch].lower != NONE)
277     return unicode_attributes[ch].lower;
278   else
279     return ch;
280 }
281 
/* Common Lisp only wants the bijective lower/upper case conversions.
   Returns true if CH belongs to a pair (upper, lower) of distinct
   characters such that CH is one of the two, upper is mapped to lower by
   to_lower, lower is mapped to upper by to_upper, and each of the two is
   left unchanged by the conversion towards its own case.  */
static bool
is_CL_both_case (unsigned int ch)
{
  unsigned int upper = to_upper (ch);
  unsigned int lower = to_lower (ch);

  /* ch must itself be one member of the pair ...  */
  if (upper != ch && lower != ch)
    return false;
  /* ... and the pair must consist of two different characters.  */
  if (upper == lower)
    return false;
  /* Both members must convert to the upper one ...  */
  if (to_upper (upper) != upper || to_upper (lower) != upper)
    return false;
  /* ... and to the lower one.  */
  if (to_lower (upper) != lower || to_lower (lower) != lower)
    return false;
  return true;
}
295 
/* Diagnostic helper: prints to stdout the characters whose case mappings
   look inconsistent.  It is not called from the code visible in this
   file.
   NOTE(review): the four inner reports can never fire as written, because
   is_CL_both_case (ch) already guarantees the conditions they test.  The
   outer test was presumably meant to be weaker; confirm the intent before
   relying on this function.  */
static void
check_bothcase (void)
{
  unsigned int ch;

  for (ch = 0; ch < 0x110000; ch++)
    {
      unsigned int ch1 = to_upper (ch);
      unsigned int ch2 = to_lower (ch);

      if (!(ch1 == ch || ch2 == ch))
        printf ("character 0x%04x is neither upper nor lower case\n", ch);
      else if (is_CL_both_case (ch))
        {
          if (ch == ch1)
            {
              /* ch is upper case */
              if (to_lower (ch2) != ch2)
                printf ("character 0x%04x has a lower case mapping 0x%04x which is not lower case\n", ch, ch2);
              else if (to_upper (ch2) != ch)
                printf ("character 0x%04x has a lower case mapping 0x%04x whose upper case mapping isn't the first character\n", ch, ch2);
            }
          else if (ch == ch2)
            {
              /* ch is lower case */
              if (to_upper (ch1) != ch1)
                printf ("character 0x%04x has an upper case mapping 0x%04x which is not upper case\n", ch, ch1);
              else if (to_lower (ch1) != ch)
                printf ("character 0x%04x has an upper case mapping 0x%04x whose lower case mapping isn't the first character\n", ch, ch1);
            }
        }
    }
}
327 
/* Uppercase conversion restricted to the bijective case pairs: returns
   to_upper (CH) if CH is part of such a pair, otherwise CH unchanged.  */
static unsigned int
CL_to_upper (unsigned int ch)
{
  if (!is_CL_both_case (ch))
    return ch;
  return to_upper (ch);
}
333 
/* Lowercase conversion restricted to the bijective case pairs: returns
   to_lower (CH) if CH is part of such a pair, otherwise CH unchanged.  */
static unsigned int
CL_to_lower (unsigned int ch)
{
  if (!is_CL_both_case (ch))
    return ch;
  return to_lower (ch);
}
339 
/* Writes to the file FILENAME a C source fragment with a two-level table
   for the character mapping MAP.
   MAPNAME is the prefix of the generated identifiers and is also used in
   the header comment ("up" gives "upcase"); VERSION is the Unicode
   version mentioned in the header comment.
   Every 256-character page on which MAP changes at least one character
   becomes an array of 16-bit offsets MAP(ch) - ch.  All other pages are
   represented by the identifier nop_page, which the generated file does
   not define (presumably the including code provides it).
   Terminates the program if the file cannot be written or if an offset
   does not fit in 15 bits plus sign.  */
static void
output_casemapping (const char *filename,
                    const char *mapname, unsigned int (*map) (unsigned int),
                    const char *version)
{
  FILE *stream;
  bool pages[0x1100];
  unsigned int max_nonempty_page = 0;
  unsigned int p;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (stream, "/*\n");
  fprintf (stream, " * %s\n", filename);
  fprintf (stream, " *\n");
  fprintf (stream, " * Common Lisp %scase table.\n", mapname);
  fprintf (stream, " * Generated automatically by the gen-ctype utility for Unicode %s.\n", version);
  fprintf (stream, " */\n");
  fprintf (stream, "\n");

  /* Determine which pages contain a character that MAP changes.  */
  for (p = 0; p < 0x1100; p++)
    {
      unsigned int i1;

      pages[p] = false;
      for (i1 = 0; i1 < 0x100; i1++)
        {
          unsigned int ch = (p << 8) + i1;

          if (map (ch) != ch)
            {
              pages[p] = true;
              max_nonempty_page = p;
              break;
            }
        }
    }

  /* Emit one array per nonempty page, 8 entries per line.  */
  for (p = 0; p < 0x1100; p++)
    {
      unsigned int i1, i2;

      if (!pages[p])
        continue;

      fprintf (stream, "static const uint16 %s_case_table_page%02x[256] = {\n", mapname, p);
      for (i1 = 0; i1 < 32; i1++)
        {
          fprintf (stream, "  ");
          for (i2 = 0; i2 < 8; i2++)
            {
              unsigned int ch = 256*p + 8*i1 + i2;
              unsigned int ch2 = map (ch);
              /* Compute the difference in a wider signed type, so that
                 a very large mapping value cannot wrap around into the
                 accepted range.  */
              long long j = (long long) ch2 - (long long) ch;

              if (j > 0x7fff || j < -0x7fff)
                {
                  fprintf (stderr, "%scase(0x%04x) differs from 0x%04x by more than 15 bits\n", mapname, ch, ch);
                  exit (1);
                }
              /* Negative offsets are stored modulo 2^16.  */
              fprintf (stream, "0x%04x%s ", (unsigned int) (j & 0xffff), (8*i1+i2<255 ? "," : " "));
            }
          fprintf (stream, "/* 0x%02x-0x%02x */\n", 8*i1, 8*i1+7);
        }
      fprintf (stream, "};\n");
      fprintf (stream, "\n");
    }

  /* Emit the top-level table, 4 page pointers per line.  It stops at the
     last nonempty page.  */
  fprintf (stream, "static const uint16 * const %s_case_table[%u] = {\n", mapname, max_nonempty_page+1);
  for (p = 0; p <= max_nonempty_page; p++)
    {
      if ((p % 4) == 0)
        fprintf (stream, "  ");
      if (pages[p])
        fprintf (stream, "%s_case_table_page%02x", mapname, p);
      else
        fprintf (stream, "nop_page");
      fprintf (stream, "%s ", (p<max_nonempty_page ? "," : " "));
      if ((p % 4) == 3 || p == max_nonempty_page)
        fprintf (stream, "/* 0x%02x-0x%02x */\n", 4*(p/4), p);
    }
  fprintf (stream, "};\n");
  fprintf (stream, "\n");

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
434 
435 /* ========================================================================= */
436 
437 /* Character class properties.  */
438 
439 static bool
is_graphic(unsigned int ch)440 is_graphic (unsigned int ch)
441 {
442   return (unicode_attributes[ch].name != NULL
443 	  && strcmp (unicode_attributes[ch].name, "<control>"));
444 }
445 
446 static bool
is_alpha(unsigned int ch)447 is_alpha (unsigned int ch)
448 {
449   return (unicode_attributes[ch].name != NULL
450 	  && ((unicode_attributes[ch].category[0] == 'L'
451 	       /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
452 		  <U0E2F>, <U0E46> should belong to is_punct.  */
453 	       && (ch != 0x0E2F) && (ch != 0x0E46))
454 	      /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
455 		 <U0E31>, <U0E34>..<U0E3A>, <U0E47>..<U0E4E> are is_alpha.  */
456 	      || (ch == 0x0E31)
457 	      || (ch >= 0x0E34 && ch <= 0x0E3A)
458 	      || (ch >= 0x0E47 && ch <= 0x0E4E)
459 #if 0
460 	      /* Avoid warning for <U0345>.  */
461 	      || (ch == 0x0345)
462 	      /* Avoid warnings for <U2160>..<U217F>.  */
463 	      || (unicode_attributes[ch].category[0] == 'N'
464 		  && unicode_attributes[ch].category[1] == 'l')
465 	      /* Avoid warnings for <U24B6>..<U24E9>.  */
466 	      || (unicode_attributes[ch].category[0] == 'S'
467 		  && unicode_attributes[ch].category[1] == 'o'
468 		  && strstr (unicode_attributes[ch].name, " LETTER ")
469 		     != NULL)
470 #endif
471          )   );
472 }
473 
474 static bool
is_numeric(unsigned int ch)475 is_numeric (unsigned int ch)
476 {
477   return (unicode_attributes[ch].name != NULL
478 	  && unicode_attributes[ch].category[0] == 'N'
479 	  && unicode_attributes[ch].category[1] == 'd');
480 }
481 
/* Writes to the file FILENAME a C source fragment with a two-level table
   that classifies every character as non-graphic, graphic but not
   alphanumeric, numeric, or alphabetic.  VERSION is the Unicode version
   mentioned in the header comment.
   The classification takes 2 bits per character.  Four characters are
   packed into each byte, and the bytes are grouped into pages of 256
   bytes, i.e. 1024 characters.  Pages with identical contents are emitted
   only once.
   Inconsistent classifications are reported on stderr but do not stop
   the generation.  Terminates the program if the file cannot be
   written.  */
static void
output_attributes (const char *filename, const char *version)
{
  enum
    {
      NON_GRAPHIC = 0,
      GRAPHIC_NON_ALPHANUMERIC = 1,
      NUMERIC = 2,
      ALPHABETIC = 3
    };
  /* The work tables total about 1.4 MB.  They are static so that they do
     not live on the stack, which may be smaller than that.  Each of them
     is completely rewritten before it is read.  */
  static unsigned char attribute[0x110000];
  static unsigned char attribute_packed[0x44000];
  static unsigned int pages[0x440];
  FILE *stream;
  unsigned int max_nonempty_page;

  /* Compute the attribute table.  */
  {
    unsigned int ch;
    for (ch = 0; ch < 0x110000; ch++)
      {
        bool graphic = is_graphic (ch);
        bool alpha = is_alpha (ch);
        bool numeric = is_numeric (ch);
        if (alpha && numeric)
          fprintf (stderr, "Character 0x%04x is both alpha and numeric\n", ch);
        if (alpha && !graphic)
          fprintf (stderr, "Character 0x%04x is alpha but not graphic\n", ch);
        if (numeric && !graphic)
          fprintf (stderr, "Character 0x%04x is numeric but not graphic\n", ch);
        /* Alphabetic takes precedence over numeric, which takes
           precedence over merely graphic.  */
        attribute[ch] = (alpha ? ALPHABETIC :
                         numeric ? NUMERIC :
                         graphic ? GRAPHIC_NON_ALPHANUMERIC :
                         NON_GRAPHIC
                        );
      }
  }

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (stream, "/*\n");
  fprintf (stream, " * %s\n", filename);
  fprintf (stream, " *\n");
  fprintf (stream, " * Common Lisp character attribute table.\n");
  fprintf (stream, " * Generated automatically by the gen-ctype utility for Unicode %s.\n", version);
  fprintf (stream, " */\n");
  fprintf (stream, "\n");

  /* Pack the table, 4 entries into each byte.  The entry of the lowest
     character goes into the lowest two bits.  */
  {
    unsigned int i1, i2;
    for (i1 = 0; i1 < 0x44000; i1++)
      {
        unsigned char b = 0;
        for (i2 = 0; i2 < 4; i2++)
          b |= attribute[4*i1+i2] << (2*i2);
        attribute_packed[i1] = b;
      }
  }

  /* Remove duplicate pages, to save space: pages[i] becomes the index of
     the first page with the same contents as page i, which is i itself
     when there is no earlier one.  */
  {
    unsigned int i, j;
    for (i = 0; i < 0x440; i++)
      {
        for (j = 0; j < i; j++)
          if (memcmp (&attribute_packed[j<<8], &attribute_packed[i<<8], 1<<8) == 0)
            break;
        pages[i] = j;
      }
  }

  /* Find the last page that contains a nonzero byte.  */
  max_nonempty_page = 0;
  {
    unsigned int i1, i2;
    for (i1 = 0; i1 < 0x440; i1++)
      for (i2 = 0; i2 < 0x100; i2++)
        if (attribute_packed[(i1<<8)+i2] != 0)
          {
            max_nonempty_page = i1;
            break;
          }
  }

  /* Emit every page that is the first one with its contents.  The number
     in the array name is 4 times the page index.  */
  {
    unsigned int i1, i2a, i2b;
    for (i1 = 0; i1 < 0x440; i1++)
      if (pages[i1] == i1)
        {
          fprintf (stream, "static const uintB unicode_attribute_table_page%02x[256] = {\n", 4*i1);
          for (i2a = 0; i2a < 32; i2a++)
            {
              fprintf (stream, "  ");
              for (i2b = 0; i2b < 8; i2b++)
                {
                  unsigned int i2 = 8*i2a + i2b;
                  fprintf (stream, "0x%02x%s ", (unsigned int) attribute_packed[(i1<<8)+i2], (i2<255 ? "," : " "));
                }
              /* The comment gives the range of characters of this line.  */
              fprintf (stream, "/* 0x%04x-0x%04x */\n", 4*((i1<<8)+(8*i2a)), 4*((i1<<8)+(8*i2a+7))+3);
            }
          fprintf (stream, "};\n");
          fprintf (stream, "\n");
        }
  }

  /* Emit the top-level table, 2 page pointers per line.  It stops at the
     last nonempty page.  */
  {
    unsigned int i1;
    fprintf (stream, "const uintB * const unicode_attribute_table[%u] = {\n", max_nonempty_page+1);
    for (i1 = 0; i1 <= max_nonempty_page; i1++)
      {
        if ((i1 % 2) == 0)
          fprintf (stream, " ");
        fprintf (stream, " unicode_attribute_table_page%02x%s", 4*pages[i1], (i1<max_nonempty_page ? "," : ""));
        if ((i1 % 2) == 1 || i1 == max_nonempty_page)
          fprintf (stream, "\n");
      }
    fprintf (stream, "};\n");
    fprintf (stream, "\n");
  }

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
612 
613 /* ========================================================================= */
614 
615 int
main(int argc,char * argv[])616 main (int argc, char * argv[])
617 {
618   const char *unicodedata_filename;
619   const char *version;
620 
621   if (argc != 3)
622     {
623       fprintf (stderr, "Usage: %s UnicodeData.txt version\n",
624 	       argv[0]);
625       exit (1);
626     }
627 
628   unicodedata_filename = argv[1];
629   version = argv[2];
630 
631   fill_attributes (unicodedata_filename);
632 
633   output_casemapping ("uni_upcase.c", "up", CL_to_upper, version);
634   output_casemapping ("uni_downcase.c", "down", CL_to_lower, version);
635   output_attributes ("uni_attribute.c", version);
636 
637   return 0;
638 }
639