xref: /netbsd/external/gpl3/gcc/dist/libcpp/makeucnid.c (revision dd083157)
1 /* Make ucnid.h from various sources.
2    Copyright (C) 2005-2020 Free Software Foundation, Inc.
3 
4 This program is free software; you can redistribute it and/or modify it
5 under the terms of the GNU General Public License as published by the
6 Free Software Foundation; either version 3, or (at your option) any
7 later version.
8 
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 GNU General Public License for more details.
13 
14 You should have received a copy of the GNU General Public License
15 along with this program; see the file COPYING3.  If not see
16 <http://www.gnu.org/licenses/>.  */
17 
18 /* Run this program as
19    ./makeucnid ucnid.tab UnicodeData.txt DerivedNormalizationProps.txt \
20        > ucnid.h
21 */
22 
23 #include <stdio.h>
24 #include <string.h>
25 #include <ctype.h>
26 #include <stdbool.h>
27 #include <stdlib.h>
28 
/* Property bits kept for every code point in flags[].  */
enum {
  C99 = 1 << 0,		/* Listed under [C99] or [C99DIG] in ucnid.tab.  */
  CXX = 1 << 1,		/* Listed under [CXX].  */
  N99 = 1 << 2,		/* Set in addition to C99 for [C99DIG] entries.  */
  C11 = 1 << 3,		/* Listed under [C11] or [C11NOSTART].  */
  N11 = 1 << 4,		/* Set in addition to C11 for [C11NOSTART].  */
  all_languages = C99 | CXX | C11,
  not_NFC = 1 << 5,	/* Line matched "; NFC_QC; N".  */
  not_NFKC = 1 << 6,	/* Line matched "; NFKC_QC; N".  */
  maybe_not_NFC = 1 << 7	/* Line matched "; NFC_QC; M".  */
};

/* Largest code point accepted, and the size of the per-code-point
   tables below.  */
#define MAX_CODE_POINT 0x10ffff
#define NUM_CODE_POINTS (MAX_CODE_POINT + 1)

/* Flag bits from the enum above, indexed by code point.  */
static unsigned int flags[NUM_CODE_POINTS];
/* Canonical decomposition (at most two code points, zero if absent)
   of every code point read from UnicodeData.txt.  */
static unsigned int all_decomp[NUM_CODE_POINTS][2];
/* The subset of all_decomp that read_table considers useful.  */
static unsigned int decomp[NUM_CODE_POINTS][2];
/* Canonical combining class of every code point.  */
static unsigned char combining_value[NUM_CODE_POINTS];
48 
/* Print the message S followed by a newline on stderr, then terminate
   the program with exit status 1.  Does not return.  */

static void
fail (const char *s)
{
  fputs (s, stderr);
  fputc ('\n', stderr);
  exit (1);
}
57 
58 /* Read ucnid.tab and set the flags for language versions in header[].  */
59 
60 static void
read_ucnid(const char * fname)61 read_ucnid (const char *fname)
62 {
63   FILE *f = fopen (fname, "r");
64   unsigned fl = 0;
65 
66   if (!f)
67     fail ("opening ucnid.tab");
68   for (;;)
69     {
70       char line[256];
71 
72       if (!fgets (line, sizeof (line), f))
73 	break;
74       if (strcmp (line, "[C99]\n") == 0)
75 	fl = C99;
76       else if (strcmp (line, "[C99DIG]\n") == 0)
77 	fl = C99|N99;
78       else if (strcmp (line, "[CXX]\n") == 0)
79 	fl = CXX;
80       else if (strcmp (line, "[C11]\n") == 0)
81 	fl = C11;
82       else if (strcmp (line, "[C11NOSTART]\n") == 0)
83 	fl = C11|N11;
84       else if (isxdigit (line[0]))
85 	{
86 	  char *l = line;
87 	  while (*l)
88 	    {
89 	      unsigned long start, end;
90 	      char *endptr;
91 	      start = strtoul (l, &endptr, 16);
92 	      if (endptr == l || (*endptr != '-' && ! isspace (*endptr)))
93 		fail ("parsing ucnid.tab [1]");
94 	      l = endptr;
95 	      if (*l != '-')
96 		end = start;
97 	      else
98 		{
99 		  end = strtoul (l + 1, &endptr, 16);
100 		  if (end < start)
101 		    fail ("parsing ucnid.tab, end before start");
102 		  l = endptr;
103 		  if (! isspace (*l))
104 		    fail ("parsing ucnid.tab, junk after range");
105 		}
106 	      while (isspace (*l))
107 		l++;
108 	      if (end > MAX_CODE_POINT)
109 		fail ("parsing ucnid.tab, end too large");
110 	      while (start <= end)
111 		flags[start++] |= fl;
112 	    }
113 	}
114     }
115   if (ferror (f))
116     fail ("reading ucnid.tab");
117   fclose (f);
118 }
119 
120 /* Read UnicodeData.txt and fill in the 'decomp' table to be the
121    decompositions of characters for which both the character
122    decomposed and all the code points in the decomposition are valid
123    for some supported language version, and the 'all_decomp' table to
124    be the decompositions of all characters without those
125    constraints.  */
126 
127 static void
read_table(char * fname)128 read_table (char *fname)
129 {
130   FILE * f = fopen (fname, "r");
131 
132   if (!f)
133     fail ("opening UnicodeData.txt");
134   for (;;)
135     {
136       char line[256];
137       unsigned long codepoint, this_decomp[4];
138       char *l;
139       int i, j;
140       int decomp_useful;
141 
142       if (!fgets (line, sizeof (line), f))
143 	break;
144       codepoint = strtoul (line, &l, 16);
145       if (l == line || *l != ';')
146 	fail ("parsing UnicodeData.txt, reading code point");
147       if (codepoint > MAX_CODE_POINT)
148 	fail ("parsing UnicodeData.txt, code point too large");
149 
150       do {
151 	l++;
152       } while (*l != ';');
153       /* Category value.  */
154       do {
155 	l++;
156       } while (*l != ';');
157       /* Canonical combining class; in NFC/NFKC, they must be increasing
158 	 (or zero).  */
159       if (! isdigit (*++l))
160 	fail ("parsing UnicodeData.txt, combining class not number");
161       combining_value[codepoint] = strtoul (l, &l, 10);
162       if (*l++ != ';')
163 	fail ("parsing UnicodeData.txt, junk after combining class");
164 
165       /* Skip over bidi value.  */
166       do {
167 	l++;
168       } while (*l != ';');
169 
170       /* Decomposition mapping.  */
171       decomp_useful = flags[codepoint];
172       if (*++l == '<')  /* Compatibility mapping. */
173 	continue;
174       for (i = 0; i < 4; i++)
175 	{
176 	  if (*l == ';')
177 	    break;
178 	  if (!isxdigit (*l))
179 	    fail ("parsing UnicodeData.txt, decomposition format");
180 	  this_decomp[i] = strtoul (l, &l, 16);
181 	  decomp_useful &= flags[this_decomp[i]];
182 	  while (isspace (*l))
183 	    l++;
184 	}
185       if (i > 2)  /* Decomposition too long.  */
186 	fail ("parsing UnicodeData.txt, decomposition too long");
187       for (j = 0; j < i; j++)
188 	all_decomp[codepoint][j] = this_decomp[j];
189       if ((flags[codepoint] & all_languages) && decomp_useful)
190 	while (--i >= 0)
191 	  decomp[codepoint][i] = this_decomp[i];
192     }
193   if (ferror (f))
194     fail ("reading UnicodeData.txt");
195   fclose (f);
196 }
197 
198 /* Read DerivedNormalizationProps.txt and set the flags that say whether
199    a character is in NFC, NFKC, or is context-dependent.  */
200 
201 static void
read_derived(const char * fname)202 read_derived (const char *fname)
203 {
204   FILE * f = fopen (fname, "r");
205 
206   if (!f)
207     fail ("opening DerivedNormalizationProps.txt");
208   for (;;)
209     {
210       char line[256];
211       unsigned long start, end;
212       char *l;
213       bool not_NFC_p, not_NFKC_p, maybe_not_NFC_p;
214 
215       if (!fgets (line, sizeof (line), f))
216 	break;
217       not_NFC_p = (strstr (line, "; NFC_QC; N") != NULL);
218       not_NFKC_p = (strstr (line, "; NFKC_QC; N") != NULL);
219       maybe_not_NFC_p = (strstr (line, "; NFC_QC; M") != NULL);
220       if (! not_NFC_p && ! not_NFKC_p && ! maybe_not_NFC_p)
221 	continue;
222 
223       start = strtoul (line, &l, 16);
224       if (l == line)
225 	fail ("parsing DerivedNormalizationProps.txt, reading start");
226       if (start > MAX_CODE_POINT)
227 	fail ("parsing DerivedNormalizationProps.txt, code point too large");
228       if (*l == '.' && l[1] == '.')
229 	end = strtoul (l + 2, &l, 16);
230       else
231 	end = start;
232 
233       while (start <= end)
234 	flags[start++] |= ((not_NFC_p ? not_NFC : 0)
235 			   | (not_NFKC_p ? not_NFKC : 0)
236 			   | (maybe_not_NFC_p ? maybe_not_NFC : 0)
237 			   );
238     }
239   if (ferror (f))
240     fail ("reading DerivedNormalizationProps.txt");
241   fclose (f);
242 }
243 
244 /* Write out the table.
245    The table consists of two words per entry.  The first word is the flags
246    for the unicode code points up to and including the second word.  */
247 
248 static void
write_table(void)249 write_table (void)
250 {
251   unsigned i;
252   unsigned last_flag = flags[0];
253   bool really_safe = decomp[0][0] == 0;
254   unsigned char last_combine = combining_value[0];
255 
256   printf ("static const struct ucnrange ucnranges[] = {\n");
257 
258   for (i = 1; i <= NUM_CODE_POINTS; i++)
259     if (i == NUM_CODE_POINTS
260 	|| (flags[i] != last_flag && ((flags[i] | last_flag) & all_languages))
261 	|| really_safe != (decomp[i][0] == 0)
262 	|| combining_value[i] != last_combine)
263       {
264 	printf ("{ %s|%s|%s|%s|%s|%s|%s|%s|%s, %3d, %#06x },\n",
265 		last_flag & C99 ? "C99" : "  0",
266 		last_flag & N99 ? "N99" : "  0",
267 		last_flag & CXX ? "CXX" : "  0",
268 		last_flag & C11 ? "C11" : "  0",
269 		last_flag & N11 ? "N11" : "  0",
270 		really_safe ? "CID" : "  0",
271 		last_flag & not_NFC ? "  0" : "NFC",
272 		last_flag & not_NFKC ? "  0" : "NKC",
273 		last_flag & maybe_not_NFC ? "CTX" : "  0",
274 		combining_value[i - 1],
275 		i - 1);
276 	last_flag = flags[i];
277 	last_combine = combining_value[0];
278 	really_safe = decomp[i][0] == 0;
279       }
280 
281   printf ("};\n");
282 }
283 
284 /* Return whether a given character is valid in an identifier for some
285    supported language, either as itself or as a UCN.  */
286 
287 static bool
char_id_valid(unsigned int c)288 char_id_valid (unsigned int c)
289 {
290   return ((flags[c] & all_languages)
291 	  || (c == 0x24)
292 	  || (c >= 0x30 && c <= 0x39)
293 	  || (c >= 0x41 && c <= 0x5a)
294 	  || (c >= 0x61 && c <= 0x7a));
295 }
296 
297 /* Write out the switch statement over characters for which it is
298    context-dependent whether they are in NFC.  */
299 
300 static void
write_context_switch(void)301 write_context_switch (void)
302 {
303   unsigned i;
304   printf ("static bool\n"
305 	  "check_nfc (cpp_reader *pfile, cppchar_t c, cppchar_t p)\n"
306 	  "{\n"
307 	  "  switch (c)\n"
308 	  "    {\n");
309   for (i = 0; i < NUM_CODE_POINTS; i++)
310     {
311       bool found_case = false;
312       unsigned j;
313       if (!(flags[i] & all_languages) || !(flags[i] & maybe_not_NFC))
314 	continue;
315       if ((i >= 0x1161 && i <= 0x1175) || (i >= 0x11A8 && i <= 0x11C2))
316 	continue; /* Hangul handled algorithmically.  */
317       printf ("    case %#06x:\n"
318 	      "      switch (p)\n"
319 	      "\t{\n", i);
320       /* If an NFC starter character decomposes with this character I
321 	 as the second character and an NFC starter character S as the
322 	 first character, that latter character as a previous
323 	 character means this character is not NFC.  Furthermore, any
324 	 NFC starter character K made by a series of compositions of S
325 	 with combining characters whose combining class is greater
326 	 than that of I also means this character is not NFC.  */
327       for (j = 0; j < NUM_CODE_POINTS; j++)
328 	{
329 	  unsigned s, k;
330 	  if (all_decomp[j][1] != i)
331 	    continue;
332 	  s = all_decomp[j][0];
333 	  if (combining_value[s] != 0 || (flags[s] & not_NFC) != 0)
334 	    continue;
335 	  if (char_id_valid (s))
336 	    {
337 	      found_case = true;
338 	      printf ("\tcase %#06x:\n", s);
339 	    }
340 	  for (k = 0; k < NUM_CODE_POINTS; k++)
341 	    {
342 	      unsigned t = k;
343 	      if (k == s || !char_id_valid (k))
344 		continue;
345 	      while (all_decomp[t][1] != 0
346 		     && combining_value[all_decomp[t][1]] > combining_value[i])
347 		{
348 		  if (combining_value[t] != 0 || (flags[t] & not_NFC) != 0)
349 		    break;
350 		  t = all_decomp[t][0];
351 		}
352 	      if (t == s)
353 		{
354 		  found_case = true;
355 		  printf ("\tcase %#06x:\n", k);
356 		}
357 	    }
358 	}
359       if (found_case)
360 	printf ("\t  return false;\n");
361       else
362 	printf ("\t/* Non-NFC cases not applicable to C/C++.  */\n");
363       printf ("\tdefault:\n"
364 	      "\t  return true;\n"
365 	      "\t}\n\n");
366     }
367   printf ("    default:\n"
368 	  "      cpp_error (pfile, CPP_DL_ICE, \"Character %%x might not be NFKC\", c);\n"
369 	  "      return true;\n"
370 	  "  }\n"
371 	  "}\n");
372 }
373 
/* Write the copyright header of the generated file to stdout: the FSF
   notice followed by the Unicode data files license, then one extra
   newline.  */

static void
write_copyright (void)
{
  static const char notice[] =
    "/* Unicode characters and various properties.\n"
    "   Copyright (C) 2003-2020 Free Software Foundation, Inc.\n"
    "\n"
    "   This program is free software; you can redistribute it and/or modify it\n"
    "   under the terms of the GNU General Public License as published by the\n"
    "   Free Software Foundation; either version 3, or (at your option) any\n"
    "   later version.\n"
    "\n"
    "   This program is distributed in the hope that it will be useful,\n"
    "   but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
    "   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n"
    "   GNU General Public License for more details.\n"
    "\n"
    "   You should have received a copy of the GNU General Public License\n"
    "   along with this program; see the file COPYING3.  If not see\n"
    "   <http://www.gnu.org/licenses/>.\n"
    "\n"
    "\n"
    "   Copyright (C) 1991-2005 Unicode, Inc.  All rights reserved.\n"
    "   Distributed under the Terms of Use in\n"
    "   http://www.unicode.org/copyright.html.\n"
    "\n"
    "   Permission is hereby granted, free of charge, to any person\n"
    "   obtaining a copy of the Unicode data files and any associated\n"
    "   documentation (the \"Data Files\") or Unicode software and any\n"
    "   associated documentation (the \"Software\") to deal in the Data Files\n"
    "   or Software without restriction, including without limitation the\n"
    "   rights to use, copy, modify, merge, publish, distribute, and/or\n"
    "   sell copies of the Data Files or Software, and to permit persons to\n"
    "   whom the Data Files or Software are furnished to do so, provided\n"
    "   that (a) the above copyright notice(s) and this permission notice\n"
    "   appear with all copies of the Data Files or Software, (b) both the\n"
    "   above copyright notice(s) and this permission notice appear in\n"
    "   associated documentation, and (c) there is clear notice in each\n"
    "   modified Data File or in the Software as well as in the\n"
    "   documentation associated with the Data File(s) or Software that the\n"
    "   data or software has been modified.\n"
    "\n"
    "   THE DATA FILES AND SOFTWARE ARE PROVIDED \"AS IS\", WITHOUT WARRANTY\n"
    "   OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE\n"
    "   WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND\n"
    "   NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE\n"
    "   COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR\n"
    "   ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY\n"
    "   DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,\n"
    "   WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS\n"
    "   ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE\n"
    "   OF THE DATA FILES OR SOFTWARE.\n"
    "\n"
    "   Except as contained in this notice, the name of a copyright holder\n"
    "   shall not be used in advertising or otherwise to promote the sale,\n"
    "   use or other dealings in these Data Files or Software without prior\n"
    "   written authorization of the copyright holder.  */\n";

  /* Same output as puts: the text, then one more newline.  */
  fputs (notice, stdout);
  putchar ('\n');
}
436 
/* Main program.  Expects exactly three arguments: the paths of
   ucnid.tab, UnicodeData.txt and DerivedNormalizationProps.txt, in
   that order.  The generated header is written to stdout.  Returns 0
   on success; every failure exits through fail.  */

int
main (int argc, char **argv)
{
  /* Both too few and too many arguments end up here, so describe the
     expected command line instead of guessing which one it was.  */
  if (argc != 4)
    fail ("usage: makeucnid ucnid.tab UnicodeData.txt"
	  " DerivedNormalizationProps.txt > ucnid.h");
  read_ucnid (argv[1]);
  read_table (argv[2]);
  read_derived (argv[3]);

  write_copyright ();
  write_table ();
  write_context_switch ();

  /* stdout is normally redirected into ucnid.h; a failed write must
     not leave a truncated header behind a successful exit status.  */
  if (fflush (stdout) != 0 || ferror (stdout))
    fail ("writing ucnid.h");
  return 0;
}
453