1 /* Test the Unicode character name functions.
2    Copyright (C) 2000-2003, 2005, 2007, 2009-2018 Free Software Foundation,
3    Inc.
4 
5    This program is free software: you can redistribute it and/or modify
6    it under the terms of the GNU General Public License as published by
7    the Free Software Foundation; either version 3 of the License, or
8    (at your option) any later version.
9 
10    This program is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13    GNU General Public License for more details.
14 
15    You should have received a copy of the GNU General Public License
16    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17 
18 #include <config.h>
19 
20 #include <stdio.h>
21 #include <stdlib.h>
22 #include <string.h>
23 
24 #include "xalloc.h"
25 #include "uniname.h"
26 
/* Expected character names, indexed by code point.  The contents follow the
   UnicodeData.txt file, modified to contain the Hangul syllable names, as
   described in the Unicode 3.0 book.  Slots for which no name was loaded
   remain NULL.  */
static const char *unicode_names[0x110000];

/* Capacity of the unicode_aliases[] table.  */
#define ALIASLEN 0x200

/* One alias according to the NameAliases.txt file: the alias text and the
   code point it stands for.  */
struct unicode_alias
{
  const char *name;
  unsigned int uc;
};

/* Number of entries of unicode_aliases[] that have been filled in.  */
static int aliases_count;
static struct unicode_alias unicode_aliases[ALIASLEN];
43 
44 /* Stores in unicode_names[] the relevant contents of the UnicodeData.txt
45    file.  */
46 static void
fill_names(const char * unicodedata_filename)47 fill_names (const char *unicodedata_filename)
48 {
49   FILE *stream;
50   char *field0;
51   char *field1;
52   char line[1024];
53   int lineno = 0;
54 
55   stream = fopen (unicodedata_filename, "r");
56   if (stream == NULL)
57     {
58       fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
59       exit (EXIT_FAILURE);
60     }
61 
62   while (fgets (line, sizeof line, stream))
63     {
64       char *p;
65       char *comment;
66       unsigned long i;
67 
68       lineno++;
69 
70       comment = strchr (line, '#');
71       if (comment != NULL)
72         *comment = '\0';
73       if (line[strspn (line, " \t\r\n")] == '\0')
74         continue;
75 
76       field0 = p = line;
77       p = strchr (p, ';');
78       if (!p)
79         {
80           fprintf (stderr, "short line in '%s':%d\n",
81                    unicodedata_filename, lineno);
82           exit (EXIT_FAILURE);
83         }
84       *p++ = '\0';
85 
86       field1 = p;
87       if (*field1 == '<')
88         continue;
89       p = strchr (p, ';');
90       if (!p)
91         {
92           fprintf (stderr, "short line in '%s':%d\n",
93                    unicodedata_filename, lineno);
94           exit (EXIT_FAILURE);
95         }
96       *p = '\0';
97       i = strtoul (field0, NULL, 16);
98       if (i >= 0x110000)
99         {
100           fprintf (stderr, "index too large\n");
101           exit (EXIT_FAILURE);
102         }
103       unicode_names[i] = xstrdup (field1);
104     }
105   if (ferror (stream) || fclose (stream))
106     {
107       fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
108       exit (1);
109     }
110 }
111 
112 /* Stores in unicode_aliases[] the relevant contents of the NameAliases.txt
113    file.  */
114 static void
fill_aliases(const char * namealiases_filename)115 fill_aliases (const char *namealiases_filename)
116 {
117   FILE *stream;
118   char *field0;
119   char *field1;
120   char line[1024];
121   int lineno = 0;
122 
123   stream = fopen (namealiases_filename, "r");
124   if (stream == NULL)
125     {
126       fprintf (stderr, "error during fopen of '%s'\n", namealiases_filename);
127       exit (EXIT_FAILURE);
128     }
129 
130   while (fgets (line, sizeof line, stream))
131     {
132       char *p;
133       char *comment;
134       unsigned long uc;
135 
136       comment = strchr (line, '#');
137       if (comment != NULL)
138         *comment = '\0';
139       if (line[strspn (line, " \t\r\n")] == '\0')
140         continue;
141 
142       lineno++;
143 
144       field0 = p = line;
145       p = strchr (p, ';');
146       if (!p)
147         {
148           fprintf (stderr, "short line in '%s':%d\n",
149                    namealiases_filename, lineno);
150           exit (EXIT_FAILURE);
151         }
152       *p++ = '\0';
153 
154       field1 = p;
155       p = strchr (p, ';');
156       if (!p)
157         {
158           fprintf (stderr, "short line in '%s':%d\n",
159                    namealiases_filename, lineno);
160           exit (EXIT_FAILURE);
161         }
162       *p = '\0';
163 
164       uc = strtoul (field0, NULL, 16);
165       if (uc >= 0x110000)
166         {
167           fprintf (stderr, "index too large\n");
168           exit (EXIT_FAILURE);
169         }
170 
171       if (aliases_count == ALIASLEN)
172         {
173           fprintf (stderr, "too many aliases\n");
174           exit (EXIT_FAILURE);
175         }
176       unicode_aliases[aliases_count].name = xstrdup (field1);
177       unicode_aliases[aliases_count].uc = uc;
178       aliases_count++;
179     }
180   if (ferror (stream) || fclose (stream))
181     {
182       fprintf (stderr, "error reading from '%s'\n", namealiases_filename);
183       exit (1);
184     }
185 }
186 
187 static int
name_has_alias(unsigned int uc)188 name_has_alias (unsigned int uc)
189 {
190   int i;
191   for (i = 0; i < ALIASLEN; i++)
192     if (unicode_aliases[i].uc == uc)
193       return 1;
194   return 0;
195 }
196 
197 /* Perform an exhaustive test of the unicode_character_name function.  */
198 static int
test_name_lookup()199 test_name_lookup ()
200 {
201   int error = 0;
202   unsigned int i;
203   char buf[UNINAME_MAX];
204 
205   for (i = 0; i < 0x11000; i++)
206     {
207       char *result = unicode_character_name (i, buf);
208 
209       if (unicode_names[i] != NULL)
210         {
211           if (result == NULL)
212             {
213               fprintf (stderr, "\\u%04X name lookup failed!\n", i);
214               error = 1;
215             }
216           else if (strcmp (result, unicode_names[i]) != 0)
217             {
218               fprintf (stderr, "\\u%04X name lookup returned wrong name: %s\n",
219                                i, result);
220               error = 1;
221             }
222         }
223       else
224         {
225           if (result != NULL)
226             {
227               fprintf (stderr, "\\u%04X name lookup returned wrong name: %s\n",
228                                i, result);
229               error = 1;
230             }
231         }
232     }
233 
234   for (i = 0x110000; i < 0x1000000; i++)
235     {
236       char *result = unicode_character_name (i, buf);
237 
238       if (result != NULL)
239         {
240           fprintf (stderr, "\\u%04X name lookup returned wrong name: %s\n",
241                            i, result);
242           error = 1;
243         }
244     }
245 
246   return error;
247 }
248 
249 /* Perform a test of the unicode_name_character function.  */
250 static int
test_inverse_lookup()251 test_inverse_lookup ()
252 {
253   int error = 0;
254   unsigned int i;
255 
256   /* First, verify all valid character names are recognized.  */
257   for (i = 0; i < 0x110000; i++)
258     if (unicode_names[i] != NULL)
259       {
260         unsigned int result = unicode_name_character (unicode_names[i]);
261         if (result != i)
262           {
263             if (result == UNINAME_INVALID)
264               fprintf (stderr, "inverse name lookup of \"%s\" failed\n",
265                        unicode_names[i]);
266             else
267               fprintf (stderr,
268                        "inverse name lookup of \"%s\" returned 0x%04X\n",
269                        unicode_names[i], result);
270             error = 1;
271           }
272       }
273 
274   /* Second, generate random but likely names and verify they are not
275      recognized unless really valid.  */
276   for (i = 0; i < 10000; i++)
277     {
278       unsigned int i1, i2;
279       const char *s1;
280       const char *s2;
281       unsigned int l1, l2, j1, j2;
282       char buf[2*UNINAME_MAX];
283       unsigned int result;
284 
285       do i1 = ((rand () % 0x11) << 16)
286               + ((rand () & 0xff) << 8)
287               + (rand () & 0xff);
288       while (unicode_names[i1] == NULL);
289 
290       do i2 = ((rand () % 0x11) << 16)
291               + ((rand () & 0xff) << 8)
292               + (rand () & 0xff);
293       while (unicode_names[i2] == NULL);
294 
295       s1 = unicode_names[i1];
296       l1 = strlen (s1);
297       s2 = unicode_names[i2];
298       l2 = strlen (s2);
299 
300       /* Concatenate a starting piece of s1 with an ending piece of s2.  */
301       for (j1 = 1; j1 <= l1; j1++)
302         if (j1 == l1 || s1[j1] == ' ')
303           for (j2 = 0; j2 < l2; j2++)
304             if (j2 == 0 || s2[j2-1] == ' ')
305               {
306                 memcpy (buf, s1, j1);
307                 buf[j1] = ' ';
308                 memcpy (buf + j1 + 1, s2 + j2, l2 - j2 + 1);
309 
310                 result = unicode_name_character (buf);
311                 if (result != UNINAME_INVALID
312                     && !name_has_alias (result)
313                     && !(unicode_names[result] != NULL
314                          && strcmp (unicode_names[result], buf) == 0))
315                   {
316                     fprintf (stderr,
317                              "inverse name lookup of \"%s\" returned 0x%04X\n",
318                              unicode_names[i], result);
319                     error = 1;
320                   }
321               }
322     }
323 
324   /* Third, some extreme case that used to loop.  */
325   if (unicode_name_character ("A A") != UNINAME_INVALID)
326     error = 1;
327 
328   return error;
329 }
330 
331 /* Perform a test of the unicode_name_character function for aliases.  */
332 static int
test_alias_lookup()333 test_alias_lookup ()
334 {
335   int error = 0;
336   unsigned int i;
337   char buf[UNINAME_MAX];
338 
339   /* Verify all valid character names are recognized.  */
340   for (i = 0; i < ALIASLEN; i++)
341     if (unicode_aliases[i].uc != UNINAME_INVALID
342         /* Skip if the character has no canonical name (e.g. control
343            characters).  */
344         && unicode_character_name (unicode_aliases[i].uc, buf))
345       {
346         unsigned int result = unicode_name_character (unicode_aliases[i].name);
347         if (result != unicode_aliases[i].uc)
348           {
349             if (result == UNINAME_INVALID)
350               fprintf (stderr, "inverse name lookup of \"%s\" failed\n",
351                        unicode_aliases[i].name);
352             else
353               fprintf (stderr,
354                        "inverse name lookup of \"%s\" returned 0x%04X\n",
355                        unicode_aliases[i].name, result);
356             error = 1;
357           }
358       }
359 
360   return error;
361 }
362 
363 int
main(int argc,char * argv[])364 main (int argc, char *argv[])
365 {
366   int error = 0;
367   int i;
368 
369   for (i = 1; i < argc && strcmp (argv[i], "--") != 0; i++)
370     fill_names (argv[i]);
371 
372   if (i < argc)
373     {
374       int j;
375       for (j = 0; j < ALIASLEN; j++)
376         unicode_aliases[j].uc = UNINAME_INVALID;
377 
378       i++;
379       for (; i < argc; i++)
380         fill_aliases (argv[i]);
381     }
382 
383   error |= test_name_lookup ();
384   error |= test_inverse_lookup ();
385 
386   if (aliases_count > 0)
387     error |= test_alias_lookup ();
388 
389   return error;
390 }
391