1 /* Association between Unicode characters and their names.
2    Copyright (C) 2000-2002, 2005-2006 Free Software Foundation, Inc.
3 
4    This program is free software; you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation; either version 2, or (at your option)
7    any later version.
8 
9    This program is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12    GNU General Public License for more details.
13 
14    You should have received a copy of the GNU General Public License
15    along with this program; if not, write to the Free Software Foundation,
16    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
17 
18 #ifdef HAVE_CONFIG_H
19 # include <config.h>
20 #endif
21 
22 /* Specification.  */
23 #include "uniname.h"
24 
25 #include <sys/types.h>
26 #include <assert.h>
27 #include <stdbool.h>
28 #include <stdio.h>
29 #include <string.h>
30 
31 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
32 
33 
34 /* Table of Unicode character names, derived from UnicodeData.txt.  */
35 #include "uninames.h"
36 /* It contains:
37   static const char unicode_name_words[34594] = ...;
38   #define UNICODE_CHARNAME_NUM_WORDS 5906
39   static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[26] = ...;
40   #define UNICODE_CHARNAME_WORD_HANGUL 3624
41   #define UNICODE_CHARNAME_WORD_SYLLABLE 4654
42   #define UNICODE_CHARNAME_WORD_CJK 401
43   #define UNICODE_CHARNAME_WORD_COMPATIBILITY 5755
44   static const uint16_t unicode_names[62620] = ...;
45   static const struct { uint16_t code; uint16_t name; } unicode_name_to_code[15257] = ...;
46   static const struct { uint16_t code; uint16_t name; } unicode_code_to_name[15257] = ...;
47   #define UNICODE_CHARNAME_MAX_LENGTH 83
48   #define UNICODE_CHARNAME_MAX_WORDS 13
49 */
50 
51 /* Returns the word with a given index.  */
52 static const char *
unicode_name_word(unsigned int index,unsigned int * lengthp)53 unicode_name_word (unsigned int index, unsigned int *lengthp)
54 {
55   unsigned int i1;
56   unsigned int i2;
57   unsigned int i;
58 
59   assert (index < UNICODE_CHARNAME_NUM_WORDS);
60 
61   /* Binary search for i with
62        unicode_name_by_length[i].ind_offset <= index
63      and
64        index < unicode_name_by_length[i+1].ind_offset
65    */
66 
67   i1 = 0;
68   i2 = SIZEOF (unicode_name_by_length) - 1;
69   while (i2 - i1 > 1)
70     {
71       unsigned int i = (i1 + i2) >> 1;
72       if (unicode_name_by_length[i].ind_offset <= index)
73 	i1 = i;
74       else
75 	i2 = i;
76     }
77   i = i1;
78   assert (unicode_name_by_length[i].ind_offset <= index
79 	  && index < unicode_name_by_length[i+1].ind_offset);
80   *lengthp = i;
81   return &unicode_name_words[unicode_name_by_length[i].extra_offset
82 			     + (index-unicode_name_by_length[i].ind_offset)*i];
83 }
84 
85 /* Looks up the index of a word.  */
86 static int
unicode_name_word_lookup(const char * word,unsigned int length)87 unicode_name_word_lookup (const char *word, unsigned int length)
88 {
89   if (length > 0 && length < SIZEOF (unicode_name_by_length) - 1)
90     {
91       /* Binary search among the words of given length.  */
92       unsigned int extra_offset = unicode_name_by_length[length].extra_offset;
93       unsigned int i0 = unicode_name_by_length[length].ind_offset;
94       unsigned int i1 = i0;
95       unsigned int i2 = unicode_name_by_length[length+1].ind_offset;
96       while (i2 - i1 > 0)
97 	{
98 	  unsigned int i = (i1 + i2) >> 1;
99 	  const char *p = &unicode_name_words[extra_offset + (i-i0)*length];
100 	  const char *w = word;
101 	  unsigned int n = length;
102 	  for (;;)
103 	    {
104 	      if (*p < *w)
105 		{
106 		  if (i1 == i)
107 		    return -1;
108 		  /* Note here: i1 < i < i2.  */
109 		  i1 = i;
110 		  break;
111 		}
112 	      if (*p > *w)
113 		{
114 		  /* Note here: i1 <= i < i2.  */
115 		  i2 = i;
116 		  break;
117 		}
118 	      p++; w++; n--;
119 	      if (n == 0)
120 		return i;
121 	    }
122 	}
123     }
124   return -1;
125 }
126 
127 /* Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
128    sections 3.11 and 4.4.  */
129 static const char jamo_initial_short_name[19][3] =
130 {
131   "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS", "", "J", "JJ",
132   "C", "K", "T", "P", "H"
133 };
134 static const char jamo_medial_short_name[21][4] =
135 {
136   "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", "WAE", "OE", "YO",
137   "U", "WEO", "WE", "WI", "YU", "EU", "YI", "I"
138 };
139 static const char jamo_final_short_name[28][3] =
140 {
141   "", "G", "GG", "GS", "N", "NI", "NH", "D", "L", "LG", "LM", "LB", "LS", "LT",
142   "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
143 };
144 
145 /* Looks up the name of a Unicode character, in uppercase ASCII.
146    Returns the filled buf, or NULL if the character does not have a name.  */
147 char *
unicode_character_name(unsigned int c,char * buf)148 unicode_character_name (unsigned int c, char *buf)
149 {
150   if (c >= 0xAC00 && c <= 0xD7A3)
151     {
152       /* Special case for Hangul syllables. Keeps the tables small.  */
153       char *ptr;
154       unsigned int tmp;
155       unsigned int index1;
156       unsigned int index2;
157       unsigned int index3;
158       const char *q;
159 
160       /* buf needs to have at least 16 + 7 bytes here.  */
161       memcpy (buf, "HANGUL SYLLABLE ", 16);
162       ptr = buf + 16;
163 
164       tmp = c - 0xAC00;
165       index3 = tmp % 28; tmp = tmp / 28;
166       index2 = tmp % 21; tmp = tmp / 21;
167       index1 = tmp;
168 
169       q = jamo_initial_short_name[index1];
170       while (*q != '\0')
171 	*ptr++ = *q++;
172       q = jamo_medial_short_name[index2];
173       while (*q != '\0')
174 	*ptr++ = *q++;
175       q = jamo_final_short_name[index3];
176       while (*q != '\0')
177 	*ptr++ = *q++;
178       *ptr = '\0';
179       return buf;
180     }
181   else if ((c >= 0xF900 && c <= 0xFA2D) || (c >= 0xFA30 && c <= 0xFA6A)
182 	   || (c >= 0xFA70 && c <= 0xFAD9) || (c >= 0x2F800 && c <= 0x2FA1D))
183     {
184       /* Special case for CJK compatibility ideographs. Keeps the tables
185 	 small.  */
186       char *ptr;
187       int i;
188 
189       /* buf needs to have at least 28 + 5 bytes here.  */
190       memcpy (buf, "CJK COMPATIBILITY IDEOGRAPH-", 28);
191       ptr = buf + 28;
192 
193       for (i = (c < 0x10000 ? 12 : 16); i >= 0; i -= 4)
194 	{
195 	  unsigned int x = (c >> i) & 0xf;
196 	  *ptr++ = (x < 10 ? '0' : 'A' - 10) + x;
197 	}
198       *ptr = '\0';
199       return buf;
200     }
201   else
202     {
203       const uint16_t *words;
204 
205       /* Transform the code so that it fits in 16 bits.  */
206       switch (c >> 12)
207 	{
208 	case 0x00: case 0x01: case 0x02: case 0x03: case 0x04:
209 	  break;
210 	case 0x0A:
211 	  c -= 0x05000;
212 	  break;
213 	case 0x0F:
214 	  c -= 0x09000;
215 	  break;
216 	case 0x10:
217 	  c -= 0x09000;
218 	  break;
219 	case 0x1D:
220 	  c -= 0x15000;
221 	  break;
222 	case 0x2F:
223 	  c -= 0x26000;
224 	  break;
225 	case 0xE0:
226 	  c -= 0xD6000;
227 	  break;
228 	default:
229 	  return NULL;
230 	}
231 
232       {
233 	/* Binary search in unicode_code_to_name.  */
234 	unsigned int i1 = 0;
235 	unsigned int i2 = SIZEOF (unicode_code_to_name);
236 	for (;;)
237 	  {
238 	    unsigned int i = (i1 + i2) >> 1;
239 	    if (unicode_code_to_name[i].code == c)
240 	      {
241 		words = &unicode_names[unicode_code_to_name[i].name];
242 		break;
243 	      }
244 	    else if (unicode_code_to_name[i].code < c)
245 	      {
246 		if (i1 == i)
247 		  {
248 		    words = NULL;
249 		    break;
250 		  }
251 		/* Note here: i1 < i < i2.  */
252 		i1 = i;
253 	      }
254 	    else if (unicode_code_to_name[i].code > c)
255 	      {
256 		if (i2 == i)
257 		  {
258 		    words = NULL;
259 		    break;
260 		  }
261 		/* Note here: i1 <= i < i2.  */
262 		i2 = i;
263 	      }
264 	  }
265       }
266       if (words != NULL)
267 	{
268 	  /* Found it in unicode_code_to_name. Now concatenate the words.  */
269 	  /* buf needs to have at least UNICODE_CHARNAME_MAX_LENGTH bytes.  */
270 	  char *ptr = buf;
271 	  for (;;)
272 	    {
273 	      unsigned int wordlen;
274 	      const char *word = unicode_name_word (*words>>1, &wordlen);
275 	      do
276 		*ptr++ = *word++;
277 	      while (--wordlen > 0);
278 	      if ((*words & 1) == 0)
279 		break;
280 	      *ptr++ = ' ';
281 	      words++;
282 	    }
283 	  *ptr = '\0';
284 	  return buf;
285 	}
286       return NULL;
287     }
288 }
289 
290 /* Looks up the Unicode character with a given name, in upper- or lowercase
291    ASCII.  Returns the character if found, or UNINAME_INVALID if not found.  */
292 unsigned int
unicode_name_character(const char * name)293 unicode_name_character (const char *name)
294 {
295   unsigned int len = strlen (name);
296   if (len > 1 && len <= UNICODE_CHARNAME_MAX_LENGTH)
297     {
298       /* Test for "word1 word2 ..." syntax.  */
299       char buf[UNICODE_CHARNAME_MAX_LENGTH];
300       char *ptr = buf;
301       for (;;)
302 	{
303 	  char c = *name++;
304 	  if (!(c >= ' ' && c <= '~'))
305 	    break;
306 	  *ptr++ = (c >= 'a' && c <= 'z' ? c - 'a' + 'A' : c);
307 	  if (--len == 0)
308 	    goto filled_buf;
309 	}
310       if (false)
311       filled_buf:
312 	{
313 	  /* Convert the constituents to uint16_t words.  */
314 	  uint16_t words[UNICODE_CHARNAME_MAX_WORDS];
315 	  uint16_t *wordptr = words;
316 	  {
317 	    const char *p1 = buf;
318 	    for (;;)
319 	      {
320 		{
321 		  int word;
322 		  const char *p2 = p1;
323 		  while (p2 < ptr && *p2 != ' ')
324 		    p2++;
325 		  word = unicode_name_word_lookup (p1, p2 - p1);
326 		  if (word < 0)
327 		    break;
328 		  if (wordptr == &words[UNICODE_CHARNAME_MAX_WORDS])
329 		    break;
330 		  *wordptr++ = word;
331 		  if (p2 == ptr)
332 		    goto filled_words;
333 		  p1 = p2 + 1;
334 		}
335 		/* Special case for Hangul syllables. Keeps the tables small. */
336 		if (wordptr == &words[2]
337 		    && words[0] == UNICODE_CHARNAME_WORD_HANGUL
338 		    && words[1] == UNICODE_CHARNAME_WORD_SYLLABLE)
339 		  {
340 		    /* Split the last word [p1..ptr) into three parts:
341 			 1) [BCDGHJKMNPRST]
342 			 2) [AEIOUWY]
343 			 3) [BCDGHIJKLMNPST]
344 		     */
345 		    const char *p2;
346 		    const char *p3;
347 		    const char *p4;
348 
349 		    p2 = p1;
350 		    while (p2 < ptr
351 			   && (*p2 == 'B' || *p2 == 'C' || *p2 == 'D'
352 			       || *p2 == 'G' || *p2 == 'H' || *p2 == 'J'
353 			       || *p2 == 'K' || *p2 == 'M' || *p2 == 'N'
354 			       || *p2 == 'P' || *p2 == 'R' || *p2 == 'S'
355 			       || *p2 == 'T'))
356 		      p2++;
357 		    p3 = p2;
358 		    while (p3 < ptr
359 			   && (*p3 == 'A' || *p3 == 'E' || *p3 == 'I'
360 			       || *p3 == 'O' || *p3 == 'U' || *p3 == 'W'
361 			       || *p3 == 'Y'))
362 		      p3++;
363 		    p4 = p3;
364 		    while (p4 < ptr
365 			   && (*p4 == 'B' || *p4 == 'C' || *p4 == 'D'
366 			       || *p4 == 'G' || *p4 == 'H' || *p4 == 'I'
367 			       || *p4 == 'J' || *p4 == 'K' || *p4 == 'L'
368 			       || *p4 == 'M' || *p4 == 'N' || *p4 == 'P'
369 			       || *p4 == 'S' || *p4 == 'T'))
370 		      p4++;
371 		    if (p4 == ptr)
372 		      {
373 			unsigned int n1 = p2 - p1;
374 			unsigned int n2 = p3 - p2;
375 			unsigned int n3 = p4 - p3;
376 
377 			if (n1 <= 2 && (n2 >= 1 && n2 <= 3) && n3 <= 2)
378 			  {
379 			    unsigned int index1;
380 
381 			    for (index1 = 0; index1 < 19; index1++)
382 			      if (memcmp(jamo_initial_short_name[index1], p1, n1) == 0
383 				  && jamo_initial_short_name[index1][n1] == '\0')
384 				{
385 				  unsigned int index2;
386 
387 				  for (index2 = 0; index2 < 21; index2++)
388 				    if (memcmp(jamo_medial_short_name[index2], p2, n2) == 0
389 					&& jamo_medial_short_name[index2][n2] == '\0')
390 				      {
391 					unsigned int index3;
392 
393 					for (index3 = 0; index3 < 28; index3++)
394 					  if (memcmp(jamo_final_short_name[index3], p3, n3) == 0
395 					      && jamo_final_short_name[index3][n3] == '\0')
396 					    {
397 					      return 0xAC00 + (index1 * 21 + index2) * 28 + index3;
398 					    }
399 					break;
400 				      }
401 				  break;
402 				}
403 			  }
404 		      }
405 		  }
406 		/* Special case for CJK compatibility ideographs. Keeps the
407 		   tables small.  */
408 		if (wordptr == &words[2]
409 		    && words[0] == UNICODE_CHARNAME_WORD_CJK
410 		    && words[1] == UNICODE_CHARNAME_WORD_COMPATIBILITY
411 		    && p1 + 14 <= ptr
412 		    && p1 + 15 >= ptr
413 		    && memcmp (p1, "IDEOGRAPH-", 10) == 0)
414 		  {
415 		    const char *p2 = p1 + 10;
416 
417 		    if (*p2 != '0')
418 		      {
419 			unsigned int c = 0;
420 
421 			for (;;)
422 			  {
423 			    if (*p2 >= '0' && *p2 <= '9')
424 			      c += (*p2 - '0');
425 			    else if (*p2 >= 'A' && *p2 <= 'F')
426 			      c += (*p2 - 'A' + 10);
427 			    else
428 			      break;
429 			    p2++;
430 			    if (p2 == ptr)
431 			      {
432 				if ((c >= 0xF900 && c <= 0xFA2D)
433 				    || (c >= 0xFA30 && c <= 0xFA6A)
434 				    || (c >= 0xFA70 && c <= 0xFAD9)
435 				    || (c >= 0x2F800 && c <= 0x2FA1D))
436 				  return c;
437 				else
438 				  break;
439 			      }
440 			    c = c << 4;
441 			  }
442 		      }
443 		  }
444 	      }
445 	  }
446 	  if (false)
447 	  filled_words:
448 	    {
449 	      /* Multiply by 2, to simplify later comparisons.  */
450 	      unsigned int words_length = wordptr - words;
451 	      {
452 		int i = words_length - 1;
453 		words[i] = 2 * words[i];
454 		for (; --i >= 0; )
455 		  words[i] = 2 * words[i] + 1;
456 	      }
457 	      /* Binary search in unicode_name_to_code.  */
458 	      {
459 		unsigned int i1 = 0;
460 		unsigned int i2 = SIZEOF (unicode_name_to_code);
461 		for (;;)
462 		  {
463 		    unsigned int i = (i1 + i2) >> 1;
464 		    const uint16_t *w = words;
465 		    const uint16_t *p = &unicode_names[unicode_name_to_code[i].name];
466 		    unsigned int n = words_length;
467 		    for (;;)
468 		      {
469 			if (*p < *w)
470 			  {
471 			    if (i1 == i)
472 			      goto name_not_found;
473 			    /* Note here: i1 < i < i2.  */
474 			    i1 = i;
475 			    break;
476 			  }
477 			else if (*p > *w)
478 			  {
479 			    if (i2 == i)
480 			      goto name_not_found;
481 			    /* Note here: i1 <= i < i2.  */
482 			    i2 = i;
483 			    break;
484 			  }
485 			p++; w++; n--;
486 			if (n == 0)
487 			  {
488 			    unsigned int c = unicode_name_to_code[i].code;
489 
490 			    /* Undo the transformation to 16-bit space.  */
491 			    static const unsigned int offset[11] =
492 			      {
493 				0x00000, 0x00000, 0x00000, 0x00000, 0x00000,
494 				0x05000, 0x09000, 0x09000, 0x15000, 0x26000,
495 				0xD6000
496 			      };
497 			    return c + offset[c >> 12];
498 			  }
499 		      }
500 		  }
501 	      }
502 	    name_not_found: ;
503 	    }
504 	}
505     }
506   return UNINAME_INVALID;
507 }
508