1 /* Association between Unicode characters and their names.
2 Copyright (C) 2000-2002, 2005-2006 Free Software Foundation, Inc.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
7 any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software Foundation,
16 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
17
18 #ifdef HAVE_CONFIG_H
19 # include <config.h>
20 #endif
21
22 /* Specification. */
23 #include "uniname.h"
24
25 #include <sys/types.h>
26 #include <assert.h>
27 #include <stdbool.h>
28 #include <stdio.h>
29 #include <string.h>
30
31 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
32
33
34 /* Table of Unicode character names, derived from UnicodeData.txt. */
35 #include "uninames.h"
36 /* It contains:
37 static const char unicode_name_words[34594] = ...;
38 #define UNICODE_CHARNAME_NUM_WORDS 5906
39 static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[26] = ...;
40 #define UNICODE_CHARNAME_WORD_HANGUL 3624
41 #define UNICODE_CHARNAME_WORD_SYLLABLE 4654
42 #define UNICODE_CHARNAME_WORD_CJK 401
43 #define UNICODE_CHARNAME_WORD_COMPATIBILITY 5755
44 static const uint16_t unicode_names[62620] = ...;
45 static const struct { uint16_t code; uint16_t name; } unicode_name_to_code[15257] = ...;
46 static const struct { uint16_t code; uint16_t name; } unicode_code_to_name[15257] = ...;
47 #define UNICODE_CHARNAME_MAX_LENGTH 83
48 #define UNICODE_CHARNAME_MAX_WORDS 13
49 */
50
51 /* Returns the word with a given index. */
52 static const char *
unicode_name_word(unsigned int index,unsigned int * lengthp)53 unicode_name_word (unsigned int index, unsigned int *lengthp)
54 {
55 unsigned int i1;
56 unsigned int i2;
57 unsigned int i;
58
59 assert (index < UNICODE_CHARNAME_NUM_WORDS);
60
61 /* Binary search for i with
62 unicode_name_by_length[i].ind_offset <= index
63 and
64 index < unicode_name_by_length[i+1].ind_offset
65 */
66
67 i1 = 0;
68 i2 = SIZEOF (unicode_name_by_length) - 1;
69 while (i2 - i1 > 1)
70 {
71 unsigned int i = (i1 + i2) >> 1;
72 if (unicode_name_by_length[i].ind_offset <= index)
73 i1 = i;
74 else
75 i2 = i;
76 }
77 i = i1;
78 assert (unicode_name_by_length[i].ind_offset <= index
79 && index < unicode_name_by_length[i+1].ind_offset);
80 *lengthp = i;
81 return &unicode_name_words[unicode_name_by_length[i].extra_offset
82 + (index-unicode_name_by_length[i].ind_offset)*i];
83 }
84
85 /* Looks up the index of a word. */
86 static int
unicode_name_word_lookup(const char * word,unsigned int length)87 unicode_name_word_lookup (const char *word, unsigned int length)
88 {
89 if (length > 0 && length < SIZEOF (unicode_name_by_length) - 1)
90 {
91 /* Binary search among the words of given length. */
92 unsigned int extra_offset = unicode_name_by_length[length].extra_offset;
93 unsigned int i0 = unicode_name_by_length[length].ind_offset;
94 unsigned int i1 = i0;
95 unsigned int i2 = unicode_name_by_length[length+1].ind_offset;
96 while (i2 - i1 > 0)
97 {
98 unsigned int i = (i1 + i2) >> 1;
99 const char *p = &unicode_name_words[extra_offset + (i-i0)*length];
100 const char *w = word;
101 unsigned int n = length;
102 for (;;)
103 {
104 if (*p < *w)
105 {
106 if (i1 == i)
107 return -1;
108 /* Note here: i1 < i < i2. */
109 i1 = i;
110 break;
111 }
112 if (*p > *w)
113 {
114 /* Note here: i1 <= i < i2. */
115 i2 = i;
116 break;
117 }
118 p++; w++; n--;
119 if (n == 0)
120 return i;
121 }
122 }
123 }
124 return -1;
125 }
126
127 /* Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
128 sections 3.11 and 4.4. */
129 static const char jamo_initial_short_name[19][3] =
130 {
131 "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS", "", "J", "JJ",
132 "C", "K", "T", "P", "H"
133 };
134 static const char jamo_medial_short_name[21][4] =
135 {
136 "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", "WAE", "OE", "YO",
137 "U", "WEO", "WE", "WI", "YU", "EU", "YI", "I"
138 };
139 static const char jamo_final_short_name[28][3] =
140 {
141 "", "G", "GG", "GS", "N", "NI", "NH", "D", "L", "LG", "LM", "LB", "LS", "LT",
142 "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
143 };
144
145 /* Looks up the name of a Unicode character, in uppercase ASCII.
146 Returns the filled buf, or NULL if the character does not have a name. */
147 char *
unicode_character_name(unsigned int c,char * buf)148 unicode_character_name (unsigned int c, char *buf)
149 {
150 if (c >= 0xAC00 && c <= 0xD7A3)
151 {
152 /* Special case for Hangul syllables. Keeps the tables small. */
153 char *ptr;
154 unsigned int tmp;
155 unsigned int index1;
156 unsigned int index2;
157 unsigned int index3;
158 const char *q;
159
160 /* buf needs to have at least 16 + 7 bytes here. */
161 memcpy (buf, "HANGUL SYLLABLE ", 16);
162 ptr = buf + 16;
163
164 tmp = c - 0xAC00;
165 index3 = tmp % 28; tmp = tmp / 28;
166 index2 = tmp % 21; tmp = tmp / 21;
167 index1 = tmp;
168
169 q = jamo_initial_short_name[index1];
170 while (*q != '\0')
171 *ptr++ = *q++;
172 q = jamo_medial_short_name[index2];
173 while (*q != '\0')
174 *ptr++ = *q++;
175 q = jamo_final_short_name[index3];
176 while (*q != '\0')
177 *ptr++ = *q++;
178 *ptr = '\0';
179 return buf;
180 }
181 else if ((c >= 0xF900 && c <= 0xFA2D) || (c >= 0xFA30 && c <= 0xFA6A)
182 || (c >= 0xFA70 && c <= 0xFAD9) || (c >= 0x2F800 && c <= 0x2FA1D))
183 {
184 /* Special case for CJK compatibility ideographs. Keeps the tables
185 small. */
186 char *ptr;
187 int i;
188
189 /* buf needs to have at least 28 + 5 bytes here. */
190 memcpy (buf, "CJK COMPATIBILITY IDEOGRAPH-", 28);
191 ptr = buf + 28;
192
193 for (i = (c < 0x10000 ? 12 : 16); i >= 0; i -= 4)
194 {
195 unsigned int x = (c >> i) & 0xf;
196 *ptr++ = (x < 10 ? '0' : 'A' - 10) + x;
197 }
198 *ptr = '\0';
199 return buf;
200 }
201 else
202 {
203 const uint16_t *words;
204
205 /* Transform the code so that it fits in 16 bits. */
206 switch (c >> 12)
207 {
208 case 0x00: case 0x01: case 0x02: case 0x03: case 0x04:
209 break;
210 case 0x0A:
211 c -= 0x05000;
212 break;
213 case 0x0F:
214 c -= 0x09000;
215 break;
216 case 0x10:
217 c -= 0x09000;
218 break;
219 case 0x1D:
220 c -= 0x15000;
221 break;
222 case 0x2F:
223 c -= 0x26000;
224 break;
225 case 0xE0:
226 c -= 0xD6000;
227 break;
228 default:
229 return NULL;
230 }
231
232 {
233 /* Binary search in unicode_code_to_name. */
234 unsigned int i1 = 0;
235 unsigned int i2 = SIZEOF (unicode_code_to_name);
236 for (;;)
237 {
238 unsigned int i = (i1 + i2) >> 1;
239 if (unicode_code_to_name[i].code == c)
240 {
241 words = &unicode_names[unicode_code_to_name[i].name];
242 break;
243 }
244 else if (unicode_code_to_name[i].code < c)
245 {
246 if (i1 == i)
247 {
248 words = NULL;
249 break;
250 }
251 /* Note here: i1 < i < i2. */
252 i1 = i;
253 }
254 else if (unicode_code_to_name[i].code > c)
255 {
256 if (i2 == i)
257 {
258 words = NULL;
259 break;
260 }
261 /* Note here: i1 <= i < i2. */
262 i2 = i;
263 }
264 }
265 }
266 if (words != NULL)
267 {
268 /* Found it in unicode_code_to_name. Now concatenate the words. */
269 /* buf needs to have at least UNICODE_CHARNAME_MAX_LENGTH bytes. */
270 char *ptr = buf;
271 for (;;)
272 {
273 unsigned int wordlen;
274 const char *word = unicode_name_word (*words>>1, &wordlen);
275 do
276 *ptr++ = *word++;
277 while (--wordlen > 0);
278 if ((*words & 1) == 0)
279 break;
280 *ptr++ = ' ';
281 words++;
282 }
283 *ptr = '\0';
284 return buf;
285 }
286 return NULL;
287 }
288 }
289
290 /* Looks up the Unicode character with a given name, in upper- or lowercase
291 ASCII. Returns the character if found, or UNINAME_INVALID if not found. */
292 unsigned int
unicode_name_character(const char * name)293 unicode_name_character (const char *name)
294 {
295 unsigned int len = strlen (name);
296 if (len > 1 && len <= UNICODE_CHARNAME_MAX_LENGTH)
297 {
298 /* Test for "word1 word2 ..." syntax. */
299 char buf[UNICODE_CHARNAME_MAX_LENGTH];
300 char *ptr = buf;
301 for (;;)
302 {
303 char c = *name++;
304 if (!(c >= ' ' && c <= '~'))
305 break;
306 *ptr++ = (c >= 'a' && c <= 'z' ? c - 'a' + 'A' : c);
307 if (--len == 0)
308 goto filled_buf;
309 }
310 if (false)
311 filled_buf:
312 {
313 /* Convert the constituents to uint16_t words. */
314 uint16_t words[UNICODE_CHARNAME_MAX_WORDS];
315 uint16_t *wordptr = words;
316 {
317 const char *p1 = buf;
318 for (;;)
319 {
320 {
321 int word;
322 const char *p2 = p1;
323 while (p2 < ptr && *p2 != ' ')
324 p2++;
325 word = unicode_name_word_lookup (p1, p2 - p1);
326 if (word < 0)
327 break;
328 if (wordptr == &words[UNICODE_CHARNAME_MAX_WORDS])
329 break;
330 *wordptr++ = word;
331 if (p2 == ptr)
332 goto filled_words;
333 p1 = p2 + 1;
334 }
335 /* Special case for Hangul syllables. Keeps the tables small. */
336 if (wordptr == &words[2]
337 && words[0] == UNICODE_CHARNAME_WORD_HANGUL
338 && words[1] == UNICODE_CHARNAME_WORD_SYLLABLE)
339 {
340 /* Split the last word [p1..ptr) into three parts:
341 1) [BCDGHJKMNPRST]
342 2) [AEIOUWY]
343 3) [BCDGHIJKLMNPST]
344 */
345 const char *p2;
346 const char *p3;
347 const char *p4;
348
349 p2 = p1;
350 while (p2 < ptr
351 && (*p2 == 'B' || *p2 == 'C' || *p2 == 'D'
352 || *p2 == 'G' || *p2 == 'H' || *p2 == 'J'
353 || *p2 == 'K' || *p2 == 'M' || *p2 == 'N'
354 || *p2 == 'P' || *p2 == 'R' || *p2 == 'S'
355 || *p2 == 'T'))
356 p2++;
357 p3 = p2;
358 while (p3 < ptr
359 && (*p3 == 'A' || *p3 == 'E' || *p3 == 'I'
360 || *p3 == 'O' || *p3 == 'U' || *p3 == 'W'
361 || *p3 == 'Y'))
362 p3++;
363 p4 = p3;
364 while (p4 < ptr
365 && (*p4 == 'B' || *p4 == 'C' || *p4 == 'D'
366 || *p4 == 'G' || *p4 == 'H' || *p4 == 'I'
367 || *p4 == 'J' || *p4 == 'K' || *p4 == 'L'
368 || *p4 == 'M' || *p4 == 'N' || *p4 == 'P'
369 || *p4 == 'S' || *p4 == 'T'))
370 p4++;
371 if (p4 == ptr)
372 {
373 unsigned int n1 = p2 - p1;
374 unsigned int n2 = p3 - p2;
375 unsigned int n3 = p4 - p3;
376
377 if (n1 <= 2 && (n2 >= 1 && n2 <= 3) && n3 <= 2)
378 {
379 unsigned int index1;
380
381 for (index1 = 0; index1 < 19; index1++)
382 if (memcmp(jamo_initial_short_name[index1], p1, n1) == 0
383 && jamo_initial_short_name[index1][n1] == '\0')
384 {
385 unsigned int index2;
386
387 for (index2 = 0; index2 < 21; index2++)
388 if (memcmp(jamo_medial_short_name[index2], p2, n2) == 0
389 && jamo_medial_short_name[index2][n2] == '\0')
390 {
391 unsigned int index3;
392
393 for (index3 = 0; index3 < 28; index3++)
394 if (memcmp(jamo_final_short_name[index3], p3, n3) == 0
395 && jamo_final_short_name[index3][n3] == '\0')
396 {
397 return 0xAC00 + (index1 * 21 + index2) * 28 + index3;
398 }
399 break;
400 }
401 break;
402 }
403 }
404 }
405 }
406 /* Special case for CJK compatibility ideographs. Keeps the
407 tables small. */
408 if (wordptr == &words[2]
409 && words[0] == UNICODE_CHARNAME_WORD_CJK
410 && words[1] == UNICODE_CHARNAME_WORD_COMPATIBILITY
411 && p1 + 14 <= ptr
412 && p1 + 15 >= ptr
413 && memcmp (p1, "IDEOGRAPH-", 10) == 0)
414 {
415 const char *p2 = p1 + 10;
416
417 if (*p2 != '0')
418 {
419 unsigned int c = 0;
420
421 for (;;)
422 {
423 if (*p2 >= '0' && *p2 <= '9')
424 c += (*p2 - '0');
425 else if (*p2 >= 'A' && *p2 <= 'F')
426 c += (*p2 - 'A' + 10);
427 else
428 break;
429 p2++;
430 if (p2 == ptr)
431 {
432 if ((c >= 0xF900 && c <= 0xFA2D)
433 || (c >= 0xFA30 && c <= 0xFA6A)
434 || (c >= 0xFA70 && c <= 0xFAD9)
435 || (c >= 0x2F800 && c <= 0x2FA1D))
436 return c;
437 else
438 break;
439 }
440 c = c << 4;
441 }
442 }
443 }
444 }
445 }
446 if (false)
447 filled_words:
448 {
449 /* Multiply by 2, to simplify later comparisons. */
450 unsigned int words_length = wordptr - words;
451 {
452 int i = words_length - 1;
453 words[i] = 2 * words[i];
454 for (; --i >= 0; )
455 words[i] = 2 * words[i] + 1;
456 }
457 /* Binary search in unicode_name_to_code. */
458 {
459 unsigned int i1 = 0;
460 unsigned int i2 = SIZEOF (unicode_name_to_code);
461 for (;;)
462 {
463 unsigned int i = (i1 + i2) >> 1;
464 const uint16_t *w = words;
465 const uint16_t *p = &unicode_names[unicode_name_to_code[i].name];
466 unsigned int n = words_length;
467 for (;;)
468 {
469 if (*p < *w)
470 {
471 if (i1 == i)
472 goto name_not_found;
473 /* Note here: i1 < i < i2. */
474 i1 = i;
475 break;
476 }
477 else if (*p > *w)
478 {
479 if (i2 == i)
480 goto name_not_found;
481 /* Note here: i1 <= i < i2. */
482 i2 = i;
483 break;
484 }
485 p++; w++; n--;
486 if (n == 0)
487 {
488 unsigned int c = unicode_name_to_code[i].code;
489
490 /* Undo the transformation to 16-bit space. */
491 static const unsigned int offset[11] =
492 {
493 0x00000, 0x00000, 0x00000, 0x00000, 0x00000,
494 0x05000, 0x09000, 0x09000, 0x15000, 0x26000,
495 0xD6000
496 };
497 return c + offset[c >> 12];
498 }
499 }
500 }
501 }
502 name_not_found: ;
503 }
504 }
505 }
506 return UNINAME_INVALID;
507 }
508