1 /* -*- c-basic-offset: 2 -*- */
2 /* Copyright(C) 2010 Brazil
3 
4   This library is free software; you can redistribute it and/or
5   modify it under the terms of the GNU Lesser General Public
6   License version 2.1 as published by the Free Software Foundation.
7 
8   This library is distributed in the hope that it will be useful,
9   but WITHOUT ANY WARRANTY; without even the implied warranty of
10   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11   Lesser General Public License for more details.
12 
13   You should have received a copy of the GNU Lesser General Public
14   License along with this library; if not, write to the Free Software
15   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1335  USA
16 */
17 #include <stdio.h>
18 #include <getopt.h>
19 #include <unistd.h>
20 #include <string.h>
21 #include <unicode/utf.h>
22 #include <unicode/uchar.h>
23 #include <unicode/unorm.h>
24 #include <unicode/ustring.h>
25 
26 #define MAX_UNICODE 0x110000
27 #define BUF_SIZE 0x100
28 
/*
 * Encode the code point `i` as a NUL-terminated UTF-8 style byte sequence
 * in `buf`.
 *
 * The historical 1..6 byte scheme is used, so every value below 0x80000000
 * can be represented.  `buf` must have room for 7 bytes (6 encoded bytes
 * plus the terminating NUL).
 *
 * Returns the number of encoded bytes, not counting the terminator.
 * i == 0 is encoded as a single zero byte (return value 1).  Values of
 * 0x80000000 and above cannot be encoded: `buf` is set to the empty string
 * and 0 is returned instead of emitting continuation bytes without a lead
 * byte.
 */
static int
ucs2utf(unsigned int i, unsigned char *buf)
{
  unsigned char *p = buf;
  int trail; /* number of continuation bytes that follow the lead byte */

  if (i < 0x80) {
    *p++ = (unsigned char)i;
    trail = 0;
  } else if (i < 0x800) {
    *p++ = (unsigned char)((i >> 6) | 0xc0);
    trail = 1;
  } else if (i < 0x00010000) {
    *p++ = (unsigned char)((i >> 12) | 0xe0);
    trail = 2;
  } else if (i < 0x00200000) {
    *p++ = (unsigned char)((i >> 18) | 0xf0);
    trail = 3;
  } else if (i < 0x04000000) {
    *p++ = (unsigned char)((i >> 24) | 0xf8);
    trail = 4;
  } else if (i < 0x80000000u) {
    *p++ = (unsigned char)((i >> 30) | 0xfc);
    trail = 5;
  } else {
    /* not representable: produce an empty string rather than garbage */
    *buf = '\0';
    return 0;
  }

  /* each continuation byte carries 6 payload bits, most significant first */
  while (trail-- > 0) {
    *p++ = (unsigned char)(((i >> (6 * trail)) & 0x3f) | 0x80);
  }
  *p = '\0';
  return (int)(p - buf);
}
62 
63 void
blockcode(void)64 blockcode(void)
65 {
66   UChar32 ch;
67   unsigned char *p, src[7];
68   UBlockCode code, lc = -1;
69   for (ch = 1; ch < MAX_UNICODE; ch++) {
70     if (!U_IS_UNICODE_CHAR(ch)) { continue; }
71     code = ublock_getCode(ch);
72     if (code != lc) {
73       ucs2utf(ch, src);
74       for (p = src; *p; p++) {
75         printf("%x:", *p);
76       }
77       printf("\t%04x\t%d\n", ch, code);
78     }
79     lc = code;
80   }
81 }
82 
83 int
normalize(const char * str,char * res,UNormalizationMode mode)84 normalize(const char *str, char *res, UNormalizationMode mode)
85 {
86   UErrorCode rc;
87   int32_t ulen, nlen;
88   UChar ubuf[BUF_SIZE], nbuf[BUF_SIZE];
89   rc = U_ZERO_ERROR;
90   u_strFromUTF8(ubuf, BUF_SIZE, &ulen, str, -1, &rc);
91   if (rc != U_ZERO_ERROR /*&& rc != U_STRING_NOT_TERMINATED_WARNING*/) {
92     return -1;
93   }
94   rc = U_ZERO_ERROR;
95   nlen = unorm_normalize(ubuf, ulen, mode, 0, nbuf, BUF_SIZE, &rc);
96   if (rc != U_ZERO_ERROR /*&& rc != U_STRING_NOT_TERMINATED_WARNING*/) {
97     return -1;
98   }
99   rc = U_ZERO_ERROR;
100   u_strToUTF8(res, BUF_SIZE, NULL, nbuf, nlen, &rc);
101   if (rc != U_ZERO_ERROR /*&& rc != U_BUFFER_OVERFLOW_ERROR*/) {
102     return -1;
103   }
104   return 0;
105 }
106 
107 void
dump(UNormalizationMode mode)108 dump(UNormalizationMode mode)
109 {
110   UChar32 ch;
111   char str[7], norm[BUF_SIZE];
112   for (ch = 1; ch < MAX_UNICODE; ch++) {
113     if (!U_IS_UNICODE_CHAR(ch)) { continue; }
114     ucs2utf(ch, (unsigned char *)str);
115     if (normalize(str, norm, mode)) {
116       printf("ch=%04x error occure\n", ch);
117       continue;
118     }
119     if (strcmp(norm, str)) {
120       printf("%04x\t%s\t%s\n", ch, str, norm);
121     }
122   }
123 }
124 
125 void
ccdump(void)126 ccdump(void)
127 {
128   UChar32 ch;
129   char str[7], nfd[BUF_SIZE], nfc[BUF_SIZE];
130   for (ch = 1; ch < MAX_UNICODE; ch++) {
131     if (!U_IS_UNICODE_CHAR(ch)) { continue; }
132     ucs2utf(ch, (unsigned char *)str);
133     if (normalize(str, nfd, UNORM_NFD)) {
134       printf("ch=%04x error occure\n", ch);
135       continue;
136     }
137     if (normalize(str, nfc, UNORM_NFC)) {
138       printf("ch=%04x error occure\n", ch);
139       continue;
140     }
141     if (strcmp(nfd, nfc)) {
142       printf("%04x\t%s\t%s\n", ch, nfd, nfc);
143     }
144   }
145 }
146 
/*
 * Character classes assigned by gcdump().  The numeric values double as
 * indices into ctypes[] below, so both lists must stay in the same order.
 */
enum {
  ctype_null     = 0,
  ctype_alpha    = 1,
  ctype_digit    = 2,
  ctype_symbol   = 3,
  ctype_hiragana = 4,
  ctype_katakana = 5,
  ctype_kanji    = 6,
  ctype_others   = 7
};

/* Printable name of each character class, indexed by the enum above. */
static const char *ctypes[] = {
  "GRN_CHAR_NULL",     /* ctype_null */
  "GRN_CHAR_ALPHA",    /* ctype_alpha */
  "GRN_CHAR_DIGIT",    /* ctype_digit */
  "GRN_CHAR_SYMBOL",   /* ctype_symbol */
  "GRN_CHAR_HIRAGANA", /* ctype_hiragana */
  "GRN_CHAR_KATAKANA", /* ctype_katakana */
  "GRN_CHAR_KANJI",    /* ctype_kanji */
  "GRN_CHAR_OTHERS"    /* ctype_others */
};
168 
169 void
gcdump(void)170 gcdump(void)
171 {
172   UChar32 ch;
173   unsigned char *p, src[7];
174   int ctype, lc = -1;
175   for (ch = 1; ch < MAX_UNICODE; ch++) {
176     UCharCategory cat;
177     UBlockCode code;
178     if (!U_IS_UNICODE_CHAR(ch)) { continue; }
179     code = ublock_getCode(ch);
180     switch (code) {
181     case UBLOCK_CJK_RADICALS_SUPPLEMENT: /* cjk radicals */
182     case UBLOCK_KANGXI_RADICALS: /* kanji radicals */
183     case UBLOCK_BOPOMOFO: /* bopomofo letter */
184     case UBLOCK_HANGUL_COMPATIBILITY_JAMO: /* hangul letter */
185     case UBLOCK_KANBUN: /* kaeri ten used in kanbun ex. re-ten */
186     case UBLOCK_BOPOMOFO_EXTENDED: /* bopomofo extended letter */
187     case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A: /* cjk letter */
188     case UBLOCK_CJK_UNIFIED_IDEOGRAPHS: /* cjk letter */
189     case UBLOCK_YI_SYLLABLES: /* Yi syllables */
190     case UBLOCK_YI_RADICALS: /* Yi radicals */
191     case UBLOCK_HANGUL_SYLLABLES: /* hangul syllables */
192     case UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS: /* cjk letter */
193     case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B: /* cjk letter */
194     case UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT: /* cjk letter */
195     case UBLOCK_CJK_STROKES: /* kakijun*/
196       ctype = ctype_kanji;
197       break;
198     case UBLOCK_CJK_SYMBOLS_AND_PUNCTUATION: /* symbols ex. JIS mark */
199     case UBLOCK_ENCLOSED_CJK_LETTERS_AND_MONTHS: /* ex. (kabu) */
200     case UBLOCK_CJK_COMPATIBILITY: /* symbols ex. ton doll */
201     case UBLOCK_CJK_COMPATIBILITY_FORMS: /* symbols ex. tategaki kagi-kakko */
202       ctype = ctype_symbol;
203       break;
204     case UBLOCK_HIRAGANA:
205       ctype = ctype_hiragana;
206       break;
207     case UBLOCK_KATAKANA:
208     case UBLOCK_KATAKANA_PHONETIC_EXTENSIONS:
209       ctype = ctype_katakana;
210       break;
211     default:
212       cat = u_charType(ch);
213       switch (cat) {
214       case U_UPPERCASE_LETTER:
215       case U_LOWERCASE_LETTER:
216       case U_TITLECASE_LETTER:
217       case U_MODIFIER_LETTER:
218       case U_OTHER_LETTER:
219         ctype = ctype_alpha;
220         break;
221       case U_DECIMAL_DIGIT_NUMBER:
222       case U_LETTER_NUMBER:
223       case U_OTHER_NUMBER:
224         ctype = ctype_digit;
225         break;
226       case U_DASH_PUNCTUATION:
227       case U_START_PUNCTUATION:
228       case U_END_PUNCTUATION:
229       case U_CONNECTOR_PUNCTUATION:
230       case U_OTHER_PUNCTUATION:
231       case U_MATH_SYMBOL:
232       case U_CURRENCY_SYMBOL:
233       case U_MODIFIER_SYMBOL:
234       case U_OTHER_SYMBOL:
235         ctype = ctype_symbol;
236         break;
237       default:
238         ctype = ctype_others;
239         break;
240       }
241       break;
242     }
243     if (ctype != lc) {
244       ucs2utf(ch, src);
245       for (p = src; *p; p++) {
246         printf("%x:", *p);
247       }
248       printf("\t%04x\t%s\n", ch, ctypes[ctype]);
249     }
250     lc = ctype;
251   }
252 }
253 
/*
 * Long options accepted by main().  None of them takes an argument and
 * each one maps to the short option character in the last field.
 *
 * getopt_long() finds the end of the table by looking for an element that
 * is filled with zeros, so the terminating entry must not be removed;
 * without it an unrecognized long option makes getopt_long() read past the
 * end of the array.
 */
struct option options[] = {
  {"bc",      no_argument, NULL, 'b'},
  {"nfd",     no_argument, NULL, 'd'},
  {"nfkd",    no_argument, NULL, 'D'},
  {"nfc",     no_argument, NULL, 'c'},
  {"nfkc",    no_argument, NULL, 'C'},
  {"cc",      no_argument, NULL, 'o'},
  {"gc",      no_argument, NULL, 'g'},
  {"version", no_argument, NULL, 'v'},
  {NULL,      0,           NULL, 0} /* terminator required by getopt_long() */
};
264 
265 int
main(int argc,char ** argv)266 main(int argc, char **argv)
267 {
268   switch (getopt_long(argc, argv, "bdDcCogv", options, NULL)) {
269   case 'b' :
270     blockcode();
271     break;
272   case 'd' :
273     dump(UNORM_NFD);
274     break;
275   case 'D' :
276     dump(UNORM_NFKD);
277     break;
278   case 'c' :
279     dump(UNORM_NFC);
280     break;
281   case 'C' :
282     dump(UNORM_NFKC);
283     break;
284   case 'o' :
285     ccdump();
286     break;
287   case 'g' :
288     gcdump();
289     break;
290   case 'v' :
291     printf("%s\n", U_UNICODE_VERSION);
292     break;
293   default :
294     fputs("usage: icudump --[bc|nfd|nfkd|nfc|nfkc|cc|gc|version]\n", stderr);
295     break;
296   }
297   return 0;
298 }
299