1 /* -*- c-basic-offset: 2 -*- */
2 /* Copyright(C) 2010 Brazil
3
4 This library is free software; you can redistribute it and/or
5 modify it under the terms of the GNU Lesser General Public
6 License version 2.1 as published by the Free Software Foundation.
7
8 This library is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 Lesser General Public License for more details.
12
13 You should have received a copy of the GNU Lesser General Public
14 License along with this library; if not, write to the Free Software
15 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
16 */
17 #include <stdio.h>
18 #include <getopt.h>
19 #include <unistd.h>
20 #include <string.h>
21 #include <unicode/utf.h>
22 #include <unicode/uchar.h>
23 #include <unicode/unorm.h>
24 #include <unicode/ustring.h>
25
26 #define MAX_UNICODE 0x110000
27 #define BUF_SIZE 0x100
28
/*
 * Encode the code point `i` into `buf` as a NUL-terminated UTF-8 style byte
 * sequence (the original 1..6 byte scheme, covering values below 0x80000000).
 *
 * buf must have room for at least 7 bytes (6 encoded bytes + terminator).
 *
 * Returns the number of encoded bytes, not counting the terminating NUL.
 * Values >= 0x80000000 cannot be represented: buf is set to the empty
 * string and 0 is returned instead of emitting continuation bytes without
 * a lead byte.
 */
static int
ucs2utf(unsigned int i, unsigned char *buf)
{
  unsigned char *p = buf;
  int trail; /* number of continuation bytes following the lead byte */

  if (i < 0x80) {
    *p++ = (unsigned char)i;
    trail = 0;
  } else if (i < 0x800) {
    *p++ = (unsigned char)((i >> 6) | 0xc0);
    trail = 1;
  } else if (i < 0x00010000) {
    *p++ = (unsigned char)((i >> 12) | 0xe0);
    trail = 2;
  } else if (i < 0x00200000) {
    *p++ = (unsigned char)((i >> 18) | 0xf0);
    trail = 3;
  } else if (i < 0x04000000) {
    *p++ = (unsigned char)((i >> 24) | 0xf8);
    trail = 4;
  } else if (i < 0x80000000) {
    *p++ = (unsigned char)((i >> 30) | 0xfc);
    trail = 5;
  } else {
    /* not encodable: report it rather than writing a malformed sequence */
    *buf = '\0';
    return 0;
  }

  /* continuation bytes carry 6 payload bits each, most significant first */
  while (trail-- > 0) {
    *p++ = (unsigned char)(((i >> (6 * trail)) & 0x3f) | 0x80);
  }
  *p = '\0';
  return (int)(p - buf);
}
62
63 void
blockcode(void)64 blockcode(void)
65 {
66 UChar32 ch;
67 unsigned char *p, src[7];
68 UBlockCode code, lc = -1;
69 for (ch = 1; ch < MAX_UNICODE; ch++) {
70 if (!U_IS_UNICODE_CHAR(ch)) { continue; }
71 code = ublock_getCode(ch);
72 if (code != lc) {
73 ucs2utf(ch, src);
74 for (p = src; *p; p++) {
75 printf("%x:", *p);
76 }
77 printf("\t%04x\t%d\n", ch, code);
78 }
79 lc = code;
80 }
81 }
82
83 int
normalize(const char * str,char * res,UNormalizationMode mode)84 normalize(const char *str, char *res, UNormalizationMode mode)
85 {
86 UErrorCode rc;
87 int32_t ulen, nlen;
88 UChar ubuf[BUF_SIZE], nbuf[BUF_SIZE];
89 rc = U_ZERO_ERROR;
90 u_strFromUTF8(ubuf, BUF_SIZE, &ulen, str, -1, &rc);
91 if (rc != U_ZERO_ERROR /*&& rc != U_STRING_NOT_TERMINATED_WARNING*/) {
92 return -1;
93 }
94 rc = U_ZERO_ERROR;
95 nlen = unorm_normalize(ubuf, ulen, mode, 0, nbuf, BUF_SIZE, &rc);
96 if (rc != U_ZERO_ERROR /*&& rc != U_STRING_NOT_TERMINATED_WARNING*/) {
97 return -1;
98 }
99 rc = U_ZERO_ERROR;
100 u_strToUTF8(res, BUF_SIZE, NULL, nbuf, nlen, &rc);
101 if (rc != U_ZERO_ERROR /*&& rc != U_BUFFER_OVERFLOW_ERROR*/) {
102 return -1;
103 }
104 return 0;
105 }
106
107 void
dump(UNormalizationMode mode)108 dump(UNormalizationMode mode)
109 {
110 UChar32 ch;
111 char str[7], norm[BUF_SIZE];
112 for (ch = 1; ch < MAX_UNICODE; ch++) {
113 if (!U_IS_UNICODE_CHAR(ch)) { continue; }
114 ucs2utf(ch, (unsigned char *)str);
115 if (normalize(str, norm, mode)) {
116 printf("ch=%04x error occure\n", ch);
117 continue;
118 }
119 if (strcmp(norm, str)) {
120 printf("%04x\t%s\t%s\n", ch, str, norm);
121 }
122 }
123 }
124
125 void
ccdump(void)126 ccdump(void)
127 {
128 UChar32 ch;
129 char str[7], nfd[BUF_SIZE], nfc[BUF_SIZE];
130 for (ch = 1; ch < MAX_UNICODE; ch++) {
131 if (!U_IS_UNICODE_CHAR(ch)) { continue; }
132 ucs2utf(ch, (unsigned char *)str);
133 if (normalize(str, nfd, UNORM_NFD)) {
134 printf("ch=%04x error occure\n", ch);
135 continue;
136 }
137 if (normalize(str, nfc, UNORM_NFC)) {
138 printf("ch=%04x error occure\n", ch);
139 continue;
140 }
141 if (strcmp(nfd, nfc)) {
142 printf("%04x\t%s\t%s\n", ch, nfd, nfc);
143 }
144 }
145 }
146
/* Character classes assigned by gcdump(); the values index into ctypes[]. */
enum {
  ctype_null     = 0,
  ctype_alpha    = 1,
  ctype_digit    = 2,
  ctype_symbol   = 3,
  ctype_hiragana = 4,
  ctype_katakana = 5,
  ctype_kanji    = 6,
  ctype_others   = 7
};

/* Printable name for each ctype_* value, keyed by the enumerator itself. */
static const char *ctypes[] = {
  [ctype_null]     = "GRN_CHAR_NULL",
  [ctype_alpha]    = "GRN_CHAR_ALPHA",
  [ctype_digit]    = "GRN_CHAR_DIGIT",
  [ctype_symbol]   = "GRN_CHAR_SYMBOL",
  [ctype_hiragana] = "GRN_CHAR_HIRAGANA",
  [ctype_katakana] = "GRN_CHAR_KATAKANA",
  [ctype_kanji]    = "GRN_CHAR_KANJI",
  [ctype_others]   = "GRN_CHAR_OTHERS"
};
168
169 void
gcdump(void)170 gcdump(void)
171 {
172 UChar32 ch;
173 unsigned char *p, src[7];
174 int ctype, lc = -1;
175 for (ch = 1; ch < MAX_UNICODE; ch++) {
176 UCharCategory cat;
177 UBlockCode code;
178 if (!U_IS_UNICODE_CHAR(ch)) { continue; }
179 code = ublock_getCode(ch);
180 switch (code) {
181 case UBLOCK_CJK_RADICALS_SUPPLEMENT: /* cjk radicals */
182 case UBLOCK_KANGXI_RADICALS: /* kanji radicals */
183 case UBLOCK_BOPOMOFO: /* bopomofo letter */
184 case UBLOCK_HANGUL_COMPATIBILITY_JAMO: /* hangul letter */
185 case UBLOCK_KANBUN: /* kaeri ten used in kanbun ex. re-ten */
186 case UBLOCK_BOPOMOFO_EXTENDED: /* bopomofo extended letter */
187 case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A: /* cjk letter */
188 case UBLOCK_CJK_UNIFIED_IDEOGRAPHS: /* cjk letter */
189 case UBLOCK_YI_SYLLABLES: /* Yi syllables */
190 case UBLOCK_YI_RADICALS: /* Yi radicals */
191 case UBLOCK_HANGUL_SYLLABLES: /* hangul syllables */
192 case UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS: /* cjk letter */
193 case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B: /* cjk letter */
194 case UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT: /* cjk letter */
195 case UBLOCK_CJK_STROKES: /* kakijun*/
196 ctype = ctype_kanji;
197 break;
198 case UBLOCK_CJK_SYMBOLS_AND_PUNCTUATION: /* symbols ex. JIS mark */
199 case UBLOCK_ENCLOSED_CJK_LETTERS_AND_MONTHS: /* ex. (kabu) */
200 case UBLOCK_CJK_COMPATIBILITY: /* symbols ex. ton doll */
201 case UBLOCK_CJK_COMPATIBILITY_FORMS: /* symbols ex. tategaki kagi-kakko */
202 ctype = ctype_symbol;
203 break;
204 case UBLOCK_HIRAGANA:
205 ctype = ctype_hiragana;
206 break;
207 case UBLOCK_KATAKANA:
208 case UBLOCK_KATAKANA_PHONETIC_EXTENSIONS:
209 ctype = ctype_katakana;
210 break;
211 default:
212 cat = u_charType(ch);
213 switch (cat) {
214 case U_UPPERCASE_LETTER:
215 case U_LOWERCASE_LETTER:
216 case U_TITLECASE_LETTER:
217 case U_MODIFIER_LETTER:
218 case U_OTHER_LETTER:
219 ctype = ctype_alpha;
220 break;
221 case U_DECIMAL_DIGIT_NUMBER:
222 case U_LETTER_NUMBER:
223 case U_OTHER_NUMBER:
224 ctype = ctype_digit;
225 break;
226 case U_DASH_PUNCTUATION:
227 case U_START_PUNCTUATION:
228 case U_END_PUNCTUATION:
229 case U_CONNECTOR_PUNCTUATION:
230 case U_OTHER_PUNCTUATION:
231 case U_MATH_SYMBOL:
232 case U_CURRENCY_SYMBOL:
233 case U_MODIFIER_SYMBOL:
234 case U_OTHER_SYMBOL:
235 ctype = ctype_symbol;
236 break;
237 default:
238 ctype = ctype_others;
239 break;
240 }
241 break;
242 }
243 if (ctype != lc) {
244 ucs2utf(ch, src);
245 for (p = src; *p; p++) {
246 printf("%x:", *p);
247 }
248 printf("\t%04x\t%s\n", ch, ctypes[ctype]);
249 }
250 lc = ctype;
251 }
252 }
253
/*
 * Long options understood by main().  None of them takes an argument, and
 * since `flag` is NULL getopt_long() returns `val`, i.e. the matching short
 * option letter.
 *
 * getopt_long() requires the array to end with an all-zero element; without
 * it an unrecognised long option makes the scan run past the end of the
 * array.
 */
struct option options[] = {
  {"bc",      no_argument, NULL, 'b'},
  {"nfd",     no_argument, NULL, 'd'},
  {"nfkd",    no_argument, NULL, 'D'},
  {"nfc",     no_argument, NULL, 'c'},
  {"nfkc",    no_argument, NULL, 'C'},
  {"cc",      no_argument, NULL, 'o'},
  {"gc",      no_argument, NULL, 'g'},
  {"version", no_argument, NULL, 'v'},
  {NULL,      0,           NULL, 0}
};
264
265 int
main(int argc,char ** argv)266 main(int argc, char **argv)
267 {
268 switch (getopt_long(argc, argv, "bdDcCogv", options, NULL)) {
269 case 'b' :
270 blockcode();
271 break;
272 case 'd' :
273 dump(UNORM_NFD);
274 break;
275 case 'D' :
276 dump(UNORM_NFKD);
277 break;
278 case 'c' :
279 dump(UNORM_NFC);
280 break;
281 case 'C' :
282 dump(UNORM_NFKC);
283 break;
284 case 'o' :
285 ccdump();
286 break;
287 case 'g' :
288 gcdump();
289 break;
290 case 'v' :
291 printf("%s\n", U_UNICODE_VERSION);
292 break;
293 default :
294 fputs("usage: icudump --[bc|nfd|nfkd|nfc|nfkc|cc|gc|version]\n", stderr);
295 break;
296 }
297 return 0;
298 }
299