1
2 /*
3 * Generates a CJK character set table from a .TXT table as found on
4 * ftp.unicode.org or in the X nls directory.
5 * Examples:
6 *
7 * ./cjk_tab_to_h GB2312.1980-0 gb2312 > gb2312.h < gb2312
8 * ./cjk_tab_to_h JISX0208.1983-0 jisx0208 > jisx0208.h < jis0208
9 * ./cjk_tab_to_h KSC5601.1987-0 ksc5601 > ksc5601.h < ksc5601
10 *
11 * ./cjk_tab_to_h GB2312.1980-0 gb2312 > gb2312.h < GB2312.TXT
12 * ./cjk_tab_to_h JISX0208.1983-0 jisx0208 > jisx0208.h < JIS0208.TXT
13 * ./cjk_tab_to_h JISX0212.1990-0 jisx0212 > jisx0212.h < JIS0212.TXT
14 * ./cjk_tab_to_h KSC5601.1987-0 ksc5601 > ksc5601.h < KSC5601.TXT
15 * ./cjk_tab_to_h KSX1001.1992-0 ksc5601 > ksc5601.h < KSX1001.TXT
16 *
17 * ./cjk_tab_to_h BIG5 big5 > big5.h < BIG5.TXT
18 *
19 * ./cjk_tab_to_h JOHAB johab > johab.h < JOHAB.TXT
20 *
21 * ./cjk_tab_to_h BIG5HKSCS-0 big5hkscs >big5hkscs.h < BIG5HKSCS.TXT
22 */
23
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <stdbool.h>
27 #include <string.h>
28
/*
 * A contiguous run of linear table indices.  The range is half-open:
 * `start' is the first index covered, `end' is one past the last one.
 */
typedef struct {
  int start;    /* first index inside the block */
  int end;      /* first index after the block */
} Block;
33
34 typedef struct {
35 int rows; /* number of possible values for the 1st byte */
36 int cols; /* number of possible values for the 2nd byte */
37 int (*row_byte) (int row); /* returns the 1st byte value for a given row */
38 int (*col_byte) (int col); /* returns the 2nd byte value for a given col */
39 int (*byte_row) (int byte); /* converts a 1st byte value to a row, else -1 */
40 int (*byte_col) (int byte); /* converts a 2nd byte value to a col, else -1 */
41 const char* check_row_expr; /* format string for 1st byte value checking */
42 const char* check_col_expr; /* format string for 2nd byte value checking */
43 const char* byte_row_expr; /* format string for 1st byte value to row */
44 const char* byte_col_expr; /* format string for 2nd byte value to col */
45 int** charset2uni; /* charset2uni[0..rows-1][0..cols-1] is valid */
46 /* You'll understand the terms "row" and "col" when you buy Ken Lunde's book.
47 Once a row is fixed, choosing a "col" is the same as choosing a "cell". */
48 int* charsetpage; /* charsetpage[0..rows]: how large is a page for a row */
49 int ncharsetblocks;
50 Block* charsetblocks; /* blocks[0..nblocks-1] */
51 int* uni2charset; /* uni2charset[0x0000..0xffff] */
52 } Encoding;
53
/*
 * Outputs the file title: a blank line, a three-line C comment holding
 * the character set name, and another blank line.
 */
static void output_title (const char *charsetname)
{
  /* The whole banner is emitted by one call; the bytes written are the
     same as when each line is printed separately. */
  printf("\n"
         "/*\n"
         " * %s\n"
         " */\n"
         "\n",
         charsetname);
}
65
66 /*
67 * Reads the charset2uni table from standard input.
68 */
read_table(Encoding * enc)69 static void read_table (Encoding* enc)
70 {
71 int row, col, i, i1, i2, c, j;
72
73 enc->charset2uni = malloc(enc->rows*sizeof(int*));
74 for (row = 0; row < enc->rows; row++)
75 enc->charset2uni[row] = malloc(enc->cols*sizeof(int));
76
77 for (row = 0; row < enc->rows; row++)
78 for (col = 0; col < enc->cols; col++)
79 enc->charset2uni[row][col] = 0xfffd;
80
81 c = getc(stdin);
82 ungetc(c,stdin);
83 if (c == '#') {
84 /* Read a unicode.org style .TXT file. */
85 for (;;) {
86 c = getc(stdin);
87 if (c == EOF)
88 break;
89 if (c == '\n' || c == ' ' || c == '\t')
90 continue;
91 if (c == '#') {
92 do { c = getc(stdin); } while (!(c == EOF || c == '\n'));
93 continue;
94 }
95 ungetc(c,stdin);
96 if (scanf("0x%x", &j) != 1)
97 exit(1);
98 i1 = j >> 8;
99 i2 = j & 0xff;
100 row = enc->byte_row(i1);
101 col = enc->byte_col(i2);
102 if (row < 0 || col < 0) {
103 fprintf(stderr, "lost entry for %02x %02x\n", i1, i2);
104 exit(1);
105 }
106 if (scanf(" 0x%x", &enc->charset2uni[row][col]) != 1)
107 exit(1);
108 }
109 } else {
110 /* Read a table of hexadecimal Unicode values. */
111 for (i1 = 32; i1 < 132; i1++)
112 for (i2 = 32; i2 < 132; i2++) {
113 i = scanf("%x", &j);
114 if (i == EOF)
115 goto read_done;
116 if (i != 1)
117 exit(1);
118 if (j < 0 || j == 0xffff)
119 j = 0xfffd;
120 if (j != 0xfffd) {
121 if (enc->byte_row(i1) < 0 || enc->byte_col(i2) < 0) {
122 fprintf(stderr, "lost entry at %02x %02x\n", i1, i2);
123 exit (1);
124 }
125 enc->charset2uni[enc->byte_row(i1)][enc->byte_col(i2)] = j;
126 }
127 }
128 read_done: ;
129 }
130 }
131
132 /*
133 * Computes the charsetpage[0..rows] array.
134 */
find_charset2uni_pages(Encoding * enc)135 static void find_charset2uni_pages (Encoding* enc)
136 {
137 int row, col;
138
139 enc->charsetpage = malloc((enc->rows+1)*sizeof(int));
140
141 for (row = 0; row <= enc->rows; row++)
142 enc->charsetpage[row] = 0;
143
144 for (row = 0; row < enc->rows; row++) {
145 int used = 0;
146 for (col = 0; col < enc->cols; col++)
147 if (enc->charset2uni[row][col] != 0xfffd)
148 used = col+1;
149 enc->charsetpage[row] = used;
150 }
151 }
152
153 /*
154 * Fills in nblocks and blocks.
155 */
find_charset2uni_blocks(Encoding * enc)156 static void find_charset2uni_blocks (Encoding* enc)
157 {
158 int n, row, lastrow;
159
160 enc->charsetblocks = malloc(enc->rows*sizeof(Block));
161
162 n = 0;
163 for (row = 0; row < enc->rows; row++)
164 if (enc->charsetpage[row] > 0 && (row == 0 || enc->charsetpage[row-1] == 0)) {
165 for (lastrow = row; enc->charsetpage[lastrow+1] > 0; lastrow++);
166 enc->charsetblocks[n].start = row * enc->cols;
167 enc->charsetblocks[n].end = lastrow * enc->cols + enc->charsetpage[lastrow];
168 n++;
169 }
170 enc->ncharsetblocks = n;
171 }
172
173 /*
174 * Outputs the charset to unicode table and function.
175 */
output_charset2uni(const char * name,Encoding * enc)176 static void output_charset2uni (const char* name, Encoding* enc)
177 {
178 int row, col, lastrow, col_max, i, i1_min, i1_max;
179
180 find_charset2uni_pages(enc);
181
182 find_charset2uni_blocks(enc);
183
184 for (row = 0; row < enc->rows; row++)
185 if (enc->charsetpage[row] > 0) {
186 if (row == 0 || enc->charsetpage[row-1] == 0) {
187 /* Start a new block. */
188 for (lastrow = row; enc->charsetpage[lastrow+1] > 0; lastrow++);
189 printf("static const unsigned short %s_2uni_page%02x[%d] = {\n",
190 name, enc->row_byte(row),
191 (lastrow-row) * enc->cols + enc->charsetpage[lastrow]);
192 }
193 printf(" /""* 0x%02x *""/\n ", enc->row_byte(row));
194 col_max = (enc->charsetpage[row+1] > 0 ? enc->cols : enc->charsetpage[row]);
195 for (col = 0; col < col_max; col++) {
196 printf(" 0x%04x,", enc->charset2uni[row][col]);
197 if ((col % 8) == 7 && (col+1 < col_max)) printf("\n ");
198 }
199 printf("\n");
200 if (enc->charsetpage[row+1] == 0) {
201 /* End a block. */
202 printf("};\n");
203 }
204 }
205 printf("\n");
206
207 printf("static int\n");
208 printf("%s_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)\n", name);
209 printf("{\n");
210 printf(" unsigned char c1 = s[0];\n");
211 printf(" if (");
212 for (i = 0; i < enc->ncharsetblocks; i++) {
213 i1_min = enc->row_byte(enc->charsetblocks[i].start / enc->cols);
214 i1_max = enc->row_byte((enc->charsetblocks[i].end-1) / enc->cols);
215 if (i > 0)
216 printf(" || ");
217 if (i1_min == i1_max)
218 printf("(c1 == 0x%02x)", i1_min);
219 else
220 printf("(c1 >= 0x%02x && c1 <= 0x%02x)", i1_min, i1_max);
221 }
222 printf(") {\n");
223 printf(" if (n >= 2) {\n");
224 printf(" unsigned char c2 = s[1];\n");
225 printf(" if (");
226 printf(enc->check_col_expr, "c2");
227 printf(") {\n");
228 printf(" unsigned int i = %d * (", enc->cols);
229 printf(enc->byte_row_expr, "c1");
230 printf(") + (");
231 printf(enc->byte_col_expr, "c2");
232 printf(");\n");
233 printf(" unsigned short wc = 0xfffd;\n");
234 for (i = 0; i < enc->ncharsetblocks; i++) {
235 printf(" ");
236 if (i > 0)
237 printf("} else ");
238 if (i < enc->ncharsetblocks-1)
239 printf("if (i < %d) ", enc->charsetblocks[i+1].start);
240 printf("{\n");
241 printf(" if (i < %d)\n", enc->charsetblocks[i].end);
242 printf(" wc = %s_2uni_page%02x[i", name, enc->row_byte(enc->charsetblocks[i].start / enc->cols));
243 if (enc->charsetblocks[i].start > 0)
244 printf("-%d", enc->charsetblocks[i].start);
245 printf("];\n");
246 }
247 printf(" }\n");
248 printf(" if (wc != 0xfffd) {\n");
249 printf(" *pwc = (ucs4_t) wc;\n");
250 printf(" return 2;\n");
251 printf(" }\n");
252 printf(" }\n");
253 printf(" return RET_ILSEQ;\n");
254 printf(" }\n");
255 printf(" return RET_TOOFEW(0);\n");
256 printf(" }\n");
257 printf(" return RET_ILSEQ;\n");
258 printf("}\n");
259 printf("\n");
260 }
261
262 /*
263 * Computes the uni2charset[0x0000..0xffff] array.
264 */
invert(Encoding * enc)265 static void invert (Encoding* enc)
266 {
267 int row, col, j;
268
269 enc->uni2charset = malloc(0x10000*sizeof(int));
270
271 for (j = 0; j < 0x10000; j++)
272 enc->uni2charset[j] = 0;
273
274 for (row = 0; row < enc->rows; row++)
275 for (col = 0; col < enc->cols; col++) {
276 j = enc->charset2uni[row][col];
277 if (j != 0xfffd)
278 enc->uni2charset[j] = 0x100 * enc->row_byte(row) + enc->col_byte(col);
279 }
280 }
281
282 /*
283 * Outputs the unicode to charset table and function, using a linear array.
284 * (Suitable if the table is dense.)
285 */
output_uni2charset_dense(const char * name,Encoding * enc)286 static void output_uni2charset_dense (const char* name, Encoding* enc)
287 {
288 /* Like in 8bit_tab_to_h.c */
289 bool pages[0x100];
290 int line[0x2000];
291 int tableno;
292 struct { int minline; int maxline; int usecount; } tables[0x2000];
293 bool first;
294 int row, col, j, p, j1, j2, t;
295
296 for (p = 0; p < 0x100; p++)
297 pages[p] = false;
298 for (row = 0; row < enc->rows; row++)
299 for (col = 0; col < enc->cols; col++) {
300 j = enc->charset2uni[row][col];
301 if (j != 0xfffd)
302 pages[j>>8] = true;
303 }
304 for (j1 = 0; j1 < 0x2000; j1++) {
305 bool all_invalid = true;
306 for (j2 = 0; j2 < 8; j2++) {
307 j = 8*j1+j2;
308 if (enc->uni2charset[j] != 0)
309 all_invalid = false;
310 }
311 if (all_invalid)
312 line[j1] = -1;
313 else
314 line[j1] = 0;
315 }
316 tableno = 0;
317 for (j1 = 0; j1 < 0x2000; j1++) {
318 if (line[j1] >= 0) {
319 if (tableno > 0
320 && ((j1 > 0 && line[j1-1] == tableno-1)
321 || ((tables[tableno-1].maxline >> 5) == (j1 >> 5)
322 && j1 - tables[tableno-1].maxline <= 8))) {
323 line[j1] = tableno-1;
324 tables[tableno-1].maxline = j1;
325 } else {
326 tableno++;
327 line[j1] = tableno-1;
328 tables[tableno-1].minline = tables[tableno-1].maxline = j1;
329 }
330 }
331 }
332 for (t = 0; t < tableno; t++) {
333 tables[t].usecount = 0;
334 j1 = 8*tables[t].minline;
335 j2 = 8*(tables[t].maxline+1);
336 for (j = j1; j < j2; j++)
337 if (enc->uni2charset[j] != 0)
338 tables[t].usecount++;
339 }
340 {
341 p = -1;
342 for (t = 0; t < tableno; t++)
343 if (tables[t].usecount > 1) {
344 p = tables[t].minline >> 5;
345 printf("static const unsigned short %s_page%02x[%d] = {\n", name, p, 8*(tables[t].maxline-tables[t].minline+1));
346 for (j1 = tables[t].minline; j1 <= tables[t].maxline; j1++) {
347 if ((j1 % 0x20) == 0 && j1 > tables[t].minline)
348 printf(" /* 0x%04x */\n", 8*j1);
349 printf(" ");
350 for (j2 = 0; j2 < 8; j2++) {
351 j = 8*j1+j2;
352 printf(" 0x%04x,", enc->uni2charset[j]);
353 }
354 printf(" /*0x%02x-0x%02x*/\n", 8*(j1 % 0x20), 8*(j1 % 0x20)+7);
355 }
356 printf("};\n");
357 }
358 if (p >= 0)
359 printf("\n");
360 }
361 printf("static int\n%s_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)\n", name);
362 printf("{\n");
363 printf(" if (n >= 2) {\n");
364 printf(" unsigned short c = 0;\n");
365 first = true;
366 for (j1 = 0; j1 < 0x2000;) {
367 t = line[j1];
368 for (j2 = j1; j2 < 0x2000 && line[j2] == t; j2++);
369 if (t >= 0) {
370 if (j1 != tables[t].minline) abort();
371 if (j2 > tables[t].maxline+1) abort();
372 j2 = tables[t].maxline+1;
373 if (first)
374 printf(" ");
375 else
376 printf(" else ");
377 first = false;
378 if (tables[t].usecount == 0) abort();
379 if (tables[t].usecount == 1) {
380 if (j2 != j1+1) abort();
381 for (j = 8*j1; j < 8*j2; j++)
382 if (enc->uni2charset[j] != 0) {
383 printf("if (wc == 0x%04x)\n c = 0x%02x;\n", j, enc->uni2charset[j]);
384 break;
385 }
386 } else {
387 if (j1 == 0) {
388 printf("if (wc < 0x%04x)", 8*j2);
389 } else {
390 printf("if (wc >= 0x%04x && wc < 0x%04x)", 8*j1, 8*j2);
391 }
392 printf("\n c = %s_page%02x[wc", name, j1 >> 5);
393 if (tables[t].minline > 0)
394 printf("-0x%04x", 8*j1);
395 printf("];\n");
396 }
397 }
398 j1 = j2;
399 }
400 printf(" if (c != 0) {\n");
401 printf(" r[0] = (c >> 8); r[1] = (c & 0xff);\n");
402 printf(" return 2;\n");
403 printf(" }\n");
404 printf(" return RET_ILSEQ;\n");
405 printf(" }\n");
406 printf(" return RET_TOOSMALL;\n");
407 printf("}\n");
408 }
409
410 /*
411 * Outputs the unicode to charset table and function, using a packed array.
412 * (Suitable if the table is sparse.)
413 */
output_uni2charset_sparse(const char * name,Encoding * enc)414 static void output_uni2charset_sparse (const char* name, Encoding* enc)
415 {
416 bool pages[0x100];
417 Block pageblocks[0x100]; int npageblocks;
418 int indx2charset[0x10000];
419 int summary_indx[0x1000];
420 int summary_used[0x1000];
421 int i, row, col, j, p, j1, j2, indx;
422
423 /* Fill pages[0x100]. */
424 for (p = 0; p < 0x100; p++)
425 pages[p] = false;
426 for (row = 0; row < enc->rows; row++)
427 for (col = 0; col < enc->cols; col++) {
428 j = enc->charset2uni[row][col];
429 if (j != 0xfffd)
430 pages[j>>8] = true;
431 }
432
433 #if 0
434 for (p = 0; p < 0x100; p++)
435 if (pages[p]) {
436 printf("static const unsigned short %s_page%02x[256] = {\n", name, p);
437 for (j1 = 0; j1 < 32; j1++) {
438 printf(" ");
439 for (j2 = 0; j2 < 8; j2++)
440 printf("0x%04x, ", enc->uni2charset[256*p+8*j1+j2]);
441 printf("/""*0x%02x-0x%02x*""/\n", 8*j1, 8*j1+7);
442 }
443 printf("};\n");
444 }
445 printf("\n");
446 #endif
447
448 /* Fill summary_indx[] and summary_used[]. */
449 indx = 0;
450 for (j1 = 0; j1 < 0x1000; j1++) {
451 summary_indx[j1] = indx;
452 summary_used[j1] = 0;
453 for (j2 = 0; j2 < 16; j2++) {
454 j = 16*j1+j2;
455 if (enc->uni2charset[j] != 0) {
456 indx2charset[indx++] = enc->uni2charset[j];
457 summary_used[j1] |= (1 << j2);
458 }
459 }
460 }
461
462 /* Fill npageblocks and pageblocks[]. */
463 npageblocks = 0;
464 for (p = 0; p < 0x100; ) {
465 if (pages[p] && (p == 0 || !pages[p-1])) {
466 pageblocks[npageblocks].start = 16*p;
467 do p++; while (p < 0x100 && pages[p]);
468 j1 = 16*p;
469 while (summary_used[j1-1] == 0) j1--;
470 pageblocks[npageblocks].end = j1;
471 npageblocks++;
472 } else
473 p++;
474 }
475
476 printf("static const unsigned short %s_2charset[%d] = {\n", name, indx);
477 for (i = 0; i < indx; ) {
478 if ((i % 8) == 0) printf(" ");
479 printf(" 0x%04x,", indx2charset[i]);
480 i++;
481 if ((i % 8) == 0 || i == indx) printf("\n");
482 }
483 printf("};\n");
484 printf("\n");
485 for (i = 0; i < npageblocks; i++) {
486 printf("static const Summary16 %s_uni2indx_page%02x[%d] = {\n", name,
487 pageblocks[i].start/16, pageblocks[i].end-pageblocks[i].start);
488 for (j1 = pageblocks[i].start; j1 < pageblocks[i].end; ) {
489 if (((16*j1) % 0x100) == 0) printf(" /""* 0x%04x *""/\n", 16*j1);
490 if ((j1 % 4) == 0) printf(" ");
491 printf(" { %4d, 0x%04x },", summary_indx[j1], summary_used[j1]);
492 j1++;
493 if ((j1 % 4) == 0 || j1 == pageblocks[i].end) printf("\n");
494 }
495 printf("};\n");
496 }
497 printf("\n");
498
499 printf("static int\n");
500 printf("%s_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)\n", name);
501 printf("{\n");
502 printf(" if (n >= 2) {\n");
503 printf(" const Summary16 *summary = NULL;\n");
504 for (i = 0; i < npageblocks; i++) {
505 printf(" ");
506 if (i > 0)
507 printf("else ");
508 printf("if (wc >= 0x%04x && wc < 0x%04x)\n",
509 16*pageblocks[i].start, 16*pageblocks[i].end);
510 printf(" summary = &%s_uni2indx_page%02x[(wc>>4)", name,
511 pageblocks[i].start/16);
512 if (pageblocks[i].start > 0)
513 printf("-0x%03x", pageblocks[i].start);
514 printf("];\n");
515 }
516 printf(" if (summary) {\n");
517 printf(" unsigned short used = summary->used;\n");
518 printf(" unsigned int i = wc & 0x0f;\n");
519 printf(" if (used & ((unsigned short) 1 << i)) {\n");
520 printf(" unsigned short c;\n");
521 printf(" /* Keep in `used' only the bits 0..i-1. */\n");
522 printf(" used &= ((unsigned short) 1 << i) - 1;\n");
523 printf(" /* Add `summary->indx' and the number of bits set in `used'. */\n");
524 printf(" used = (used & 0x5555) + ((used & 0xaaaa) >> 1);\n");
525 printf(" used = (used & 0x3333) + ((used & 0xcccc) >> 2);\n");
526 printf(" used = (used & 0x0f0f) + ((used & 0xf0f0) >> 4);\n");
527 printf(" used = (used & 0x00ff) + (used >> 8);\n");
528 printf(" c = %s_2charset[summary->indx + used];\n", name);
529 printf(" r[0] = (c >> 8); r[1] = (c & 0xff);\n");
530 printf(" return 2;\n");
531 printf(" }\n");
532 printf(" }\n");
533 printf(" return RET_ILSEQ;\n");
534 printf(" }\n");
535 printf(" return RET_TOOSMALL;\n");
536 printf("}\n");
537 }
538
539 /* ISO-2022/EUC specifics */
540
/* Returns the 1st byte value of a row in a 94x94 set; row 0 is byte 0x21. */
static int row_byte_normal (int row)
{
  const int first_byte = 0x21;

  return first_byte + row;
}
/* Returns the 2nd byte value of a column in a 94x94 set; column 0 is byte 0x21. */
static int col_byte_normal (int col)
{
  const int first_byte = 0x21;

  return first_byte + col;
}
/*
 * Converts a 1st byte value to a row of a 94x94 set.
 * Returns byte - 0x21 for bytes 0x21..0x7e and -1 for every other value,
 * so that the result is always either a valid row (0..93) or negative.
 */
static int byte_row_normal (int byte)
{
  if (byte < 0x21 || byte >= 0x7f)
    return -1;
  return byte - 0x21;
}
/*
 * Converts a 2nd byte value to a column of a 94x94 set.
 * Returns byte - 0x21 for bytes 0x21..0x7e and -1 for every other value,
 * so that the result is always either a valid column (0..93) or negative.
 */
static int byte_col_normal (int byte)
{
  if (byte < 0x21 || byte >= 0x7f)
    return -1;
  return byte - 0x21;
}
545
do_normal(const char * name)546 static void do_normal (const char* name)
547 {
548 Encoding enc;
549
550 enc.rows = 94;
551 enc.cols = 94;
552 enc.row_byte = row_byte_normal;
553 enc.col_byte = col_byte_normal;
554 enc.byte_row = byte_row_normal;
555 enc.byte_col = byte_col_normal;
556 enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
557 enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
558 enc.byte_row_expr = "%1$s - 0x21";
559 enc.byte_col_expr = "%1$s - 0x21";
560
561 read_table(&enc);
562 output_charset2uni(name,&enc);
563 invert(&enc); output_uni2charset_sparse(name,&enc);
564 }
565
566 /* Note: On first sight, the jisx0212_2charset[] table seems to be in order,
567 starting from the charset=0x3021/uni=0x4e02 pair. But it's only mostly in
568 order. There are 75 out-of-order values, scattered all throughout the table.
569 */
570
do_normal_only_charset2uni(const char * name)571 static void do_normal_only_charset2uni (const char* name)
572 {
573 Encoding enc;
574
575 enc.rows = 94;
576 enc.cols = 94;
577 enc.row_byte = row_byte_normal;
578 enc.col_byte = col_byte_normal;
579 enc.byte_row = byte_row_normal;
580 enc.byte_col = byte_col_normal;
581 enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
582 enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
583 enc.byte_row_expr = "%1$s - 0x21";
584 enc.byte_col_expr = "%1$s - 0x21";
585
586 read_table(&enc);
587 output_charset2uni(name,&enc);
588 }
589
590 /* CNS 11643 specifics - trick to put two tables into one */
591
/*
 * Returns the "1st byte" value for a row of the combined table: the plane
 * (row / 94, counted from 0) goes into bits 8 and up, the byte proper
 * (0x21 + row % 94) into the low 8 bits.
 */
static int row_byte_cns11643 (int row) {
  int plane = row / 94;
  int row_in_plane = row % 94;

  return 0x100 * plane + row_in_plane + 0x21;
}
/*
 * Converts a "1st byte" value of the input table to a row of the combined
 * table.  The value carries a plane number 1..3 in bits 8 and up and the
 * byte proper in the low 8 bits; plane p, byte b maps to row
 * (p-1)*94 + (b-0x21).
 * Returns -1 when the plane is not 1..3 or the byte is not in 0x21..0x7e,
 * so that a stray byte can neither spill into the rows of another plane
 * nor produce a row beyond the table.
 */
static int byte_row_cns11643 (int byte) {
  int plane, low;

  if (byte < 0x100 || byte >= 0x400)
    return -1;
  plane = byte >> 8;
  low = byte & 0xff;
  if (low < 0x21 || low >= 0x7f)
    return -1;
  return (plane - 1) * 94 + (low - 0x21);
}
601
do_cns11643_only_uni2charset(const char * name)602 static void do_cns11643_only_uni2charset (const char* name)
603 {
604 Encoding enc;
605 int j, x;
606
607 enc.rows = 3*94;
608 enc.cols = 94;
609 enc.row_byte = row_byte_cns11643;
610 enc.col_byte = col_byte_normal;
611 enc.byte_row = byte_row_cns11643;
612 enc.byte_col = byte_col_normal;
613 enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
614 enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
615 enc.byte_row_expr = "%1$s - 0x21";
616 enc.byte_col_expr = "%1$s - 0x21";
617
618 read_table(&enc);
619 invert(&enc);
620 /* Move the 2 plane bits into the unused bits 15 and 7. */
621 for (j = 0; j < 0x10000; j++) {
622 x = enc.uni2charset[j];
623 if (x != 0) {
624 if (x & 0x8080) abort();
625 switch (x >> 16) {
626 case 0: /* plane 1 */ x = (x & 0xffff) | 0x0000; break;
627 case 1: /* plane 2 */ x = (x & 0xffff) | 0x0080; break;
628 case 2: /* plane 3 */ x = (x & 0xffff) | 0x8000; break;
629 default: abort();
630 }
631 enc.uni2charset[j] = x;
632 }
633 }
634 output_uni2charset_sparse(name,&enc);
635 }
636
637 /* GBK specifics */
638
/* Returns the 1st byte value of a row; row 0 is byte 0x81. */
static int row_byte_gbk1 (int row) {
  const int first_byte = 0x81;

  return first_byte + row;
}
/*
 * Returns the 2nd byte value of a column.  Columns 0..0x3e are the bytes
 * 0x40..0x7e; from column 0x3f on the bytes continue at 0x80, because
 * 0x7f is skipped.
 */
static int col_byte_gbk1 (int col) {
  if (col >= 0x3f)
    return col + 0x41;
  return col + 0x40;
}
/* Converts a 1st byte value in 0x81..0xfe to a row, else returns -1. */
static int byte_row_gbk1 (int byte) {
  if (byte < 0x81 || byte >= 0xff)
    return -1;
  return byte - 0x81;
}
/*
 * Converts a 2nd byte value to a column: 0x40..0x7e become 0..0x3e and
 * 0x80..0xfe become 0x3f..0xbd.  Returns -1 for any other value.
 */
static int byte_col_gbk1 (int byte) {
  int in_low_range = (byte >= 0x40 && byte < 0x7f);
  int in_high_range = (byte >= 0x80 && byte < 0xff);

  if (in_low_range)
    return byte - 0x40;
  if (in_high_range)
    return byte - 0x41;
  return -1;
}
659
do_gbk1(const char * name)660 static void do_gbk1 (const char* name)
661 {
662 Encoding enc;
663
664 enc.rows = 126;
665 enc.cols = 190;
666 enc.row_byte = row_byte_gbk1;
667 enc.col_byte = col_byte_gbk1;
668 enc.byte_row = byte_row_gbk1;
669 enc.byte_col = byte_col_gbk1;
670 enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
671 enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xff)";
672 enc.byte_row_expr = "%1$s - 0x81";
673 enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
674
675 read_table(&enc);
676 output_charset2uni(name,&enc);
677 invert(&enc); output_uni2charset_dense(name,&enc);
678 }
679
do_gbk1_only_charset2uni(const char * name)680 static void do_gbk1_only_charset2uni (const char* name)
681 {
682 Encoding enc;
683
684 enc.rows = 126;
685 enc.cols = 190;
686 enc.row_byte = row_byte_gbk1;
687 enc.col_byte = col_byte_gbk1;
688 enc.byte_row = byte_row_gbk1;
689 enc.byte_col = byte_col_gbk1;
690 enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
691 enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xff)";
692 enc.byte_row_expr = "%1$s - 0x81";
693 enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
694
695 read_table(&enc);
696 output_charset2uni(name,&enc);
697 }
698
/* Returns the 1st byte value of a row; row 0 is byte 0x81. */
static int row_byte_gbk2 (int row) {
  const int first_byte = 0x81;

  return first_byte + row;
}
/*
 * Returns the 2nd byte value of a column.  Columns 0..0x3e are the bytes
 * 0x40..0x7e; from column 0x3f on the bytes continue at 0x80, because
 * 0x7f is skipped.
 */
static int col_byte_gbk2 (int col) {
  if (col >= 0x3f)
    return col + 0x41;
  return col + 0x40;
}
/* Converts a 1st byte value in 0x81..0xfe to a row, else returns -1. */
static int byte_row_gbk2 (int byte) {
  if (byte < 0x81 || byte >= 0xff)
    return -1;
  return byte - 0x81;
}
/*
 * Converts a 2nd byte value to a column: 0x40..0x7e become 0..0x3e and
 * 0x80..0xa0 become 0x3f..0x5f.  Returns -1 for any other value.
 */
static int byte_col_gbk2 (int byte) {
  int in_low_range = (byte >= 0x40 && byte < 0x7f);
  int in_high_range = (byte >= 0x80 && byte < 0xa1);

  if (in_low_range)
    return byte - 0x40;
  if (in_high_range)
    return byte - 0x41;
  return -1;
}
719
do_gbk2_only_charset2uni(const char * name)720 static void do_gbk2_only_charset2uni (const char* name)
721 {
722 Encoding enc;
723
724 enc.rows = 126;
725 enc.cols = 96;
726 enc.row_byte = row_byte_gbk2;
727 enc.col_byte = col_byte_gbk2;
728 enc.byte_row = byte_row_gbk2;
729 enc.byte_col = byte_col_gbk2;
730 enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
731 enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xa1)";
732 enc.byte_row_expr = "%1$s - 0x81";
733 enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
734
735 read_table(&enc);
736 output_charset2uni(name,&enc);
737 }
738
do_gbk1_only_uni2charset(const char * name)739 static void do_gbk1_only_uni2charset (const char* name)
740 {
741 Encoding enc;
742
743 enc.rows = 126;
744 enc.cols = 190;
745 enc.row_byte = row_byte_gbk1;
746 enc.col_byte = col_byte_gbk1;
747 enc.byte_row = byte_row_gbk1;
748 enc.byte_col = byte_col_gbk1;
749 enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
750 enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xff)";
751 enc.byte_row_expr = "%1$s - 0x81";
752 enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
753
754 read_table(&enc);
755 invert(&enc); output_uni2charset_sparse(name,&enc);
756 }
757
758 /* KSC 5601 specifics */
759
760 /*
761 * Reads the charset2uni table from standard input.
762 */
read_table_ksc5601(Encoding * enc)763 static void read_table_ksc5601 (Encoding* enc)
764 {
765 int row, col, i, i1, i2, c, j;
766
767 enc->charset2uni = malloc(enc->rows*sizeof(int*));
768 for (row = 0; row < enc->rows; row++)
769 enc->charset2uni[row] = malloc(enc->cols*sizeof(int));
770
771 for (row = 0; row < enc->rows; row++)
772 for (col = 0; col < enc->cols; col++)
773 enc->charset2uni[row][col] = 0xfffd;
774
775 c = getc(stdin);
776 ungetc(c,stdin);
777 if (c == '#') {
778 /* Read a unicode.org style .TXT file. */
779 for (;;) {
780 c = getc(stdin);
781 if (c == EOF)
782 break;
783 if (c == '\n' || c == ' ' || c == '\t')
784 continue;
785 if (c == '#') {
786 do { c = getc(stdin); } while (!(c == EOF || c == '\n'));
787 continue;
788 }
789 ungetc(c,stdin);
790 if (scanf("0x%x", &j) != 1)
791 exit(1);
792 i1 = j >> 8;
793 i2 = j & 0xff;
794 if (scanf(" 0x%x", &j) != 1)
795 exit(1);
796 /* Take only the range covered by KS C 5601.1987-0 = KS C 5601.1989-0
797 = KS X 1001.1992, ignore the rest. */
798 if (!(i1 >= 128+33 && i1 < 128+127 && i2 >= 128+33 && i2 < 128+127))
799 continue; /* KSC5601 specific */
800 i1 &= 0x7f; /* KSC5601 specific */
801 i2 &= 0x7f; /* KSC5601 specific */
802 row = enc->byte_row(i1);
803 col = enc->byte_col(i2);
804 if (row < 0 || col < 0) {
805 fprintf(stderr, "lost entry for %02x %02x\n", i1, i2);
806 exit(1);
807 }
808 enc->charset2uni[row][col] = j;
809 }
810 } else {
811 /* Read a table of hexadecimal Unicode values. */
812 for (i1 = 33; i1 < 127; i1++)
813 for (i2 = 33; i2 < 127; i2++) {
814 i = scanf("%x", &j);
815 if (i == EOF)
816 goto read_done;
817 if (i != 1)
818 exit(1);
819 if (j < 0 || j == 0xffff)
820 j = 0xfffd;
821 if (j != 0xfffd) {
822 if (enc->byte_row(i1) < 0 || enc->byte_col(i2) < 0) {
823 fprintf(stderr, "lost entry at %02x %02x\n", i1, i2);
824 exit (1);
825 }
826 enc->charset2uni[enc->byte_row(i1)][enc->byte_col(i2)] = j;
827 }
828 }
829 read_done: ;
830 }
831 }
832
do_ksc5601(const char * name)833 static void do_ksc5601 (const char* name)
834 {
835 Encoding enc;
836
837 enc.rows = 94;
838 enc.cols = 94;
839 enc.row_byte = row_byte_normal;
840 enc.col_byte = col_byte_normal;
841 enc.byte_row = byte_row_normal;
842 enc.byte_col = byte_col_normal;
843 enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
844 enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
845 enc.byte_row_expr = "%1$s - 0x21";
846 enc.byte_col_expr = "%1$s - 0x21";
847
848 read_table_ksc5601(&enc);
849 output_charset2uni(name,&enc);
850 invert(&enc); output_uni2charset_sparse(name,&enc);
851 }
852
853 /* Big5 specifics */
854
/* Returns the 1st byte value of a row; row 0 is byte 0xa1. */
static int row_byte_big5 (int row) {
  const int first_byte = 0xa1;

  return first_byte + row;
}
/*
 * Returns the 2nd byte value of a column.  Columns 0..0x3e are the bytes
 * 0x40..0x7e; from column 0x3f on the bytes continue at 0xa1.
 */
static int col_byte_big5 (int col) {
  if (col >= 0x3f)
    return col + 0x62;
  return col + 0x40;
}
/* Converts a 1st byte value in 0xa1..0xfe to a row, else returns -1. */
static int byte_row_big5 (int byte) {
  if (byte < 0xa1 || byte >= 0xff)
    return -1;
  return byte - 0xa1;
}
/*
 * Converts a 2nd byte value to a column: 0x40..0x7e become 0..0x3e and
 * 0xa1..0xfe become 0x3f..0x9c.  Returns -1 for any other value.
 */
static int byte_col_big5 (int byte) {
  int in_low_range = (byte >= 0x40 && byte < 0x7f);
  int in_high_range = (byte >= 0xa1 && byte < 0xff);

  if (in_low_range)
    return byte - 0x40;
  if (in_high_range)
    return byte - 0x62;
  return -1;
}
875
do_big5(const char * name)876 static void do_big5 (const char* name)
877 {
878 Encoding enc;
879
880 enc.rows = 94;
881 enc.cols = 157;
882 enc.row_byte = row_byte_big5;
883 enc.col_byte = col_byte_big5;
884 enc.byte_row = byte_row_big5;
885 enc.byte_col = byte_col_big5;
886 enc.check_row_expr = "%1$s >= 0xa1 && %1$s < 0xff";
887 enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0xa1 && %1$s < 0xff)";
888 enc.byte_row_expr = "%1$s - 0xa1";
889 enc.byte_col_expr = "%1$s - (%1$s >= 0xa1 ? 0x62 : 0x40)";
890
891 read_table(&enc);
892 output_charset2uni(name,&enc);
893 invert(&enc); output_uni2charset_sparse(name,&enc);
894 }
895
896 /* Big5-HKSCS specifics */
897
/* Returns the 1st byte value of a row; row 0 is byte 0x81. */
static int row_byte_big5hkscs (int row) {
  const int first_byte = 0x81;

  return first_byte + row;
}
/*
 * Returns the 2nd byte value of a column.  Columns 0..0x3e are the bytes
 * 0x40..0x7e; from column 0x3f on the bytes continue at 0xa1.
 */
static int col_byte_big5hkscs (int col) {
  if (col >= 0x3f)
    return col + 0x62;
  return col + 0x40;
}
/* Converts a 1st byte value in 0x81..0xfe to a row, else returns -1. */
static int byte_row_big5hkscs (int byte) {
  if (byte < 0x81 || byte >= 0xff)
    return -1;
  return byte - 0x81;
}
/*
 * Converts a 2nd byte value to a column: 0x40..0x7e become 0..0x3e and
 * 0xa1..0xfe become 0x3f..0x9c.  Returns -1 for any other value.
 */
static int byte_col_big5hkscs (int byte) {
  int in_low_range = (byte >= 0x40 && byte < 0x7f);
  int in_high_range = (byte >= 0xa1 && byte < 0xff);

  if (in_low_range)
    return byte - 0x40;
  if (in_high_range)
    return byte - 0x62;
  return -1;
}
918
do_big5hkscs(const char * name)919 static void do_big5hkscs (const char* name)
920 {
921 Encoding enc;
922
923 enc.rows = 126;
924 enc.cols = 157;
925 enc.row_byte = row_byte_big5hkscs;
926 enc.col_byte = col_byte_big5hkscs;
927 enc.byte_row = byte_row_big5hkscs;
928 enc.byte_col = byte_col_big5hkscs;
929 enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
930 enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0xa1 && %1$s < 0xff)";
931 enc.byte_row_expr = "%1$s - 0x81";
932 enc.byte_col_expr = "%1$s - (%1$s >= 0xa1 ? 0x62 : 0x40)";
933
934 read_table(&enc);
935 output_charset2uni(name,&enc);
936 invert(&enc); output_uni2charset_sparse(name,&enc);
937 }
938
939 /* Johab Hangul specifics */
940
/* Returns the 1st byte value of a row; row 0 is byte 0x84. */
static int row_byte_johab_hangul (int row) {
  const int first_byte = 0x84;

  return first_byte + row;
}
/*
 * Returns the 2nd byte value of a column.  Columns 0..0x3d are the bytes
 * 0x41..0x7e; from column 0x3e on the bytes continue at 0x81.
 */
static int col_byte_johab_hangul (int col) {
  if (col >= 0x3e)
    return col + 0x43;
  return col + 0x41;
}
/* Converts a 1st byte value in 0x84..0xd3 to a row, else returns -1. */
static int byte_row_johab_hangul (int byte) {
  if (byte < 0x84 || byte >= 0xd4)
    return -1;
  return byte - 0x84;
}
/*
 * Converts a 2nd byte value to a column: 0x41..0x7e become 0..0x3d and
 * 0x81..0xfe become 0x3e..0xbb.  Returns -1 for any other value.
 */
static int byte_col_johab_hangul (int byte) {
  int in_low_range = (byte >= 0x41 && byte < 0x7f);
  int in_high_range = (byte >= 0x81 && byte < 0xff);

  if (in_low_range)
    return byte - 0x41;
  if (in_high_range)
    return byte - 0x43;
  return -1;
}
961
do_johab_hangul(const char * name)962 static void do_johab_hangul (const char* name)
963 {
964 Encoding enc;
965
966 enc.rows = 80;
967 enc.cols = 188;
968 enc.row_byte = row_byte_johab_hangul;
969 enc.col_byte = col_byte_johab_hangul;
970 enc.byte_row = byte_row_johab_hangul;
971 enc.byte_col = byte_col_johab_hangul;
972 enc.check_row_expr = "%1$s >= 0x84 && %1$s < 0xd4";
973 enc.check_col_expr = "(%1$s >= 0x41 && %1$s < 0x7f) || (%1$s >= 0x81 && %1$s < 0xff)";
974 enc.byte_row_expr = "%1$s - 0x84";
975 enc.byte_col_expr = "%1$s - (%1$s >= 0x81 ? 0x43 : 0x41)";
976
977 read_table(&enc);
978 output_charset2uni(name,&enc);
979 invert(&enc); output_uni2charset_dense(name,&enc);
980 }
981
982 /* SJIS specifics */
983
/*
 * Returns the 1st byte value of a row.  Rows 0..0x1e are the bytes
 * 0x81..0x9f; from row 0x1f on the bytes continue at 0xe0.
 */
static int row_byte_sjis (int row) {
  if (row >= 0x1f)
    return row + 0xc1;
  return row + 0x81;
}
/*
 * Returns the 2nd byte value of a column.  Columns 0..0x3e are the bytes
 * 0x40..0x7e; from column 0x3f on the bytes continue at 0x80, because
 * 0x7f is skipped.
 */
static int col_byte_sjis (int col) {
  if (col >= 0x3f)
    return col + 0x41;
  return col + 0x40;
}
/*
 * Converts a 1st byte value to a row: 0x81..0x9f become 0..0x1e and
 * 0xe0..0xff become 0x1f..0x3e.  Returns -1 for any other value; in
 * particular a value above 0xff is not a byte and is rejected instead of
 * being turned into a row.
 */
static int byte_row_sjis (int byte) {
  if (byte >= 0x81 && byte < 0xa0)
    return byte-0x81;
  else if (byte >= 0xe0 && byte < 0x100)
    return byte-0xc1;
  else
    return -1;
}
/*
 * Converts a 2nd byte value to a column: 0x40..0x7e become 0..0x3e and
 * 0x80..0xfc become 0x3f..0xbb.  Returns -1 for any other value.
 */
static int byte_col_sjis (int byte) {
  int in_low_range = (byte >= 0x40 && byte < 0x7f);
  int in_high_range = (byte >= 0x80 && byte < 0xfd);

  if (in_low_range)
    return byte - 0x40;
  if (in_high_range)
    return byte - 0x41;
  return -1;
}
1006
do_sjis(const char * name)1007 static void do_sjis (const char* name)
1008 {
1009 Encoding enc;
1010
1011 enc.rows = 94;
1012 enc.cols = 188;
1013 enc.row_byte = row_byte_sjis;
1014 enc.col_byte = col_byte_sjis;
1015 enc.byte_row = byte_row_sjis;
1016 enc.byte_col = byte_col_sjis;
1017 enc.check_row_expr = "(%1$s >= 0x81 && %1$s < 0xa0) || (%1$s >= 0xe0)";
1018 enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xfd)";
1019 enc.byte_row_expr = "%1$s - (%1$s >= 0xe0 ? 0xc1 : 0x81)";
1020 enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
1021
1022 read_table(&enc);
1023 output_charset2uni(name,&enc);
1024 invert(&enc); output_uni2charset_sparse(name,&enc);
1025 }
1026
1027 /* Main program */
1028
main(int argc,char * argv[])1029 int main (int argc, char *argv[])
1030 {
1031 const char* charsetname;
1032 const char* name;
1033
1034 if (argc != 3)
1035 exit(1);
1036 charsetname = argv[1];
1037 name = argv[2];
1038
1039 output_title(charsetname);
1040
1041 if (!strcmp(name,"gb2312") || !strcmp(name,"gb12345ext")
1042 || !strcmp(name,"jisx0208") || !strcmp(name,"jisx0212"))
1043 do_normal(name);
1044 else if (!strcmp(name,"cns11643_1") || !strcmp(name,"cns11643_2")
1045 || !strcmp(name,"cns11643_3"))
1046 do_normal_only_charset2uni(name);
1047 else if (!strcmp(name,"cns11643_inv"))
1048 do_cns11643_only_uni2charset(name);
1049 else if (!strcmp(name,"gbkext1"))
1050 do_gbk1_only_charset2uni(name);
1051 else if (!strcmp(name,"gbkext2"))
1052 do_gbk2_only_charset2uni(name);
1053 else if (!strcmp(name,"gbkext_inv"))
1054 do_gbk1_only_uni2charset(name);
1055 else if (!strcmp(name,"cp936ext"))
1056 do_gbk1(name);
1057 else if (!strcmp(name,"ksc5601"))
1058 do_ksc5601(name);
1059 else if (!strcmp(name,"big5") || !strcmp(name,"cp950ext"))
1060 do_big5(name);
1061 else if (!strcmp(name,"big5hkscs"))
1062 do_big5hkscs(name);
1063 else if (!strcmp(name,"johab_hangul"))
1064 do_johab_hangul(name);
1065 else if (!strcmp(name,"cp932ext"))
1066 do_sjis(name);
1067 else
1068 exit(1);
1069
1070 return 0;
1071 }
1072