1 /* $XFree86: xc/lib/X11/lcUniConv/cjk_tab_to_h.c,v 1.2 2000/12/04 18:49:31 dawes Exp $ */
2
3 /*
4 * Generates a CJK character set table from a .TXT table as found on
5 * ftp.unicode.org or in the X nls directory.
6 * Examples:
7 *
8 * ./cjk_tab_to_h GB2312.1980-0 gb2312 > gb2312.h < gb2312
9 * ./cjk_tab_to_h JISX0208.1983-0 jisx0208 > jisx0208.h < jis0208
10 * ./cjk_tab_to_h KSC5601.1987-0 ksc5601 > ksc5601.h < ksc5601
11 *
12 * ./cjk_tab_to_h GB2312.1980-0 gb2312 > gb2312.h < GB2312.TXT
13 * ./cjk_tab_to_h JISX0208.1983-0 jisx0208 > jisx0208.h < JIS0208.TXT
14 * ./cjk_tab_to_h JISX0212.1990-0 jisx0212 > jisx0212.h < JIS0212.TXT
15 * ./cjk_tab_to_h KSC5601.1987-0 ksc5601 > ksc5601.h < KSC5601.TXT
16 * ./cjk_tab_to_h KSX1001.1992-0 ksc5601 > ksc5601.h < KSX1001.TXT
17 *
18 * ./cjk_tab_to_h BIG5 big5 > big5.h < BIG5.TXT
19 *
20 * ./cjk_tab_to_h JOHAB johab > johab.h < JOHAB.TXT
21 */
22
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <stdbool.h>
26 #include <string.h>
27
/*
 * A half-open range [start, end) of table indices.
 * Used both for runs of charset cells (index = row * cols + col) and for
 * runs of 16-code-point groups of the Unicode range.
 */
typedef struct {
  int start; /* first index belonging to the block */
  int end;   /* one past the last index belonging to the block */
} Block;
32
/*
 * Description of one two-byte character set plus the tables computed for it.
 * The first ten members are filled in by the do_*() functions; the remaining
 * ones are computed by read_table(), find_charset2uni_pages(),
 * find_charset2uni_blocks() and invert().
 */
typedef struct {
  int rows;                     /* number of possible values for the 1st byte */
  int cols;                     /* number of possible values for the 2nd byte */
  int (*row_byte) (int row);    /* returns the 1st byte value for a given row */
  int (*col_byte) (int col);    /* returns the 2nd byte value for a given col */
  int (*byte_row) (int byte);   /* converts a 1st byte value to a row, else -1 */
  int (*byte_col) (int byte);   /* converts a 2nd byte value to a col, else -1 */
  /* The four *_expr members are printf formats that take one string argument
     (the name of a C variable, referenced as %1$s) and expand to a C
     expression in the generated code. */
  const char* check_row_expr;   /* format string for 1st byte value checking */
  const char* check_col_expr;   /* format string for 2nd byte value checking */
  const char* byte_row_expr;    /* format string for 1st byte value to row */
  const char* byte_col_expr;    /* format string for 2nd byte value to col */
  int** charset2uni;            /* charset2uni[0..rows-1][0..cols-1] is valid;
                                   0xfffd marks a cell without a mapping */
  /* You'll understand the terms "row" and "col" when you buy Ken Lunde's book.
     Once a row is fixed, choosing a "col" is the same as choosing a "cell". */
  int* charsetpage;             /* charsetpage[0..rows]: how large is a page for
                                   a row, i.e. 1 + index of the last mapped col
                                   (0 if the row is empty); charsetpage[rows]
                                   is always 0 */
  int ncharsetblocks;           /* number of entries in charsetblocks */
  Block* charsetblocks;         /* charsetblocks[0..ncharsetblocks-1] */
  int* uni2charset;             /* uni2charset[0x0000..0xffff]; 0 means that the
                                   Unicode value has no mapping */
} Encoding;
52
/*
 * Writes the banner that opens the generated file: a blank line, a C
 * comment containing the charset name, and another blank line.
 */
static void output_title (const char *charsetname)
{
  printf("\n"
         "/*\n"
         " * %s\n"
         " */\n"
         "\n",
         charsetname);
}
64
65 /*
66 * Reads the charset2uni table from standard input.
67 */
read_table(Encoding * enc)68 static void read_table (Encoding* enc)
69 {
70 int row, col, i, i1, i2, c, j;
71
72 enc->charset2uni = (int**) malloc(enc->rows*sizeof(int*));
73 for (row = 0; row < enc->rows; row++)
74 enc->charset2uni[row] = (int*) malloc(enc->cols*sizeof(int));
75
76 for (row = 0; row < enc->rows; row++)
77 for (col = 0; col < enc->cols; col++)
78 enc->charset2uni[row][col] = 0xfffd;
79
80 c = getc(stdin);
81 ungetc(c,stdin);
82 if (c == '#') {
83 /* Read a unicode.org style .TXT file. */
84 for (;;) {
85 c = getc(stdin);
86 if (c == EOF)
87 break;
88 if (c == '\n' || c == ' ' || c == '\t')
89 continue;
90 if (c == '#') {
91 do { c = getc(stdin); } while (!(c == EOF || c == '\n'));
92 continue;
93 }
94 ungetc(c,stdin);
95 if (scanf("0x%x", &j) != 1)
96 exit(1);
97 i1 = j >> 8;
98 i2 = j & 0xff;
99 row = enc->byte_row(i1);
100 col = enc->byte_col(i2);
101 if (row < 0 || col < 0) {
102 fprintf(stderr, "lost entry for %02x %02x\n", i1, i2);
103 exit(1);
104 }
105 if (scanf(" 0x%x", &enc->charset2uni[row][col]) != 1)
106 exit(1);
107 }
108 } else {
109 /* Read a table of hexadecimal Unicode values. */
110 for (i1 = 32; i1 < 132; i1++)
111 for (i2 = 32; i2 < 132; i2++) {
112 i = scanf("%x", &j);
113 if (i == EOF)
114 goto read_done;
115 if (i != 1)
116 exit(1);
117 if (j < 0 || j == 0xffff)
118 j = 0xfffd;
119 if (j != 0xfffd) {
120 if (enc->byte_row(i1) < 0 || enc->byte_col(i2) < 0) {
121 fprintf(stderr, "lost entry at %02x %02x\n", i1, i2);
122 exit (1);
123 }
124 enc->charset2uni[enc->byte_row(i1)][enc->byte_col(i2)] = j;
125 }
126 }
127 read_done: ;
128 }
129 }
130
131 /*
132 * Computes the charsetpage[0..rows] array.
133 */
find_charset2uni_pages(Encoding * enc)134 static void find_charset2uni_pages (Encoding* enc)
135 {
136 int row, col;
137
138 enc->charsetpage = (int*) malloc((enc->rows+1)*sizeof(int));
139
140 for (row = 0; row <= enc->rows; row++)
141 enc->charsetpage[row] = 0;
142
143 for (row = 0; row < enc->rows; row++) {
144 int used = 0;
145 for (col = 0; col < enc->cols; col++)
146 if (enc->charset2uni[row][col] != 0xfffd)
147 used = col+1;
148 enc->charsetpage[row] = used;
149 }
150 }
151
152 /*
153 * Fills in nblocks and blocks.
154 */
find_charset2uni_blocks(Encoding * enc)155 static void find_charset2uni_blocks (Encoding* enc)
156 {
157 int n, row, lastrow;
158
159 enc->charsetblocks = (Block*) malloc(enc->rows*sizeof(Block));
160
161 n = 0;
162 for (row = 0; row < enc->rows; row++)
163 if (enc->charsetpage[row] > 0 && (row == 0 || enc->charsetpage[row-1] == 0)) {
164 for (lastrow = row; enc->charsetpage[lastrow+1] > 0; lastrow++);
165 enc->charsetblocks[n].start = row * enc->cols;
166 enc->charsetblocks[n].end = lastrow * enc->cols + enc->charsetpage[lastrow];
167 n++;
168 }
169 enc->ncharsetblocks = n;
170 }
171
/*
 * Outputs the charset to unicode table and function.
 *
 * name is the prefix of every generated identifier: the <name>_2uni_pageXX
 * arrays and the <name>_mbtowc function. enc->charset2uni must already be
 * filled in; enc->charsetpage, enc->charsetblocks and enc->ncharsetblocks are
 * computed here. All output goes to standard output.
 */
static void output_charset2uni (const char* name, Encoding* enc)
{
  int row, col, lastrow, col_max, i, i1_min, i1_max;

  find_charset2uni_pages(enc);

  find_charset2uni_blocks(enc);

  /* Emit one array per block of consecutive non-empty rows. */
  for (row = 0; row < enc->rows; row++)
    if (enc->charsetpage[row] > 0) {
      if (row == 0 || enc->charsetpage[row-1] == 0) {
        /* Start a new block. */
        /* Find the block's last row; charsetpage[rows] is 0 and ends the scan. */
        for (lastrow = row; enc->charsetpage[lastrow+1] > 0; lastrow++);
        /* All rows but the last contribute cols cells, the last one only
           its used prefix. */
        printf("static const unsigned short %s_2uni_page%02x[%d] = {\n",
               name, enc->row_byte(row),
               (lastrow-row) * enc->cols + enc->charsetpage[lastrow]);
      }
      /* The comment delimiters are split across adjacent string literals. */
      printf(" /""* 0x%02x *""/\n ", enc->row_byte(row));
      /* A row followed by another non-empty row is emitted in full. */
      col_max = (enc->charsetpage[row+1] > 0 ? enc->cols : enc->charsetpage[row]);
      for (col = 0; col < col_max; col++) {
        printf(" 0x%04x,", enc->charset2uni[row][col]);
        /* Break the line after every 8th value, except after the last one. */
        if ((col % 8) == 7 && (col+1 < col_max)) printf("\n ");
      }
      printf("\n");
      if (enc->charsetpage[row+1] == 0) {
        /* End a block. */
        printf("};\n");
      }
    }
  printf("\n");

  /* Emit the conversion function. */
  printf("static int\n");
  printf("%s_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)\n", name);
  printf("{\n");
  printf(" unsigned char c1 = s[0];\n");
  printf(" if (");
  /* The 1st byte must fall into the row range of one of the blocks. */
  for (i = 0; i < enc->ncharsetblocks; i++) {
    i1_min = enc->row_byte(enc->charsetblocks[i].start / enc->cols);
    i1_max = enc->row_byte((enc->charsetblocks[i].end-1) / enc->cols);
    if (i > 0)
      printf(" || ");
    if (i1_min == i1_max)
      printf("(c1 == 0x%02x)", i1_min);
    else
      printf("(c1 >= 0x%02x && c1 <= 0x%02x)", i1_min, i1_max);
  }
  printf(") {\n");
  printf(" if (n >= 2) {\n");
  printf(" unsigned char c2 = s[1];\n");
  printf(" if (");
  /* The *_expr members are used as printf formats; their single argument
     is the name of the generated variable to test or convert. */
  printf(enc->check_col_expr, "c2");
  printf(") {\n");
  /* Linear cell index: cols * row + col. */
  printf(" unsigned int i = %d * (", enc->cols);
  printf(enc->byte_row_expr, "c1");
  printf(") + (");
  printf(enc->byte_col_expr, "c2");
  printf(");\n");
  printf(" unsigned short wc = 0xfffd;\n");
  /* One branch per block: every block but the last is selected by
     "i < start of the next block"; inside, "i < end" guards the lookup. */
  for (i = 0; i < enc->ncharsetblocks; i++) {
    printf(" ");
    if (i > 0)
      printf("} else ");
    if (i < enc->ncharsetblocks-1)
      printf("if (i < %d) ", enc->charsetblocks[i+1].start);
    printf("{\n");
    printf(" if (i < %d)\n", enc->charsetblocks[i].end);
    printf(" wc = %s_2uni_page%02x[i", name, enc->row_byte(enc->charsetblocks[i].start / enc->cols));
    /* Rebase the index to the start of the block's array. */
    if (enc->charsetblocks[i].start > 0)
      printf("-%d", enc->charsetblocks[i].start);
    printf("];\n");
  }
  printf(" }\n");
  printf(" if (wc != 0xfffd) {\n");
  printf(" *pwc = (ucs4_t) wc;\n");
  printf(" return 2;\n");
  printf(" }\n");
  printf(" }\n");
  printf(" return RET_ILSEQ;\n");
  printf(" }\n");
  printf(" return RET_TOOFEW(0);\n");
  printf(" }\n");
  printf(" return RET_ILSEQ;\n");
  printf("}\n");
  printf("\n");
}
260
261 /*
262 * Computes the uni2charset[0x0000..0xffff] array.
263 */
invert(Encoding * enc)264 static void invert (Encoding* enc)
265 {
266 int row, col, j;
267
268 enc->uni2charset = (int*) malloc(0x10000*sizeof(int));
269
270 for (j = 0; j < 0x10000; j++)
271 enc->uni2charset[j] = 0;
272
273 for (row = 0; row < enc->rows; row++)
274 for (col = 0; col < enc->cols; col++) {
275 j = enc->charset2uni[row][col];
276 if (j != 0xfffd)
277 enc->uni2charset[j] = 0x100 * enc->row_byte(row) + enc->col_byte(col);
278 }
279 }
280
/*
 * Outputs the unicode to charset table and function, using a linear array.
 * (Suitable if the table is dense.)
 *
 * name is the prefix of the generated <name>_pageXX arrays and of the
 * <name>_wctomb function. Reads enc->charset2uni and enc->uni2charset, so
 * invert() must have been called before. Output goes to standard output.
 *
 * The Unicode range 0x0000..0xffff is handled in "lines" of 8 consecutive
 * code points (0x2000 lines). Neighbouring non-empty lines are merged into
 * "tables"; each table becomes either an array (more than one mapped value)
 * or a single comparison (exactly one mapped value) in the generated function.
 */
static void output_uni2charset_dense (const char* name, Encoding* enc)
{
  /* Like in 8bit_tab_to_h.c */
  bool pages[0x100];   /* pages[p]: some cell maps into Unicode page p */
  int line[0x2000];    /* per line: -1 if empty, else index of its table */
  int tableno;         /* number of tables found so far */
  struct { int minline; int maxline; int usecount; } tables[0x2000];
  bool first;
  int row, col, j, p, j1, j2, t;

  /* Note: pages[] is filled here but not read again in this function. */
  for (p = 0; p < 0x100; p++)
    pages[p] = false;
  for (row = 0; row < enc->rows; row++)
    for (col = 0; col < enc->cols; col++) {
      j = enc->charset2uni[row][col];
      if (j != 0xfffd)
        pages[j>>8] = true;
    }
  /* Mark every line as empty (-1) or used (0, table index assigned below). */
  for (j1 = 0; j1 < 0x2000; j1++) {
    bool all_invalid = true;
    for (j2 = 0; j2 < 8; j2++) {
      j = 8*j1+j2;
      if (enc->uni2charset[j] != 0)
        all_invalid = false;
    }
    if (all_invalid)
      line[j1] = -1;
    else
      line[j1] = 0;
  }
  /* Group used lines into tables. A used line joins the most recent table
     if the line directly before it belongs to that table, or if it lies in
     the same group of 0x20 lines (256 code points) and at most 8 lines
     after the table's last line; otherwise it opens a new table. */
  tableno = 0;
  for (j1 = 0; j1 < 0x2000; j1++) {
    if (line[j1] >= 0) {
      if (tableno > 0
          && ((j1 > 0 && line[j1-1] == tableno-1)
              || ((tables[tableno-1].maxline >> 5) == (j1 >> 5)
                  && j1 - tables[tableno-1].maxline <= 8))) {
        line[j1] = tableno-1;
        tables[tableno-1].maxline = j1;
      } else {
        tableno++;
        line[j1] = tableno-1;
        tables[tableno-1].minline = tables[tableno-1].maxline = j1;
      }
    }
  }
  /* Count the mapped code points covered by each table. */
  for (t = 0; t < tableno; t++) {
    tables[t].usecount = 0;
    j1 = 8*tables[t].minline;
    j2 = 8*(tables[t].maxline+1);
    for (j = j1; j < j2; j++)
      if (enc->uni2charset[j] != 0)
        tables[t].usecount++;
  }
  /* Emit an array for every table that holds more than one mapping. */
  {
    p = -1;
    for (t = 0; t < tableno; t++)
      if (tables[t].usecount > 1) {
        /* The array is named after the 256-code-point page of its first line. */
        p = tables[t].minline >> 5;
        printf("static const unsigned short %s_page%02x[%d] = {\n", name, p, 8*(tables[t].maxline-tables[t].minline+1));
        for (j1 = tables[t].minline; j1 <= tables[t].maxline; j1++) {
          /* Mark the start of each further 256-code-point page. */
          if ((j1 % 0x20) == 0 && j1 > tables[t].minline)
            printf(" /* 0x%04x */\n", 8*j1);
          printf(" ");
          for (j2 = 0; j2 < 8; j2++) {
            j = 8*j1+j2;
            printf(" 0x%04x,", enc->uni2charset[j]);
          }
          printf(" /*0x%02x-0x%02x*/\n", 8*(j1 % 0x20), 8*(j1 % 0x20)+7);
        }
        printf("};\n");
      }
    /* p >= 0 means that at least one array was emitted. */
    if (p >= 0)
      printf("\n");
  }
  /* Emit the conversion function: one "if" branch per table. */
  printf("static int\n%s_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)\n", name);
  printf("{\n");
  printf(" if (n >= 2) {\n");
  printf(" unsigned short c = 0;\n");
  first = true;
  for (j1 = 0; j1 < 0x2000;) {
    t = line[j1];
    /* Advance j2 past the run of lines that carry the same marker. */
    for (j2 = j1; j2 < 0x2000 && line[j2] == t; j2++);
    if (t >= 0) {
      /* Consistency checks: the run must begin at the table's first line
         and must not extend past its last line. */
      if (j1 != tables[t].minline) abort();
      if (j2 > tables[t].maxline+1) abort();
      /* Skip to the end of the table, across any empty lines inside it. */
      j2 = tables[t].maxline+1;
      if (first)
        printf(" ");
      else
        printf(" else ");
      first = false;
      if (tables[t].usecount == 0) abort();
      if (tables[t].usecount == 1) {
        /* A single mapping: compare against the one code point directly. */
        if (j2 != j1+1) abort();
        for (j = 8*j1; j < 8*j2; j++)
          if (enc->uni2charset[j] != 0) {
            printf("if (wc == 0x%04x)\n c = 0x%02x;\n", j, enc->uni2charset[j]);
            break;
          }
      } else {
        /* Several mappings: range check, then index into the table's array. */
        if (j1 == 0) {
          printf("if (wc < 0x%04x)", 8*j2);
        } else {
          printf("if (wc >= 0x%04x && wc < 0x%04x)", 8*j1, 8*j2);
        }
        printf("\n c = %s_page%02x[wc", name, j1 >> 5);
        if (tables[t].minline > 0)
          printf("-0x%04x", 8*j1);
        printf("];\n");
      }
    }
    j1 = j2;
  }
  printf(" if (c != 0) {\n");
  printf(" r[0] = (c >> 8); r[1] = (c & 0xff);\n");
  printf(" return 2;\n");
  printf(" }\n");
  printf(" return RET_ILSEQ;\n");
  printf(" }\n");
  printf(" return RET_TOOSMALL;\n");
  printf("}\n");
}
408
/*
 * Outputs the unicode to charset table and function, using a packed array.
 * (Suitable if the table is sparse.)
 *
 * name is the prefix of the generated <name>_2charset array, of the
 * <name>_uni2indx_pageXX summary arrays and of the <name>_wctomb function.
 * Reads enc->charset2uni and enc->uni2charset, so invert() must have been
 * called before. Output goes to standard output.
 *
 * The Unicode range is handled in groups of 16 consecutive code points
 * (0x1000 groups). All mapped values are packed into one array; each group
 * has a summary entry holding the packed index of its first value and a
 * 16-bit mask of the code points that are mapped.
 */
static void output_uni2charset_sparse (const char* name, Encoding* enc)
{
  bool pages[0x100];              /* pages[p]: some cell maps into page p */
  Block pageblocks[0x100]; int npageblocks; /* runs of used pages, counted in groups */
  int indx2charset[0x10000];      /* the packed charset values */
  int summary_indx[0x1000];       /* per group: packed index of its first value */
  int summary_used[0x1000];       /* per group: bit j2 set if code point j2 is mapped */
  int i, row, col, j, p, j1, j2, indx;

  /* Fill pages[0x100]. */
  for (p = 0; p < 0x100; p++)
    pages[p] = false;
  for (row = 0; row < enc->rows; row++)
    for (col = 0; col < enc->cols; col++) {
      j = enc->charset2uni[row][col];
      if (j != 0xfffd)
        pages[j>>8] = true;
    }

#if 0
  for (p = 0; p < 0x100; p++)
    if (pages[p]) {
      printf("static const unsigned short %s_page%02x[256] = {\n", name, p);
      for (j1 = 0; j1 < 32; j1++) {
        printf(" ");
        for (j2 = 0; j2 < 8; j2++)
          printf("0x%04x, ", enc->uni2charset[256*p+8*j1+j2]);
        printf("/""*0x%02x-0x%02x*""/\n", 8*j1, 8*j1+7);
      }
      printf("};\n");
    }
  printf("\n");
#endif

  /* Fill summary_indx[] and summary_used[]. */
  indx = 0;
  for (j1 = 0; j1 < 0x1000; j1++) {
    summary_indx[j1] = indx;
    summary_used[j1] = 0;
    for (j2 = 0; j2 < 16; j2++) {
      j = 16*j1+j2;
      if (enc->uni2charset[j] != 0) {
        indx2charset[indx++] = enc->uni2charset[j];
        summary_used[j1] |= (1 << j2);
      }
    }
  }

  /* Fill npageblocks and pageblocks[]. */
  /* A page block is a maximal run of used pages; start and end are counted
     in groups (16 groups per page). */
  npageblocks = 0;
  for (p = 0; p < 0x100; ) {
    if (pages[p] && (p == 0 || !pages[p-1])) {
      pageblocks[npageblocks].start = 16*p;
      do p++; while (p < 0x100 && pages[p]);
      j1 = 16*p;
      /* Trim trailing groups that contain no mapping. */
      while (summary_used[j1-1] == 0) j1--;
      pageblocks[npageblocks].end = j1;
      npageblocks++;
    } else
      p++;
  }

  /* Emit the packed array, 8 values per line. */
  printf("static const unsigned short %s_2charset[%d] = {\n", name, indx);
  for (i = 0; i < indx; ) {
    if ((i % 8) == 0) printf(" ");
    printf(" 0x%04x,", indx2charset[i]);
    i++;
    if ((i % 8) == 0 || i == indx) printf("\n");
  }
  printf("};\n");
  printf("\n");
  /* Emit one summary array per page block, 4 entries per line. */
  for (i = 0; i < npageblocks; i++) {
    printf("static const Summary16 %s_uni2indx_page%02x[%d] = {\n", name,
           pageblocks[i].start/16, pageblocks[i].end-pageblocks[i].start);
    for (j1 = pageblocks[i].start; j1 < pageblocks[i].end; ) {
      /* Label the start of every 256-code-point page. */
      if (((16*j1) % 0x100) == 0) printf(" /""* 0x%04x *""/\n", 16*j1);
      if ((j1 % 4) == 0) printf(" ");
      printf(" { %4d, 0x%04x },", summary_indx[j1], summary_used[j1]);
      j1++;
      if ((j1 % 4) == 0 || j1 == pageblocks[i].end) printf("\n");
    }
    printf("};\n");
  }
  printf("\n");

  /* Emit the conversion function. */
  printf("static int\n");
  printf("%s_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)\n", name);
  printf("{\n");
  printf(" if (n >= 2) {\n");
  printf(" const Summary16 *summary = NULL;\n");
  /* One range test per page block selects the summary entry for wc. */
  for (i = 0; i < npageblocks; i++) {
    printf(" ");
    if (i > 0)
      printf("else ");
    printf("if (wc >= 0x%04x && wc < 0x%04x)\n",
           16*pageblocks[i].start, 16*pageblocks[i].end);
    printf(" summary = &%s_uni2indx_page%02x[(wc>>4)", name,
           pageblocks[i].start/16);
    /* Rebase the group number to the start of the block's array. */
    if (pageblocks[i].start > 0)
      printf("-0x%03x", pageblocks[i].start);
    printf("];\n");
  }
  printf(" if (summary) {\n");
  printf(" unsigned short used = summary->used;\n");
  printf(" unsigned int i = wc & 0x0f;\n");
  printf(" if (used & ((unsigned short) 1 << i)) {\n");
  printf(" unsigned short c;\n");
  printf(" /* Keep in `used' only the bits 0..i-1. */\n");
  printf(" used &= ((unsigned short) 1 << i) - 1;\n");
  printf(" /* Add `summary->indx' and the number of bits set in `used'. */\n");
  /* The next four generated lines count the bits set in `used'. */
  printf(" used = (used & 0x5555) + ((used & 0xaaaa) >> 1);\n");
  printf(" used = (used & 0x3333) + ((used & 0xcccc) >> 2);\n");
  printf(" used = (used & 0x0f0f) + ((used & 0xf0f0) >> 4);\n");
  printf(" used = (used & 0x00ff) + (used >> 8);\n");
  printf(" c = %s_2charset[summary->indx + used];\n", name);
  printf(" r[0] = (c >> 8); r[1] = (c & 0xff);\n");
  printf(" return 2;\n");
  printf(" }\n");
  printf(" }\n");
  printf(" return RET_ILSEQ;\n");
  printf(" }\n");
  printf(" return RET_TOOSMALL;\n");
  printf("}\n");
}
537
538 /* ISO-2022/EUC specifics */
539
/* Returns the 1st byte value of a row; row 0 corresponds to byte 0x21. */
static int row_byte_normal (int row)
{
  const int first_byte = 0x21;

  return row + first_byte;
}
/* Returns the 2nd byte value of a column; column 0 corresponds to byte 0x21. */
static int col_byte_normal (int col)
{
  const int first_byte = 0x21;

  return col + first_byte;
}
/*
 * Converts a 1st byte value to a row: 0x21..0x7e map to 0..93.
 * Returns -1 for any other value, so that callers which only test for a
 * negative result never index past the 94 rows of the table.
 */
static int byte_row_normal (int byte)
{
  if (byte >= 0x21 && byte < 0x7f)
    return byte-0x21;
  else
    return -1;
}
/*
 * Converts a 2nd byte value to a column: 0x21..0x7e map to 0..93.
 * Returns -1 for any other value, so that callers which only test for a
 * negative result never index past the 94 columns of the table.
 */
static int byte_col_normal (int byte)
{
  if (byte >= 0x21 && byte < 0x7f)
    return byte-0x21;
  else
    return -1;
}
544
do_normal(const char * name)545 static void do_normal (const char* name)
546 {
547 Encoding enc;
548
549 enc.rows = 94;
550 enc.cols = 94;
551 enc.row_byte = row_byte_normal;
552 enc.col_byte = col_byte_normal;
553 enc.byte_row = byte_row_normal;
554 enc.byte_col = byte_col_normal;
555 enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
556 enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
557 enc.byte_row_expr = "%1$s - 0x21";
558 enc.byte_col_expr = "%1$s - 0x21";
559
560 read_table(&enc);
561 output_charset2uni(name,&enc);
562 invert(&enc); output_uni2charset_sparse(name,&enc);
563 }
564
565 /* Note: On first sight, the jisx0212_2charset[] table seems to be in order,
566 starting from the charset=0x3021/uni=0x4e02 pair. But it's only mostly in
567 order. There are 75 out-of-order values, scattered all throughout the table.
568 */
569
do_normal_only_charset2uni(const char * name)570 static void do_normal_only_charset2uni (const char* name)
571 {
572 Encoding enc;
573
574 enc.rows = 94;
575 enc.cols = 94;
576 enc.row_byte = row_byte_normal;
577 enc.col_byte = col_byte_normal;
578 enc.byte_row = byte_row_normal;
579 enc.byte_col = byte_col_normal;
580 enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
581 enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
582 enc.byte_row_expr = "%1$s - 0x21";
583 enc.byte_col_expr = "%1$s - 0x21";
584
585 read_table(&enc);
586 output_charset2uni(name,&enc);
587 }
588
589 /* CNS 11643 specifics - trick to put two tables into one */
590
/*
 * Returns the "1st byte" value of a row of the combined three-plane table:
 * the 0-based plane number (row / 94) goes into bits 8 and up, the byte
 * 0x21 + (row % 94) into the low bits.
 */
static int row_byte_cns11643 (int row)
{
  int plane = row / 94;
  int row_in_plane = row % 94;

  return 0x100 * plane + row_in_plane + 0x21;
}
/*
 * Converts a combined plane/1st-byte value to a row of the three-plane
 * table. The input carries a 1-based plane number in bits 8 and up (0x1xx,
 * 0x2xx, 0x3xx) and a byte 0x21..0x7e in the low 8 bits; plane p, byte b maps
 * to row (p-1)*94 + (b-0x21). Returns -1 for everything else, including low
 * bytes outside 0x21..0x7e, which would otherwise spill into a neighbouring
 * plane or past the last row.
 */
static int byte_row_cns11643 (int byte)
{
  int plane, low;

  if (byte < 0x100 || byte >= 0x400)
    return -1;
  plane = byte >> 8;
  low = byte & 0xff;
  if (low < 0x21 || low >= 0x7f)
    return -1;
  return (plane - 1) * 94 + (low - 0x21);
}
600
do_cns11643_only_uni2charset(const char * name)601 static void do_cns11643_only_uni2charset (const char* name)
602 {
603 Encoding enc;
604 int j, x;
605
606 enc.rows = 3*94;
607 enc.cols = 94;
608 enc.row_byte = row_byte_cns11643;
609 enc.col_byte = col_byte_normal;
610 enc.byte_row = byte_row_cns11643;
611 enc.byte_col = byte_col_normal;
612 enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
613 enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
614 enc.byte_row_expr = "%1$s - 0x21";
615 enc.byte_col_expr = "%1$s - 0x21";
616
617 read_table(&enc);
618 invert(&enc);
619 /* Move the 2 plane bits into the unused bits 15 and 7. */
620 for (j = 0; j < 0x10000; j++) {
621 x = enc.uni2charset[j];
622 if (x != 0) {
623 if (x & 0x8080) abort();
624 switch (x >> 16) {
625 case 0: /* plane 1 */ x = (x & 0xffff) | 0x0000; break;
626 case 1: /* plane 2 */ x = (x & 0xffff) | 0x0080; break;
627 case 2: /* plane 3 */ x = (x & 0xffff) | 0x8000; break;
628 default: abort();
629 }
630 enc.uni2charset[j] = x;
631 }
632 }
633 output_uni2charset_sparse(name,&enc);
634 }
635
636 /* GBK specifics */
637
/* Returns the 1st byte value of a row; row 0 corresponds to byte 0x81. */
static int row_byte_gbk1 (int row)
{
  const int first_byte = 0x81;

  return row + first_byte;
}
/*
 * Returns the 2nd byte value of a column. Columns 0..0x3e start at byte
 * 0x40; columns from 0x3f on are shifted by one more, which skips byte 0x7f.
 */
static int col_byte_gbk1 (int col)
{
  if (col < 0x3f)
    return col + 0x40;
  return col + 0x41;
}
/* Converts a 1st byte value 0x81..0xfe to a row 0..125, anything else to -1. */
static int byte_row_gbk1 (int byte)
{
  return (byte < 0x81 || byte > 0xfe) ? -1 : byte - 0x81;
}
/*
 * Converts a 2nd byte value to a column: 0x40..0x7e map to 0..0x3e,
 * 0x80..0xfe map to 0x3f..0xbd, anything else to -1.
 */
static int byte_col_gbk1 (int byte)
{
  int col = -1;

  if (byte >= 0x40 && byte <= 0x7e)
    col = byte - 0x40;
  else if (byte >= 0x80 && byte <= 0xfe)
    col = byte - 0x41;
  return col;
}
658
do_gbk1(const char * name)659 static void do_gbk1 (const char* name)
660 {
661 Encoding enc;
662
663 enc.rows = 126;
664 enc.cols = 190;
665 enc.row_byte = row_byte_gbk1;
666 enc.col_byte = col_byte_gbk1;
667 enc.byte_row = byte_row_gbk1;
668 enc.byte_col = byte_col_gbk1;
669 enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
670 enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xff)";
671 enc.byte_row_expr = "%1$s - 0x81";
672 enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
673
674 read_table(&enc);
675 output_charset2uni(name,&enc);
676 invert(&enc); output_uni2charset_dense(name,&enc);
677 }
678
do_gbk1_only_charset2uni(const char * name)679 static void do_gbk1_only_charset2uni (const char* name)
680 {
681 Encoding enc;
682
683 enc.rows = 126;
684 enc.cols = 190;
685 enc.row_byte = row_byte_gbk1;
686 enc.col_byte = col_byte_gbk1;
687 enc.byte_row = byte_row_gbk1;
688 enc.byte_col = byte_col_gbk1;
689 enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
690 enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xff)";
691 enc.byte_row_expr = "%1$s - 0x81";
692 enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
693
694 read_table(&enc);
695 output_charset2uni(name,&enc);
696 }
697
/* Returns the 1st byte value of a row; row 0 corresponds to byte 0x81. */
static int row_byte_gbk2 (int row)
{
  const int first_byte = 0x81;

  return row + first_byte;
}
/*
 * Returns the 2nd byte value of a column. Columns 0..0x3e start at byte
 * 0x40; columns from 0x3f on are shifted by one more, which skips byte 0x7f.
 */
static int col_byte_gbk2 (int col)
{
  if (col < 0x3f)
    return col + 0x40;
  return col + 0x41;
}
/* Converts a 1st byte value 0x81..0xfe to a row 0..125, anything else to -1. */
static int byte_row_gbk2 (int byte)
{
  return (byte < 0x81 || byte > 0xfe) ? -1 : byte - 0x81;
}
/*
 * Converts a 2nd byte value to a column: 0x40..0x7e map to 0..0x3e,
 * 0x80..0xa0 map to 0x3f..0x5f, anything else to -1.
 */
static int byte_col_gbk2 (int byte)
{
  int col = -1;

  if (byte >= 0x40 && byte <= 0x7e)
    col = byte - 0x40;
  else if (byte >= 0x80 && byte <= 0xa0)
    col = byte - 0x41;
  return col;
}
718
do_gbk2_only_charset2uni(const char * name)719 static void do_gbk2_only_charset2uni (const char* name)
720 {
721 Encoding enc;
722
723 enc.rows = 126;
724 enc.cols = 96;
725 enc.row_byte = row_byte_gbk2;
726 enc.col_byte = col_byte_gbk2;
727 enc.byte_row = byte_row_gbk2;
728 enc.byte_col = byte_col_gbk2;
729 enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
730 enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xa1)";
731 enc.byte_row_expr = "%1$s - 0x81";
732 enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
733
734 read_table(&enc);
735 output_charset2uni(name,&enc);
736 }
737
do_gbk1_only_uni2charset(const char * name)738 static void do_gbk1_only_uni2charset (const char* name)
739 {
740 Encoding enc;
741
742 enc.rows = 126;
743 enc.cols = 190;
744 enc.row_byte = row_byte_gbk1;
745 enc.col_byte = col_byte_gbk1;
746 enc.byte_row = byte_row_gbk1;
747 enc.byte_col = byte_col_gbk1;
748 enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
749 enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xff)";
750 enc.byte_row_expr = "%1$s - 0x81";
751 enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
752
753 read_table(&enc);
754 invert(&enc); output_uni2charset_sparse(name,&enc);
755 }
756
757 /* KSC 5601 specifics */
758
759 /*
760 * Reads the charset2uni table from standard input.
761 */
read_table_ksc5601(Encoding * enc)762 static void read_table_ksc5601 (Encoding* enc)
763 {
764 int row, col, i, i1, i2, c, j;
765
766 enc->charset2uni = (int**) malloc(enc->rows*sizeof(int*));
767 for (row = 0; row < enc->rows; row++)
768 enc->charset2uni[row] = (int*) malloc(enc->cols*sizeof(int));
769
770 for (row = 0; row < enc->rows; row++)
771 for (col = 0; col < enc->cols; col++)
772 enc->charset2uni[row][col] = 0xfffd;
773
774 c = getc(stdin);
775 ungetc(c,stdin);
776 if (c == '#') {
777 /* Read a unicode.org style .TXT file. */
778 for (;;) {
779 c = getc(stdin);
780 if (c == EOF)
781 break;
782 if (c == '\n' || c == ' ' || c == '\t')
783 continue;
784 if (c == '#') {
785 do { c = getc(stdin); } while (!(c == EOF || c == '\n'));
786 continue;
787 }
788 ungetc(c,stdin);
789 if (scanf("0x%x", &j) != 1)
790 exit(1);
791 i1 = j >> 8;
792 i2 = j & 0xff;
793 if (scanf(" 0x%x", &j) != 1)
794 exit(1);
795 /* Take only the range covered by KS C 5601.1987-0 = KS C 5601.1989-0
796 = KS X 1001.1992, ignore the rest. */
797 if (!(i1 >= 128+33 && i1 < 128+127 && i2 >= 128+33 && i2 < 128+127))
798 continue; /* KSC5601 specific */
799 i1 &= 0x7f; /* KSC5601 specific */
800 i2 &= 0x7f; /* KSC5601 specific */
801 row = enc->byte_row(i1);
802 col = enc->byte_col(i2);
803 if (row < 0 || col < 0) {
804 fprintf(stderr, "lost entry for %02x %02x\n", i1, i2);
805 exit(1);
806 }
807 enc->charset2uni[row][col] = j;
808 }
809 } else {
810 /* Read a table of hexadecimal Unicode values. */
811 for (i1 = 33; i1 < 127; i1++)
812 for (i2 = 33; i2 < 127; i2++) {
813 i = scanf("%x", &j);
814 if (i == EOF)
815 goto read_done;
816 if (i != 1)
817 exit(1);
818 if (j < 0 || j == 0xffff)
819 j = 0xfffd;
820 if (j != 0xfffd) {
821 if (enc->byte_row(i1) < 0 || enc->byte_col(i2) < 0) {
822 fprintf(stderr, "lost entry at %02x %02x\n", i1, i2);
823 exit (1);
824 }
825 enc->charset2uni[enc->byte_row(i1)][enc->byte_col(i2)] = j;
826 }
827 }
828 read_done: ;
829 }
830 }
831
do_ksc5601(const char * name)832 static void do_ksc5601 (const char* name)
833 {
834 Encoding enc;
835
836 enc.rows = 94;
837 enc.cols = 94;
838 enc.row_byte = row_byte_normal;
839 enc.col_byte = col_byte_normal;
840 enc.byte_row = byte_row_normal;
841 enc.byte_col = byte_col_normal;
842 enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
843 enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
844 enc.byte_row_expr = "%1$s - 0x21";
845 enc.byte_col_expr = "%1$s - 0x21";
846
847 read_table_ksc5601(&enc);
848 output_charset2uni(name,&enc);
849 invert(&enc); output_uni2charset_sparse(name,&enc);
850 }
851
852 /* Big5 specifics */
853
/* Returns the 1st byte value of a row; row 0 corresponds to byte 0xa1. */
static int row_byte_big5 (int row)
{
  const int first_byte = 0xa1;

  return row + first_byte;
}
/*
 * Returns the 2nd byte value of a column. Columns 0..0x3e start at byte
 * 0x40; columns from 0x3f on start at byte 0xa1.
 */
static int col_byte_big5 (int col)
{
  if (col < 0x3f)
    return col + 0x40;
  return col + 0x62;
}
/* Converts a 1st byte value 0xa1..0xfe to a row 0..93, anything else to -1. */
static int byte_row_big5 (int byte)
{
  return (byte < 0xa1 || byte > 0xfe) ? -1 : byte - 0xa1;
}
/*
 * Converts a 2nd byte value to a column: 0x40..0x7e map to 0..0x3e,
 * 0xa1..0xfe map to 0x3f..0x9c, anything else to -1.
 */
static int byte_col_big5 (int byte)
{
  int col = -1;

  if (byte >= 0x40 && byte <= 0x7e)
    col = byte - 0x40;
  else if (byte >= 0xa1 && byte <= 0xfe)
    col = byte - 0x62;
  return col;
}
874
do_big5(const char * name)875 static void do_big5 (const char* name)
876 {
877 Encoding enc;
878
879 enc.rows = 94;
880 enc.cols = 157;
881 enc.row_byte = row_byte_big5;
882 enc.col_byte = col_byte_big5;
883 enc.byte_row = byte_row_big5;
884 enc.byte_col = byte_col_big5;
885 enc.check_row_expr = "%1$s >= 0xa1 && %1$s < 0xff";
886 enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0xa1 && %1$s < 0xff)";
887 enc.byte_row_expr = "%1$s - 0xa1";
888 enc.byte_col_expr = "%1$s - (%1$s >= 0xa1 ? 0x62 : 0x40)";
889
890 read_table(&enc);
891 output_charset2uni(name,&enc);
892 invert(&enc); output_uni2charset_sparse(name,&enc);
893 }
894
895 /* Johab Hangul specifics */
896
/* Returns the 1st byte value of a row; row 0 corresponds to byte 0x84. */
static int row_byte_johab_hangul (int row)
{
  const int first_byte = 0x84;

  return row + first_byte;
}
/*
 * Returns the 2nd byte value of a column. Columns 0..0x3d start at byte
 * 0x41; columns from 0x3e on start at byte 0x81.
 */
static int col_byte_johab_hangul (int col)
{
  if (col < 0x3e)
    return col + 0x41;
  return col + 0x43;
}
/* Converts a 1st byte value 0x84..0xd3 to a row 0..79, anything else to -1. */
static int byte_row_johab_hangul (int byte)
{
  return (byte < 0x84 || byte > 0xd3) ? -1 : byte - 0x84;
}
/*
 * Converts a 2nd byte value to a column: 0x41..0x7e map to 0..0x3d,
 * 0x81..0xfe map to 0x3e..0xbb, anything else to -1.
 */
static int byte_col_johab_hangul (int byte)
{
  int col = -1;

  if (byte >= 0x41 && byte <= 0x7e)
    col = byte - 0x41;
  else if (byte >= 0x81 && byte <= 0xfe)
    col = byte - 0x43;
  return col;
}
917
do_johab_hangul(const char * name)918 static void do_johab_hangul (const char* name)
919 {
920 Encoding enc;
921
922 enc.rows = 80;
923 enc.cols = 188;
924 enc.row_byte = row_byte_johab_hangul;
925 enc.col_byte = col_byte_johab_hangul;
926 enc.byte_row = byte_row_johab_hangul;
927 enc.byte_col = byte_col_johab_hangul;
928 enc.check_row_expr = "%1$s >= 0x84 && %1$s < 0xd4";
929 enc.check_col_expr = "(%1$s >= 0x41 && %1$s < 0x7f) || (%1$s >= 0x81 && %1$s < 0xff)";
930 enc.byte_row_expr = "%1$s - 0x84";
931 enc.byte_col_expr = "%1$s - (%1$s >= 0x81 ? 0x43 : 0x41)";
932
933 read_table(&enc);
934 output_charset2uni(name,&enc);
935 invert(&enc); output_uni2charset_dense(name,&enc);
936 }
937
938 /* SJIS specifics */
939
/*
 * Returns the 1st byte value of a row. Rows 0..0x1e start at byte 0x81;
 * rows from 0x1f on start at byte 0xe0.
 */
static int row_byte_sjis (int row)
{
  if (row < 0x1f)
    return row + 0x81;
  return row + 0xc1;
}
/*
 * Returns the 2nd byte value of a column. Columns 0..0x3e start at byte
 * 0x40; columns from 0x3f on are shifted by one more, which skips byte 0x7f.
 */
static int col_byte_sjis (int col)
{
  if (col < 0x3f)
    return col + 0x40;
  return col + 0x41;
}
/*
 * Converts a 1st byte value to a row: 0x81..0x9f map to 0..0x1e,
 * 0xe0..0xff map to 0x1f..0x3e, anything else to -1.
 * The upper bound 0xff keeps values that are not a single byte (e.g. from
 * a malformed table entry with more than four hex digits) from producing a
 * row at or beyond the 94 rows of the table.
 */
static int byte_row_sjis (int byte)
{
  if (byte >= 0x81 && byte < 0xa0)
    return byte-0x81;
  else if (byte >= 0xe0 && byte <= 0xff)
    return byte-0xc1;
  else
    return -1;
}
/*
 * Converts a 2nd byte value to a column: 0x40..0x7e map to 0..0x3e,
 * 0x80..0xfc map to 0x3f..0xbb, anything else to -1.
 */
static int byte_col_sjis (int byte)
{
  int col = -1;

  if (byte >= 0x40 && byte <= 0x7e)
    col = byte - 0x40;
  else if (byte >= 0x80 && byte <= 0xfc)
    col = byte - 0x41;
  return col;
}
962
do_sjis(const char * name)963 static void do_sjis (const char* name)
964 {
965 Encoding enc;
966
967 enc.rows = 94;
968 enc.cols = 188;
969 enc.row_byte = row_byte_sjis;
970 enc.col_byte = col_byte_sjis;
971 enc.byte_row = byte_row_sjis;
972 enc.byte_col = byte_col_sjis;
973 enc.check_row_expr = "(%1$s >= 0x81 && %1$s < 0xa0) || (%1$s >= 0xe0)";
974 enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xfd)";
975 enc.byte_row_expr = "%1$s - (%1$s >= 0xe0 ? 0xc1 : 0x81)";
976 enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
977
978 read_table(&enc);
979 output_charset2uni(name,&enc);
980 invert(&enc); output_uni2charset_sparse(name,&enc);
981 }
982
983 /* Main program */
984
main(int argc,char * argv[])985 int main (int argc, char *argv[])
986 {
987 const char* charsetname;
988 const char* name;
989
990 if (argc != 3)
991 exit(1);
992 charsetname = argv[1];
993 name = argv[2];
994
995 output_title(charsetname);
996
997 if (!strcmp(name,"gb2312") || !strcmp(name,"gb12345ext")
998 || !strcmp(name,"jisx0208") || !strcmp(name,"jisx0212"))
999 do_normal(name);
1000 else if (!strcmp(name,"cns11643_1") || !strcmp(name,"cns11643_2")
1001 || !strcmp(name,"cns11643_3"))
1002 do_normal_only_charset2uni(name);
1003 else if (!strcmp(name,"cns11643_inv"))
1004 do_cns11643_only_uni2charset(name);
1005 else if (!strcmp(name,"gbkext1"))
1006 do_gbk1_only_charset2uni(name);
1007 else if (!strcmp(name,"gbkext2"))
1008 do_gbk2_only_charset2uni(name);
1009 else if (!strcmp(name,"gbkext_inv"))
1010 do_gbk1_only_uni2charset(name);
1011 else if (!strcmp(name,"cp936ext"))
1012 do_gbk1(name);
1013 else if (!strcmp(name,"ksc5601"))
1014 do_ksc5601(name);
1015 else if (!strcmp(name,"big5") || !strcmp(name,"cp950ext"))
1016 do_big5(name);
1017 else if (!strcmp(name,"johab_hangul"))
1018 do_johab_hangul(name);
1019 else if (!strcmp(name,"cp932ext"))
1020 do_sjis(name);
1021 else
1022 exit(1);
1023
1024 return 0;
1025 }
1026