1 /* $XFree86: xc/lib/X11/lcUniConv/cjk_tab_to_h.c,v 1.2 2000/12/04 18:49:31 dawes Exp $ */
2 
3 /*
4  * Generates a CJK character set table from a .TXT table as found on
5  * ftp.unicode.org or in the X nls directory.
6  * Examples:
7  *
8  *   ./cjk_tab_to_h GB2312.1980-0 gb2312 > gb2312.h < gb2312
9  *   ./cjk_tab_to_h JISX0208.1983-0 jisx0208 > jisx0208.h < jis0208
10  *   ./cjk_tab_to_h KSC5601.1987-0 ksc5601 > ksc5601.h < ksc5601
11  *
12  *   ./cjk_tab_to_h GB2312.1980-0 gb2312 > gb2312.h < GB2312.TXT
13  *   ./cjk_tab_to_h JISX0208.1983-0 jisx0208 > jisx0208.h < JIS0208.TXT
14  *   ./cjk_tab_to_h JISX0212.1990-0 jisx0212 > jisx0212.h < JIS0212.TXT
15  *   ./cjk_tab_to_h KSC5601.1987-0 ksc5601 > ksc5601.h < KSC5601.TXT
16  *   ./cjk_tab_to_h KSX1001.1992-0 ksc5601 > ksc5601.h < KSX1001.TXT
17  *
18  *   ./cjk_tab_to_h BIG5 big5 > big5.h < BIG5.TXT
19  *
 *   ./cjk_tab_to_h JOHAB johab_hangul > johab_hangul.h < JOHAB.TXT
21  */
22 
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <stdbool.h>
26 #include <string.h>
27 
/* A half-open range [start, end) of table indices. */
typedef struct Block {
  int start;   /* first index that belongs to the block */
  int end;     /* one past the last index that belongs to the block */
} Block;
32 
33 typedef struct {
34   int rows;    /* number of possible values for the 1st byte */
35   int cols;    /* number of possible values for the 2nd byte */
36   int (*row_byte) (int row); /* returns the 1st byte value for a given row */
37   int (*col_byte) (int col); /* returns the 2nd byte value for a given col */
38   int (*byte_row) (int byte); /* converts a 1st byte value to a row, else -1 */
39   int (*byte_col) (int byte); /* converts a 2nd byte value to a col, else -1 */
40   const char* check_row_expr; /* format string for 1st byte value checking */
41   const char* check_col_expr; /* format string for 2nd byte value checking */
42   const char* byte_row_expr; /* format string for 1st byte value to row */
43   const char* byte_col_expr; /* format string for 2nd byte value to col */
44   int** charset2uni; /* charset2uni[0..rows-1][0..cols-1] is valid */
45   /* You'll understand the terms "row" and "col" when you buy Ken Lunde's book.
46      Once a row is fixed, choosing a "col" is the same as choosing a "cell". */
47   int* charsetpage; /* charsetpage[0..rows]: how large is a page for a row */
48   int ncharsetblocks;
49   Block* charsetblocks; /* blocks[0..nblocks-1] */
50   int* uni2charset; /* uni2charset[0x0000..0xffff] */
51 } Encoding;
52 
/*
 * Outputs the file title: an empty line, a three-line C comment holding
 * the character set name, and another empty line.
 */
static void output_title (const char *charsetname)
{
  printf("\n/*\n * %s\n */\n\n", charsetname);
}
64 
65 /*
66  * Reads the charset2uni table from standard input.
67  */
read_table(Encoding * enc)68 static void read_table (Encoding* enc)
69 {
70   int row, col, i, i1, i2, c, j;
71 
72   enc->charset2uni = (int**) malloc(enc->rows*sizeof(int*));
73   for (row = 0; row < enc->rows; row++)
74     enc->charset2uni[row] = (int*) malloc(enc->cols*sizeof(int));
75 
76   for (row = 0; row < enc->rows; row++)
77     for (col = 0; col < enc->cols; col++)
78       enc->charset2uni[row][col] = 0xfffd;
79 
80   c = getc(stdin);
81   ungetc(c,stdin);
82   if (c == '#') {
83     /* Read a unicode.org style .TXT file. */
84     for (;;) {
85       c = getc(stdin);
86       if (c == EOF)
87         break;
88       if (c == '\n' || c == ' ' || c == '\t')
89         continue;
90       if (c == '#') {
91         do { c = getc(stdin); } while (!(c == EOF || c == '\n'));
92         continue;
93       }
94       ungetc(c,stdin);
95       if (scanf("0x%x", &j) != 1)
96         exit(1);
97       i1 = j >> 8;
98       i2 = j & 0xff;
99       row = enc->byte_row(i1);
100       col = enc->byte_col(i2);
101       if (row < 0 || col < 0) {
102         fprintf(stderr, "lost entry for %02x %02x\n", i1, i2);
103         exit(1);
104       }
105       if (scanf(" 0x%x", &enc->charset2uni[row][col]) != 1)
106         exit(1);
107     }
108   } else {
109     /* Read a table of hexadecimal Unicode values. */
110     for (i1 = 32; i1 < 132; i1++)
111       for (i2 = 32; i2 < 132; i2++) {
112         i = scanf("%x", &j);
113         if (i == EOF)
114           goto read_done;
115         if (i != 1)
116           exit(1);
117         if (j < 0 || j == 0xffff)
118           j = 0xfffd;
119         if (j != 0xfffd) {
120           if (enc->byte_row(i1) < 0 || enc->byte_col(i2) < 0) {
121             fprintf(stderr, "lost entry at %02x %02x\n", i1, i2);
122             exit (1);
123           }
124           enc->charset2uni[enc->byte_row(i1)][enc->byte_col(i2)] = j;
125         }
126       }
127    read_done: ;
128   }
129 }
130 
131 /*
132  * Computes the charsetpage[0..rows] array.
133  */
find_charset2uni_pages(Encoding * enc)134 static void find_charset2uni_pages (Encoding* enc)
135 {
136   int row, col;
137 
138   enc->charsetpage = (int*) malloc((enc->rows+1)*sizeof(int));
139 
140   for (row = 0; row <= enc->rows; row++)
141     enc->charsetpage[row] = 0;
142 
143   for (row = 0; row < enc->rows; row++) {
144     int used = 0;
145     for (col = 0; col < enc->cols; col++)
146       if (enc->charset2uni[row][col] != 0xfffd)
147         used = col+1;
148     enc->charsetpage[row] = used;
149   }
150 }
151 
152 /*
153  * Fills in nblocks and blocks.
154  */
find_charset2uni_blocks(Encoding * enc)155 static void find_charset2uni_blocks (Encoding* enc)
156 {
157   int n, row, lastrow;
158 
159   enc->charsetblocks = (Block*) malloc(enc->rows*sizeof(Block));
160 
161   n = 0;
162   for (row = 0; row < enc->rows; row++)
163     if (enc->charsetpage[row] > 0 && (row == 0 || enc->charsetpage[row-1] == 0)) {
164       for (lastrow = row; enc->charsetpage[lastrow+1] > 0; lastrow++);
165       enc->charsetblocks[n].start = row * enc->cols;
166       enc->charsetblocks[n].end = lastrow * enc->cols + enc->charsetpage[lastrow];
167       n++;
168     }
169   enc->ncharsetblocks = n;
170 }
171 
172 /*
173  * Outputs the charset to unicode table and function.
174  */
output_charset2uni(const char * name,Encoding * enc)175 static void output_charset2uni (const char* name, Encoding* enc)
176 {
177   int row, col, lastrow, col_max, i, i1_min, i1_max;
178 
179   find_charset2uni_pages(enc);
180 
181   find_charset2uni_blocks(enc);
182 
183   for (row = 0; row < enc->rows; row++)
184     if (enc->charsetpage[row] > 0) {
185       if (row == 0 || enc->charsetpage[row-1] == 0) {
186         /* Start a new block. */
187         for (lastrow = row; enc->charsetpage[lastrow+1] > 0; lastrow++);
188         printf("static const unsigned short %s_2uni_page%02x[%d] = {\n",
189                name, enc->row_byte(row),
190                (lastrow-row) * enc->cols + enc->charsetpage[lastrow]);
191       }
192       printf("  /""* 0x%02x *""/\n ", enc->row_byte(row));
193       col_max = (enc->charsetpage[row+1] > 0 ? enc->cols : enc->charsetpage[row]);
194       for (col = 0; col < col_max; col++) {
195         printf(" 0x%04x,", enc->charset2uni[row][col]);
196         if ((col % 8) == 7 && (col+1 < col_max)) printf("\n ");
197       }
198       printf("\n");
199       if (enc->charsetpage[row+1] == 0) {
200         /* End a block. */
201         printf("};\n");
202       }
203     }
204   printf("\n");
205 
206   printf("static int\n");
207   printf("%s_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)\n", name);
208   printf("{\n");
209   printf("  unsigned char c1 = s[0];\n");
210   printf("  if (");
211   for (i = 0; i < enc->ncharsetblocks; i++) {
212     i1_min = enc->row_byte(enc->charsetblocks[i].start / enc->cols);
213     i1_max = enc->row_byte((enc->charsetblocks[i].end-1) / enc->cols);
214     if (i > 0)
215       printf(" || ");
216     if (i1_min == i1_max)
217       printf("(c1 == 0x%02x)", i1_min);
218     else
219       printf("(c1 >= 0x%02x && c1 <= 0x%02x)", i1_min, i1_max);
220   }
221   printf(") {\n");
222   printf("    if (n >= 2) {\n");
223   printf("      unsigned char c2 = s[1];\n");
224   printf("      if (");
225   printf(enc->check_col_expr, "c2");
226   printf(") {\n");
227   printf("        unsigned int i = %d * (", enc->cols);
228   printf(enc->byte_row_expr, "c1");
229   printf(") + (");
230   printf(enc->byte_col_expr, "c2");
231   printf(");\n");
232   printf("        unsigned short wc = 0xfffd;\n");
233   for (i = 0; i < enc->ncharsetblocks; i++) {
234     printf("        ");
235     if (i > 0)
236       printf("} else ");
237     if (i < enc->ncharsetblocks-1)
238       printf("if (i < %d) ", enc->charsetblocks[i+1].start);
239     printf("{\n");
240     printf("          if (i < %d)\n", enc->charsetblocks[i].end);
241     printf("            wc = %s_2uni_page%02x[i", name, enc->row_byte(enc->charsetblocks[i].start / enc->cols));
242     if (enc->charsetblocks[i].start > 0)
243       printf("-%d", enc->charsetblocks[i].start);
244     printf("];\n");
245   }
246   printf("        }\n");
247   printf("        if (wc != 0xfffd) {\n");
248   printf("          *pwc = (ucs4_t) wc;\n");
249   printf("          return 2;\n");
250   printf("        }\n");
251   printf("      }\n");
252   printf("      return RET_ILSEQ;\n");
253   printf("    }\n");
254   printf("    return RET_TOOFEW(0);\n");
255   printf("  }\n");
256   printf("  return RET_ILSEQ;\n");
257   printf("}\n");
258   printf("\n");
259 }
260 
261 /*
262  * Computes the uni2charset[0x0000..0xffff] array.
263  */
invert(Encoding * enc)264 static void invert (Encoding* enc)
265 {
266   int row, col, j;
267 
268   enc->uni2charset = (int*) malloc(0x10000*sizeof(int));
269 
270   for (j = 0; j < 0x10000; j++)
271     enc->uni2charset[j] = 0;
272 
273   for (row = 0; row < enc->rows; row++)
274     for (col = 0; col < enc->cols; col++) {
275       j = enc->charset2uni[row][col];
276       if (j != 0xfffd)
277         enc->uni2charset[j] = 0x100 * enc->row_byte(row) + enc->col_byte(col);
278     }
279 }
280 
281 /*
282  * Outputs the unicode to charset table and function, using a linear array.
283  * (Suitable if the table is dense.)
284  */
output_uni2charset_dense(const char * name,Encoding * enc)285 static void output_uni2charset_dense (const char* name, Encoding* enc)
286 {
287   /* Like in 8bit_tab_to_h.c */
288   bool pages[0x100];
289   int line[0x2000];
290   int tableno;
291   struct { int minline; int maxline; int usecount; } tables[0x2000];
292   bool first;
293   int row, col, j, p, j1, j2, t;
294 
295   for (p = 0; p < 0x100; p++)
296     pages[p] = false;
297   for (row = 0; row < enc->rows; row++)
298     for (col = 0; col < enc->cols; col++) {
299       j = enc->charset2uni[row][col];
300       if (j != 0xfffd)
301         pages[j>>8] = true;
302     }
303   for (j1 = 0; j1 < 0x2000; j1++) {
304     bool all_invalid = true;
305     for (j2 = 0; j2 < 8; j2++) {
306       j = 8*j1+j2;
307       if (enc->uni2charset[j] != 0)
308         all_invalid = false;
309     }
310     if (all_invalid)
311       line[j1] = -1;
312     else
313       line[j1] = 0;
314   }
315   tableno = 0;
316   for (j1 = 0; j1 < 0x2000; j1++) {
317     if (line[j1] >= 0) {
318       if (tableno > 0
319           && ((j1 > 0 && line[j1-1] == tableno-1)
320               || ((tables[tableno-1].maxline >> 5) == (j1 >> 5)
321                   && j1 - tables[tableno-1].maxline <= 8))) {
322         line[j1] = tableno-1;
323         tables[tableno-1].maxline = j1;
324       } else {
325         tableno++;
326         line[j1] = tableno-1;
327         tables[tableno-1].minline = tables[tableno-1].maxline = j1;
328       }
329     }
330   }
331   for (t = 0; t < tableno; t++) {
332     tables[t].usecount = 0;
333     j1 = 8*tables[t].minline;
334     j2 = 8*(tables[t].maxline+1);
335     for (j = j1; j < j2; j++)
336       if (enc->uni2charset[j] != 0)
337         tables[t].usecount++;
338   }
339   {
340     p = -1;
341     for (t = 0; t < tableno; t++)
342       if (tables[t].usecount > 1) {
343         p = tables[t].minline >> 5;
344         printf("static const unsigned short %s_page%02x[%d] = {\n", name, p, 8*(tables[t].maxline-tables[t].minline+1));
345         for (j1 = tables[t].minline; j1 <= tables[t].maxline; j1++) {
346           if ((j1 % 0x20) == 0 && j1 > tables[t].minline)
347             printf("  /* 0x%04x */\n", 8*j1);
348           printf(" ");
349           for (j2 = 0; j2 < 8; j2++) {
350             j = 8*j1+j2;
351             printf(" 0x%04x,", enc->uni2charset[j]);
352           }
353           printf(" /*0x%02x-0x%02x*/\n", 8*(j1 % 0x20), 8*(j1 % 0x20)+7);
354         }
355         printf("};\n");
356       }
357     if (p >= 0)
358       printf("\n");
359   }
360   printf("static int\n%s_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)\n", name);
361   printf("{\n");
362   printf("  if (n >= 2) {\n");
363   printf("    unsigned short c = 0;\n");
364   first = true;
365   for (j1 = 0; j1 < 0x2000;) {
366     t = line[j1];
367     for (j2 = j1; j2 < 0x2000 && line[j2] == t; j2++);
368     if (t >= 0) {
369       if (j1 != tables[t].minline) abort();
370       if (j2 > tables[t].maxline+1) abort();
371       j2 = tables[t].maxline+1;
372       if (first)
373         printf("    ");
374       else
375         printf("    else ");
376       first = false;
377       if (tables[t].usecount == 0) abort();
378       if (tables[t].usecount == 1) {
379         if (j2 != j1+1) abort();
380         for (j = 8*j1; j < 8*j2; j++)
381           if (enc->uni2charset[j] != 0) {
382             printf("if (wc == 0x%04x)\n      c = 0x%02x;\n", j, enc->uni2charset[j]);
383             break;
384           }
385       } else {
386         if (j1 == 0) {
387           printf("if (wc < 0x%04x)", 8*j2);
388         } else {
389           printf("if (wc >= 0x%04x && wc < 0x%04x)", 8*j1, 8*j2);
390         }
391         printf("\n      c = %s_page%02x[wc", name, j1 >> 5);
392         if (tables[t].minline > 0)
393           printf("-0x%04x", 8*j1);
394         printf("];\n");
395       }
396     }
397     j1 = j2;
398   }
399   printf("    if (c != 0) {\n");
400   printf("      r[0] = (c >> 8); r[1] = (c & 0xff);\n");
401   printf("      return 2;\n");
402   printf("    }\n");
403   printf("    return RET_ILSEQ;\n");
404   printf("  }\n");
405   printf("  return RET_TOOSMALL;\n");
406   printf("}\n");
407 }
408 
409 /*
410  * Outputs the unicode to charset table and function, using a packed array.
411  * (Suitable if the table is sparse.)
412  */
output_uni2charset_sparse(const char * name,Encoding * enc)413 static void output_uni2charset_sparse (const char* name, Encoding* enc)
414 {
415   bool pages[0x100];
416   Block pageblocks[0x100]; int npageblocks;
417   int indx2charset[0x10000];
418   int summary_indx[0x1000];
419   int summary_used[0x1000];
420   int i, row, col, j, p, j1, j2, indx;
421 
422   /* Fill pages[0x100]. */
423   for (p = 0; p < 0x100; p++)
424     pages[p] = false;
425   for (row = 0; row < enc->rows; row++)
426     for (col = 0; col < enc->cols; col++) {
427       j = enc->charset2uni[row][col];
428       if (j != 0xfffd)
429         pages[j>>8] = true;
430     }
431 
432 #if 0
433   for (p = 0; p < 0x100; p++)
434     if (pages[p]) {
435       printf("static const unsigned short %s_page%02x[256] = {\n", name, p);
436       for (j1 = 0; j1 < 32; j1++) {
437         printf("  ");
438         for (j2 = 0; j2 < 8; j2++)
439           printf("0x%04x, ", enc->uni2charset[256*p+8*j1+j2]);
440         printf("/""*0x%02x-0x%02x*""/\n", 8*j1, 8*j1+7);
441       }
442       printf("};\n");
443     }
444   printf("\n");
445 #endif
446 
447   /* Fill summary_indx[] and summary_used[]. */
448   indx = 0;
449   for (j1 = 0; j1 < 0x1000; j1++) {
450     summary_indx[j1] = indx;
451     summary_used[j1] = 0;
452     for (j2 = 0; j2 < 16; j2++) {
453       j = 16*j1+j2;
454       if (enc->uni2charset[j] != 0) {
455         indx2charset[indx++] = enc->uni2charset[j];
456         summary_used[j1] |= (1 << j2);
457       }
458     }
459   }
460 
461   /* Fill npageblocks and pageblocks[]. */
462   npageblocks = 0;
463   for (p = 0; p < 0x100; ) {
464     if (pages[p] && (p == 0 || !pages[p-1])) {
465       pageblocks[npageblocks].start = 16*p;
466       do p++; while (p < 0x100 && pages[p]);
467       j1 = 16*p;
468       while (summary_used[j1-1] == 0) j1--;
469       pageblocks[npageblocks].end = j1;
470       npageblocks++;
471     } else
472       p++;
473   }
474 
475   printf("static const unsigned short %s_2charset[%d] = {\n", name, indx);
476   for (i = 0; i < indx; ) {
477     if ((i % 8) == 0) printf(" ");
478     printf(" 0x%04x,", indx2charset[i]);
479     i++;
480     if ((i % 8) == 0 || i == indx) printf("\n");
481   }
482   printf("};\n");
483   printf("\n");
484   for (i = 0; i < npageblocks; i++) {
485     printf("static const Summary16 %s_uni2indx_page%02x[%d] = {\n", name,
486            pageblocks[i].start/16, pageblocks[i].end-pageblocks[i].start);
487     for (j1 = pageblocks[i].start; j1 < pageblocks[i].end; ) {
488       if (((16*j1) % 0x100) == 0) printf("  /""* 0x%04x *""/\n", 16*j1);
489       if ((j1 % 4) == 0) printf(" ");
490       printf(" { %4d, 0x%04x },", summary_indx[j1], summary_used[j1]);
491       j1++;
492       if ((j1 % 4) == 0 || j1 == pageblocks[i].end) printf("\n");
493     }
494     printf("};\n");
495   }
496   printf("\n");
497 
498   printf("static int\n");
499   printf("%s_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)\n", name);
500   printf("{\n");
501   printf("  if (n >= 2) {\n");
502   printf("    const Summary16 *summary = NULL;\n");
503   for (i = 0; i < npageblocks; i++) {
504     printf("    ");
505     if (i > 0)
506       printf("else ");
507     printf("if (wc >= 0x%04x && wc < 0x%04x)\n",
508            16*pageblocks[i].start, 16*pageblocks[i].end);
509     printf("      summary = &%s_uni2indx_page%02x[(wc>>4)", name,
510            pageblocks[i].start/16);
511     if (pageblocks[i].start > 0)
512       printf("-0x%03x", pageblocks[i].start);
513     printf("];\n");
514   }
515   printf("    if (summary) {\n");
516   printf("      unsigned short used = summary->used;\n");
517   printf("      unsigned int i = wc & 0x0f;\n");
518   printf("      if (used & ((unsigned short) 1 << i)) {\n");
519   printf("        unsigned short c;\n");
520   printf("        /* Keep in `used' only the bits 0..i-1. */\n");
521   printf("        used &= ((unsigned short) 1 << i) - 1;\n");
522   printf("        /* Add `summary->indx' and the number of bits set in `used'. */\n");
523   printf("        used = (used & 0x5555) + ((used & 0xaaaa) >> 1);\n");
524   printf("        used = (used & 0x3333) + ((used & 0xcccc) >> 2);\n");
525   printf("        used = (used & 0x0f0f) + ((used & 0xf0f0) >> 4);\n");
526   printf("        used = (used & 0x00ff) + (used >> 8);\n");
527   printf("        c = %s_2charset[summary->indx + used];\n", name);
528   printf("        r[0] = (c >> 8); r[1] = (c & 0xff);\n");
529   printf("        return 2;\n");
530   printf("      }\n");
531   printf("    }\n");
532   printf("    return RET_ILSEQ;\n");
533   printf("  }\n");
534   printf("  return RET_TOOSMALL;\n");
535   printf("}\n");
536 }
537 
538 /* ISO-2022/EUC specifics */
539 
/* Returns the 1st byte value of a row: row 0 is byte 0x21. */
static int row_byte_normal (int row)
{
  return row + 0x21;
}
/* Returns the 2nd byte value of a column: column 0 is byte 0x21. */
static int col_byte_normal (int col)
{
  return col + 0x21;
}
/*
 * Converts a 1st byte value to a row: 0x21..0x7e map to rows 0..93.
 * Returns -1 for any other value, so that callers which only test for a
 * negative result never receive a row outside the 94 row table.
 */
static int byte_row_normal (int byte)
{
  if (byte >= 0x21 && byte < 0x7f)
    return byte-0x21;
  else
    return -1;
}
/*
 * Converts a 2nd byte value to a column: 0x21..0x7e map to columns 0..93.
 * Returns -1 for any other value, so that callers which only test for a
 * negative result never receive a column outside the 94 column table.
 */
static int byte_col_normal (int byte)
{
  if (byte >= 0x21 && byte < 0x7f)
    return byte-0x21;
  else
    return -1;
}
544 
do_normal(const char * name)545 static void do_normal (const char* name)
546 {
547   Encoding enc;
548 
549   enc.rows = 94;
550   enc.cols = 94;
551   enc.row_byte = row_byte_normal;
552   enc.col_byte = col_byte_normal;
553   enc.byte_row = byte_row_normal;
554   enc.byte_col = byte_col_normal;
555   enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
556   enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
557   enc.byte_row_expr = "%1$s - 0x21";
558   enc.byte_col_expr = "%1$s - 0x21";
559 
560   read_table(&enc);
561   output_charset2uni(name,&enc);
562   invert(&enc); output_uni2charset_sparse(name,&enc);
563 }
564 
565 /* Note: On first sight, the jisx0212_2charset[] table seems to be in order,
566    starting from the charset=0x3021/uni=0x4e02 pair. But it's only mostly in
567    order. There are 75 out-of-order values, scattered all throughout the table.
568  */
569 
do_normal_only_charset2uni(const char * name)570 static void do_normal_only_charset2uni (const char* name)
571 {
572   Encoding enc;
573 
574   enc.rows = 94;
575   enc.cols = 94;
576   enc.row_byte = row_byte_normal;
577   enc.col_byte = col_byte_normal;
578   enc.byte_row = byte_row_normal;
579   enc.byte_col = byte_col_normal;
580   enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
581   enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
582   enc.byte_row_expr = "%1$s - 0x21";
583   enc.byte_col_expr = "%1$s - 0x21";
584 
585   read_table(&enc);
586   output_charset2uni(name,&enc);
587 }
588 
589 /* CNS 11643 specifics - trick to put two tables into one */
590 
/*
 * Returns the "1st byte" value of a row of the combined table: each run of
 * 94 rows forms one plane, whose index goes into the bits above the low
 * byte, and the row within the plane gives the low byte, starting at 0x21.
 */
static int row_byte_cns11643 (int row)
{
  int plane = row / 94;
  int row_in_plane = row % 94;

  return 0x100 * plane + row_in_plane + 0x21;
}
/*
 * Converts a "1st byte" value 0x1XX, 0x2XX or 0x3XX to a row of the
 * combined table: the leading digit selects a run of 94 rows and the low
 * byte minus 0x21 is the offset within it. Returns -1 for values outside
 * 0x100..0x3ff.
 */
static int byte_row_cns11643 (int byte)
{
  if (byte < 0x100 || byte >= 0x400)
    return -1;
  return (byte % 0x100) - 0x21 + (byte / 0x100 - 1) * 94;
}
600 
do_cns11643_only_uni2charset(const char * name)601 static void do_cns11643_only_uni2charset (const char* name)
602 {
603   Encoding enc;
604   int j, x;
605 
606   enc.rows = 3*94;
607   enc.cols = 94;
608   enc.row_byte = row_byte_cns11643;
609   enc.col_byte = col_byte_normal;
610   enc.byte_row = byte_row_cns11643;
611   enc.byte_col = byte_col_normal;
612   enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
613   enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
614   enc.byte_row_expr = "%1$s - 0x21";
615   enc.byte_col_expr = "%1$s - 0x21";
616 
617   read_table(&enc);
618   invert(&enc);
619   /* Move the 2 plane bits into the unused bits 15 and 7. */
620   for (j = 0; j < 0x10000; j++) {
621     x = enc.uni2charset[j];
622     if (x != 0) {
623       if (x & 0x8080) abort();
624       switch (x >> 16) {
625         case 0: /* plane 1 */ x = (x & 0xffff) | 0x0000; break;
626         case 1: /* plane 2 */ x = (x & 0xffff) | 0x0080; break;
627         case 2: /* plane 3 */ x = (x & 0xffff) | 0x8000; break;
628         default: abort();
629       }
630       enc.uni2charset[j] = x;
631     }
632   }
633   output_uni2charset_sparse(name,&enc);
634 }
635 
636 /* GBK specifics */
637 
/* Returns the 1st byte value of a row: row 0 is byte 0x81. */
static int row_byte_gbk1 (int row)
{
  return row + 0x81;
}
/*
 * Returns the 2nd byte value of a column. Columns below 0x3f start at
 * byte 0x40; later columns are shifted by one so that byte 0x7f is skipped.
 */
static int col_byte_gbk1 (int col)
{
  if (col < 0x3f)
    return col + 0x40;
  return col + 0x41;
}
/* Converts a 1st byte value 0x81..0xfe to a row 0..125, else -1. */
static int byte_row_gbk1 (int byte)
{
  return (byte >= 0x81 && byte < 0xff) ? byte - 0x81 : -1;
}
/*
 * Converts a 2nd byte value to a column: 0x40..0x7e map to 0..0x3e and
 * 0x80..0xfe map to 0x3f..0xbd. Returns -1 for any other value.
 */
static int byte_col_gbk1 (int byte)
{
  if (byte >= 0x40 && byte < 0x7f)
    return byte - 0x40;
  if (byte >= 0x80 && byte < 0xff)
    return byte - 0x41;
  return -1;
}
658 
do_gbk1(const char * name)659 static void do_gbk1 (const char* name)
660 {
661   Encoding enc;
662 
663   enc.rows = 126;
664   enc.cols = 190;
665   enc.row_byte = row_byte_gbk1;
666   enc.col_byte = col_byte_gbk1;
667   enc.byte_row = byte_row_gbk1;
668   enc.byte_col = byte_col_gbk1;
669   enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
670   enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xff)";
671   enc.byte_row_expr = "%1$s - 0x81";
672   enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
673 
674   read_table(&enc);
675   output_charset2uni(name,&enc);
676   invert(&enc); output_uni2charset_dense(name,&enc);
677 }
678 
do_gbk1_only_charset2uni(const char * name)679 static void do_gbk1_only_charset2uni (const char* name)
680 {
681   Encoding enc;
682 
683   enc.rows = 126;
684   enc.cols = 190;
685   enc.row_byte = row_byte_gbk1;
686   enc.col_byte = col_byte_gbk1;
687   enc.byte_row = byte_row_gbk1;
688   enc.byte_col = byte_col_gbk1;
689   enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
690   enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xff)";
691   enc.byte_row_expr = "%1$s - 0x81";
692   enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
693 
694   read_table(&enc);
695   output_charset2uni(name,&enc);
696 }
697 
/* Returns the 1st byte value of a row: row 0 is byte 0x81. */
static int row_byte_gbk2 (int row)
{
  return row + 0x81;
}
/*
 * Returns the 2nd byte value of a column. Columns below 0x3f start at
 * byte 0x40; later columns are shifted by one so that byte 0x7f is skipped.
 */
static int col_byte_gbk2 (int col)
{
  if (col < 0x3f)
    return col + 0x40;
  return col + 0x41;
}
/* Converts a 1st byte value 0x81..0xfe to a row 0..125, else -1. */
static int byte_row_gbk2 (int byte)
{
  return (byte >= 0x81 && byte < 0xff) ? byte - 0x81 : -1;
}
/*
 * Converts a 2nd byte value to a column: 0x40..0x7e map to 0..0x3e and
 * 0x80..0xa0 map to 0x3f..0x5f. Returns -1 for any other value.
 */
static int byte_col_gbk2 (int byte)
{
  if (byte >= 0x40 && byte < 0x7f)
    return byte - 0x40;
  if (byte >= 0x80 && byte < 0xa1)
    return byte - 0x41;
  return -1;
}
718 
do_gbk2_only_charset2uni(const char * name)719 static void do_gbk2_only_charset2uni (const char* name)
720 {
721   Encoding enc;
722 
723   enc.rows = 126;
724   enc.cols = 96;
725   enc.row_byte = row_byte_gbk2;
726   enc.col_byte = col_byte_gbk2;
727   enc.byte_row = byte_row_gbk2;
728   enc.byte_col = byte_col_gbk2;
729   enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
730   enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xa1)";
731   enc.byte_row_expr = "%1$s - 0x81";
732   enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
733 
734   read_table(&enc);
735   output_charset2uni(name,&enc);
736 }
737 
do_gbk1_only_uni2charset(const char * name)738 static void do_gbk1_only_uni2charset (const char* name)
739 {
740   Encoding enc;
741 
742   enc.rows = 126;
743   enc.cols = 190;
744   enc.row_byte = row_byte_gbk1;
745   enc.col_byte = col_byte_gbk1;
746   enc.byte_row = byte_row_gbk1;
747   enc.byte_col = byte_col_gbk1;
748   enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
749   enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xff)";
750   enc.byte_row_expr = "%1$s - 0x81";
751   enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
752 
753   read_table(&enc);
754   invert(&enc); output_uni2charset_sparse(name,&enc);
755 }
756 
757 /* KSC 5601 specifics */
758 
759 /*
760  * Reads the charset2uni table from standard input.
761  */
read_table_ksc5601(Encoding * enc)762 static void read_table_ksc5601 (Encoding* enc)
763 {
764   int row, col, i, i1, i2, c, j;
765 
766   enc->charset2uni = (int**) malloc(enc->rows*sizeof(int*));
767   for (row = 0; row < enc->rows; row++)
768     enc->charset2uni[row] = (int*) malloc(enc->cols*sizeof(int));
769 
770   for (row = 0; row < enc->rows; row++)
771     for (col = 0; col < enc->cols; col++)
772       enc->charset2uni[row][col] = 0xfffd;
773 
774   c = getc(stdin);
775   ungetc(c,stdin);
776   if (c == '#') {
777     /* Read a unicode.org style .TXT file. */
778     for (;;) {
779       c = getc(stdin);
780       if (c == EOF)
781         break;
782       if (c == '\n' || c == ' ' || c == '\t')
783         continue;
784       if (c == '#') {
785         do { c = getc(stdin); } while (!(c == EOF || c == '\n'));
786         continue;
787       }
788       ungetc(c,stdin);
789       if (scanf("0x%x", &j) != 1)
790         exit(1);
791       i1 = j >> 8;
792       i2 = j & 0xff;
793       if (scanf(" 0x%x", &j) != 1)
794         exit(1);
795       /* Take only the range covered by KS C 5601.1987-0 = KS C 5601.1989-0
796          = KS X 1001.1992, ignore the rest. */
797       if (!(i1 >= 128+33 && i1 < 128+127 && i2 >= 128+33 && i2 < 128+127))
798         continue;  /* KSC5601 specific */
799       i1 &= 0x7f;  /* KSC5601 specific */
800       i2 &= 0x7f;  /* KSC5601 specific */
801       row = enc->byte_row(i1);
802       col = enc->byte_col(i2);
803       if (row < 0 || col < 0) {
804         fprintf(stderr, "lost entry for %02x %02x\n", i1, i2);
805         exit(1);
806       }
807       enc->charset2uni[row][col] = j;
808     }
809   } else {
810     /* Read a table of hexadecimal Unicode values. */
811     for (i1 = 33; i1 < 127; i1++)
812       for (i2 = 33; i2 < 127; i2++) {
813         i = scanf("%x", &j);
814         if (i == EOF)
815           goto read_done;
816         if (i != 1)
817           exit(1);
818         if (j < 0 || j == 0xffff)
819           j = 0xfffd;
820         if (j != 0xfffd) {
821           if (enc->byte_row(i1) < 0 || enc->byte_col(i2) < 0) {
822             fprintf(stderr, "lost entry at %02x %02x\n", i1, i2);
823             exit (1);
824           }
825           enc->charset2uni[enc->byte_row(i1)][enc->byte_col(i2)] = j;
826         }
827       }
828    read_done: ;
829   }
830 }
831 
do_ksc5601(const char * name)832 static void do_ksc5601 (const char* name)
833 {
834   Encoding enc;
835 
836   enc.rows = 94;
837   enc.cols = 94;
838   enc.row_byte = row_byte_normal;
839   enc.col_byte = col_byte_normal;
840   enc.byte_row = byte_row_normal;
841   enc.byte_col = byte_col_normal;
842   enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
843   enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
844   enc.byte_row_expr = "%1$s - 0x21";
845   enc.byte_col_expr = "%1$s - 0x21";
846 
847   read_table_ksc5601(&enc);
848   output_charset2uni(name,&enc);
849   invert(&enc); output_uni2charset_sparse(name,&enc);
850 }
851 
852 /* Big5 specifics */
853 
/* Returns the 1st byte value of a row: row 0 is byte 0xa1. */
static int row_byte_big5 (int row)
{
  return row + 0xa1;
}
/*
 * Returns the 2nd byte value of a column. Columns below 0x3f start at
 * byte 0x40; later columns continue at byte 0xa1.
 */
static int col_byte_big5 (int col)
{
  if (col < 0x3f)
    return col + 0x40;
  return col + 0x62;
}
/* Converts a 1st byte value 0xa1..0xfe to a row 0..93, else -1. */
static int byte_row_big5 (int byte)
{
  return (byte >= 0xa1 && byte < 0xff) ? byte - 0xa1 : -1;
}
/*
 * Converts a 2nd byte value to a column: 0x40..0x7e map to 0..0x3e and
 * 0xa1..0xfe map to 0x3f..0x9c. Returns -1 for any other value.
 */
static int byte_col_big5 (int byte)
{
  if (byte >= 0x40 && byte < 0x7f)
    return byte - 0x40;
  if (byte >= 0xa1 && byte < 0xff)
    return byte - 0x62;
  return -1;
}
874 
do_big5(const char * name)875 static void do_big5 (const char* name)
876 {
877   Encoding enc;
878 
879   enc.rows = 94;
880   enc.cols = 157;
881   enc.row_byte = row_byte_big5;
882   enc.col_byte = col_byte_big5;
883   enc.byte_row = byte_row_big5;
884   enc.byte_col = byte_col_big5;
885   enc.check_row_expr = "%1$s >= 0xa1 && %1$s < 0xff";
886   enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0xa1 && %1$s < 0xff)";
887   enc.byte_row_expr = "%1$s - 0xa1";
888   enc.byte_col_expr = "%1$s - (%1$s >= 0xa1 ? 0x62 : 0x40)";
889 
890   read_table(&enc);
891   output_charset2uni(name,&enc);
892   invert(&enc); output_uni2charset_sparse(name,&enc);
893 }
894 
895 /* Johab Hangul specifics */
896 
/* Returns the 1st byte value of a row: row 0 is byte 0x84. */
static int row_byte_johab_hangul (int row)
{
  return row + 0x84;
}
/*
 * Returns the 2nd byte value of a column. Columns below 0x3e start at
 * byte 0x41; later columns continue at byte 0x81.
 */
static int col_byte_johab_hangul (int col)
{
  if (col < 0x3e)
    return col + 0x41;
  return col + 0x43;
}
/* Converts a 1st byte value 0x84..0xd3 to a row 0..79, else -1. */
static int byte_row_johab_hangul (int byte)
{
  return (byte >= 0x84 && byte < 0xd4) ? byte - 0x84 : -1;
}
/*
 * Converts a 2nd byte value to a column: 0x41..0x7e map to 0..0x3d and
 * 0x81..0xfe map to 0x3e..0xbb. Returns -1 for any other value.
 */
static int byte_col_johab_hangul (int byte)
{
  if (byte >= 0x41 && byte < 0x7f)
    return byte - 0x41;
  if (byte >= 0x81 && byte < 0xff)
    return byte - 0x43;
  return -1;
}
917 
do_johab_hangul(const char * name)918 static void do_johab_hangul (const char* name)
919 {
920   Encoding enc;
921 
922   enc.rows = 80;
923   enc.cols = 188;
924   enc.row_byte = row_byte_johab_hangul;
925   enc.col_byte = col_byte_johab_hangul;
926   enc.byte_row = byte_row_johab_hangul;
927   enc.byte_col = byte_col_johab_hangul;
928   enc.check_row_expr = "%1$s >= 0x84 && %1$s < 0xd4";
929   enc.check_col_expr = "(%1$s >= 0x41 && %1$s < 0x7f) || (%1$s >= 0x81 && %1$s < 0xff)";
930   enc.byte_row_expr = "%1$s - 0x84";
931   enc.byte_col_expr = "%1$s - (%1$s >= 0x81 ? 0x43 : 0x41)";
932 
933   read_table(&enc);
934   output_charset2uni(name,&enc);
935   invert(&enc); output_uni2charset_dense(name,&enc);
936 }
937 
938 /* SJIS specifics */
939 
/*
 * Returns the 1st byte value of a row. Rows below 0x1f start at byte 0x81;
 * later rows continue at byte 0xe0.
 */
static int row_byte_sjis (int row)
{
  if (row < 0x1f)
    return row + 0x81;
  return row + 0xc1;
}
/*
 * Returns the 2nd byte value of a column. Columns below 0x3f start at
 * byte 0x40; later columns are shifted by one so that byte 0x7f is skipped.
 */
static int col_byte_sjis (int col)
{
  if (col < 0x3f)
    return col + 0x40;
  return col + 0x41;
}
/*
 * Converts a 1st byte value to a row: 0x81..0x9f map to 0..0x1e and
 * values from 0xe0 upwards map to 0x1f and up (no upper limit is
 * checked). Returns -1 for any other value.
 */
static int byte_row_sjis (int byte)
{
  if (byte >= 0xe0)
    return byte - 0xc1;
  if (byte >= 0x81 && byte < 0xa0)
    return byte - 0x81;
  return -1;
}
/*
 * Converts a 2nd byte value to a column: 0x40..0x7e map to 0..0x3e and
 * 0x80..0xfc map to 0x3f..0xbb. Returns -1 for any other value.
 */
static int byte_col_sjis (int byte)
{
  if (byte >= 0x40 && byte < 0x7f)
    return byte - 0x40;
  if (byte >= 0x80 && byte < 0xfd)
    return byte - 0x41;
  return -1;
}
962 
do_sjis(const char * name)963 static void do_sjis (const char* name)
964 {
965   Encoding enc;
966 
967   enc.rows = 94;
968   enc.cols = 188;
969   enc.row_byte = row_byte_sjis;
970   enc.col_byte = col_byte_sjis;
971   enc.byte_row = byte_row_sjis;
972   enc.byte_col = byte_col_sjis;
973   enc.check_row_expr = "(%1$s >= 0x81 && %1$s < 0xa0) || (%1$s >= 0xe0)";
974   enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xfd)";
975   enc.byte_row_expr = "%1$s - (%1$s >= 0xe0 ? 0xc1 : 0x81)";
976   enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
977 
978   read_table(&enc);
979   output_charset2uni(name,&enc);
980   invert(&enc); output_uni2charset_sparse(name,&enc);
981 }
982 
983 /* Main program */
984 
main(int argc,char * argv[])985 int main (int argc, char *argv[])
986 {
987   const char* charsetname;
988   const char* name;
989 
990   if (argc != 3)
991     exit(1);
992   charsetname = argv[1];
993   name = argv[2];
994 
995   output_title(charsetname);
996 
997   if (!strcmp(name,"gb2312") || !strcmp(name,"gb12345ext")
998       || !strcmp(name,"jisx0208") || !strcmp(name,"jisx0212"))
999     do_normal(name);
1000   else if (!strcmp(name,"cns11643_1") || !strcmp(name,"cns11643_2")
1001            || !strcmp(name,"cns11643_3"))
1002     do_normal_only_charset2uni(name);
1003   else if (!strcmp(name,"cns11643_inv"))
1004     do_cns11643_only_uni2charset(name);
1005   else if (!strcmp(name,"gbkext1"))
1006     do_gbk1_only_charset2uni(name);
1007   else if (!strcmp(name,"gbkext2"))
1008     do_gbk2_only_charset2uni(name);
1009   else if (!strcmp(name,"gbkext_inv"))
1010     do_gbk1_only_uni2charset(name);
1011   else if (!strcmp(name,"cp936ext"))
1012     do_gbk1(name);
1013   else if (!strcmp(name,"ksc5601"))
1014     do_ksc5601(name);
1015   else if (!strcmp(name,"big5") || !strcmp(name,"cp950ext"))
1016     do_big5(name);
1017   else if (!strcmp(name,"johab_hangul"))
1018     do_johab_hangul(name);
1019   else if (!strcmp(name,"cp932ext"))
1020     do_sjis(name);
1021   else
1022     exit(1);
1023 
1024   return 0;
1025 }
1026