1 
2 /*
3  * Generates a CJK character set table from a .TXT table as found on
4  * ftp.unicode.org or in the X nls directory.
5  * Examples:
6  *
7  *   ./cjk_tab_to_h GB2312.1980-0 gb2312 > gb2312.h < gb2312
8  *   ./cjk_tab_to_h JISX0208.1983-0 jisx0208 > jisx0208.h < jis0208
9  *   ./cjk_tab_to_h KSC5601.1987-0 ksc5601 > ksc5601.h < ksc5601
10  *
11  *   ./cjk_tab_to_h GB2312.1980-0 gb2312 > gb2312.h < GB2312.TXT
12  *   ./cjk_tab_to_h JISX0208.1983-0 jisx0208 > jisx0208.h < JIS0208.TXT
13  *   ./cjk_tab_to_h JISX0212.1990-0 jisx0212 > jisx0212.h < JIS0212.TXT
14  *   ./cjk_tab_to_h KSC5601.1987-0 ksc5601 > ksc5601.h < KSC5601.TXT
15  *   ./cjk_tab_to_h KSX1001.1992-0 ksc5601 > ksc5601.h < KSX1001.TXT
16  *
17  *   ./cjk_tab_to_h BIG5 big5 > big5.h < BIG5.TXT
18  *
19  *   ./cjk_tab_to_h JOHAB johab > johab.h < JOHAB.TXT
20  *
21  *   ./cjk_tab_to_h BIG5HKSCS-0 big5hkscs >big5hkscs.h < BIG5HKSCS.TXT
22  */
23 
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <stdbool.h>
27 #include <string.h>
28 
/* A half-open range [start, end) of consecutive table indices. */
typedef struct {
  int start, end;
} Block;
33 
34 typedef struct {
35   int rows;    /* number of possible values for the 1st byte */
36   int cols;    /* number of possible values for the 2nd byte */
37   int (*row_byte) (int row); /* returns the 1st byte value for a given row */
38   int (*col_byte) (int col); /* returns the 2nd byte value for a given col */
39   int (*byte_row) (int byte); /* converts a 1st byte value to a row, else -1 */
40   int (*byte_col) (int byte); /* converts a 2nd byte value to a col, else -1 */
41   const char* check_row_expr; /* format string for 1st byte value checking */
42   const char* check_col_expr; /* format string for 2nd byte value checking */
43   const char* byte_row_expr; /* format string for 1st byte value to row */
44   const char* byte_col_expr; /* format string for 2nd byte value to col */
45   int** charset2uni; /* charset2uni[0..rows-1][0..cols-1] is valid */
46   /* You'll understand the terms "row" and "col" when you buy Ken Lunde's book.
47      Once a row is fixed, choosing a "col" is the same as choosing a "cell". */
48   int* charsetpage; /* charsetpage[0..rows]: how large is a page for a row */
49   int ncharsetblocks;
50   Block* charsetblocks; /* blocks[0..nblocks-1] */
51   int* uni2charset; /* uni2charset[0x0000..0xffff] */
52 } Encoding;
53 
/*
 * Prints the title of the generated file: a blank line, a C comment
 * holding the charset name, and another blank line.
 */
static void output_title (const char *charsetname)
{
  printf("\n/*\n * %s\n */\n\n", charsetname);
}
65 
66 /*
67  * Reads the charset2uni table from standard input.
68  */
read_table(Encoding * enc)69 static void read_table (Encoding* enc)
70 {
71   int row, col, i, i1, i2, c, j;
72 
73   enc->charset2uni = malloc(enc->rows*sizeof(int*));
74   for (row = 0; row < enc->rows; row++)
75     enc->charset2uni[row] = malloc(enc->cols*sizeof(int));
76 
77   for (row = 0; row < enc->rows; row++)
78     for (col = 0; col < enc->cols; col++)
79       enc->charset2uni[row][col] = 0xfffd;
80 
81   c = getc(stdin);
82   ungetc(c,stdin);
83   if (c == '#') {
84     /* Read a unicode.org style .TXT file. */
85     for (;;) {
86       c = getc(stdin);
87       if (c == EOF)
88         break;
89       if (c == '\n' || c == ' ' || c == '\t')
90         continue;
91       if (c == '#') {
92         do { c = getc(stdin); } while (!(c == EOF || c == '\n'));
93         continue;
94       }
95       ungetc(c,stdin);
96       if (scanf("0x%x", &j) != 1)
97         exit(1);
98       i1 = j >> 8;
99       i2 = j & 0xff;
100       row = enc->byte_row(i1);
101       col = enc->byte_col(i2);
102       if (row < 0 || col < 0) {
103         fprintf(stderr, "lost entry for %02x %02x\n", i1, i2);
104         exit(1);
105       }
106       if (scanf(" 0x%x", &enc->charset2uni[row][col]) != 1)
107         exit(1);
108     }
109   } else {
110     /* Read a table of hexadecimal Unicode values. */
111     for (i1 = 32; i1 < 132; i1++)
112       for (i2 = 32; i2 < 132; i2++) {
113         i = scanf("%x", &j);
114         if (i == EOF)
115           goto read_done;
116         if (i != 1)
117           exit(1);
118         if (j < 0 || j == 0xffff)
119           j = 0xfffd;
120         if (j != 0xfffd) {
121           if (enc->byte_row(i1) < 0 || enc->byte_col(i2) < 0) {
122             fprintf(stderr, "lost entry at %02x %02x\n", i1, i2);
123             exit (1);
124           }
125           enc->charset2uni[enc->byte_row(i1)][enc->byte_col(i2)] = j;
126         }
127       }
128    read_done: ;
129   }
130 }
131 
132 /*
133  * Computes the charsetpage[0..rows] array.
134  */
find_charset2uni_pages(Encoding * enc)135 static void find_charset2uni_pages (Encoding* enc)
136 {
137   int row, col;
138 
139   enc->charsetpage = malloc((enc->rows+1)*sizeof(int));
140 
141   for (row = 0; row <= enc->rows; row++)
142     enc->charsetpage[row] = 0;
143 
144   for (row = 0; row < enc->rows; row++) {
145     int used = 0;
146     for (col = 0; col < enc->cols; col++)
147       if (enc->charset2uni[row][col] != 0xfffd)
148         used = col+1;
149     enc->charsetpage[row] = used;
150   }
151 }
152 
153 /*
154  * Fills in nblocks and blocks.
155  */
find_charset2uni_blocks(Encoding * enc)156 static void find_charset2uni_blocks (Encoding* enc)
157 {
158   int n, row, lastrow;
159 
160   enc->charsetblocks = malloc(enc->rows*sizeof(Block));
161 
162   n = 0;
163   for (row = 0; row < enc->rows; row++)
164     if (enc->charsetpage[row] > 0 && (row == 0 || enc->charsetpage[row-1] == 0)) {
165       for (lastrow = row; enc->charsetpage[lastrow+1] > 0; lastrow++);
166       enc->charsetblocks[n].start = row * enc->cols;
167       enc->charsetblocks[n].end = lastrow * enc->cols + enc->charsetpage[lastrow];
168       n++;
169     }
170   enc->ncharsetblocks = n;
171 }
172 
173 /*
174  * Outputs the charset to unicode table and function.
175  */
output_charset2uni(const char * name,Encoding * enc)176 static void output_charset2uni (const char* name, Encoding* enc)
177 {
178   int row, col, lastrow, col_max, i, i1_min, i1_max;
179 
180   find_charset2uni_pages(enc);
181 
182   find_charset2uni_blocks(enc);
183 
184   for (row = 0; row < enc->rows; row++)
185     if (enc->charsetpage[row] > 0) {
186       if (row == 0 || enc->charsetpage[row-1] == 0) {
187         /* Start a new block. */
188         for (lastrow = row; enc->charsetpage[lastrow+1] > 0; lastrow++);
189         printf("static const unsigned short %s_2uni_page%02x[%d] = {\n",
190                name, enc->row_byte(row),
191                (lastrow-row) * enc->cols + enc->charsetpage[lastrow]);
192       }
193       printf("  /""* 0x%02x *""/\n ", enc->row_byte(row));
194       col_max = (enc->charsetpage[row+1] > 0 ? enc->cols : enc->charsetpage[row]);
195       for (col = 0; col < col_max; col++) {
196         printf(" 0x%04x,", enc->charset2uni[row][col]);
197         if ((col % 8) == 7 && (col+1 < col_max)) printf("\n ");
198       }
199       printf("\n");
200       if (enc->charsetpage[row+1] == 0) {
201         /* End a block. */
202         printf("};\n");
203       }
204     }
205   printf("\n");
206 
207   printf("static int\n");
208   printf("%s_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)\n", name);
209   printf("{\n");
210   printf("  unsigned char c1 = s[0];\n");
211   printf("  if (");
212   for (i = 0; i < enc->ncharsetblocks; i++) {
213     i1_min = enc->row_byte(enc->charsetblocks[i].start / enc->cols);
214     i1_max = enc->row_byte((enc->charsetblocks[i].end-1) / enc->cols);
215     if (i > 0)
216       printf(" || ");
217     if (i1_min == i1_max)
218       printf("(c1 == 0x%02x)", i1_min);
219     else
220       printf("(c1 >= 0x%02x && c1 <= 0x%02x)", i1_min, i1_max);
221   }
222   printf(") {\n");
223   printf("    if (n >= 2) {\n");
224   printf("      unsigned char c2 = s[1];\n");
225   printf("      if (");
226   printf(enc->check_col_expr, "c2");
227   printf(") {\n");
228   printf("        unsigned int i = %d * (", enc->cols);
229   printf(enc->byte_row_expr, "c1");
230   printf(") + (");
231   printf(enc->byte_col_expr, "c2");
232   printf(");\n");
233   printf("        unsigned short wc = 0xfffd;\n");
234   for (i = 0; i < enc->ncharsetblocks; i++) {
235     printf("        ");
236     if (i > 0)
237       printf("} else ");
238     if (i < enc->ncharsetblocks-1)
239       printf("if (i < %d) ", enc->charsetblocks[i+1].start);
240     printf("{\n");
241     printf("          if (i < %d)\n", enc->charsetblocks[i].end);
242     printf("            wc = %s_2uni_page%02x[i", name, enc->row_byte(enc->charsetblocks[i].start / enc->cols));
243     if (enc->charsetblocks[i].start > 0)
244       printf("-%d", enc->charsetblocks[i].start);
245     printf("];\n");
246   }
247   printf("        }\n");
248   printf("        if (wc != 0xfffd) {\n");
249   printf("          *pwc = (ucs4_t) wc;\n");
250   printf("          return 2;\n");
251   printf("        }\n");
252   printf("      }\n");
253   printf("      return RET_ILSEQ;\n");
254   printf("    }\n");
255   printf("    return RET_TOOFEW(0);\n");
256   printf("  }\n");
257   printf("  return RET_ILSEQ;\n");
258   printf("}\n");
259   printf("\n");
260 }
261 
262 /*
263  * Computes the uni2charset[0x0000..0xffff] array.
264  */
invert(Encoding * enc)265 static void invert (Encoding* enc)
266 {
267   int row, col, j;
268 
269   enc->uni2charset = malloc(0x10000*sizeof(int));
270 
271   for (j = 0; j < 0x10000; j++)
272     enc->uni2charset[j] = 0;
273 
274   for (row = 0; row < enc->rows; row++)
275     for (col = 0; col < enc->cols; col++) {
276       j = enc->charset2uni[row][col];
277       if (j != 0xfffd)
278         enc->uni2charset[j] = 0x100 * enc->row_byte(row) + enc->col_byte(col);
279     }
280 }
281 
282 /*
283  * Outputs the unicode to charset table and function, using a linear array.
284  * (Suitable if the table is dense.)
285  */
output_uni2charset_dense(const char * name,Encoding * enc)286 static void output_uni2charset_dense (const char* name, Encoding* enc)
287 {
288   /* Like in 8bit_tab_to_h.c */
289   bool pages[0x100];
290   int line[0x2000];
291   int tableno;
292   struct { int minline; int maxline; int usecount; } tables[0x2000];
293   bool first;
294   int row, col, j, p, j1, j2, t;
295 
296   for (p = 0; p < 0x100; p++)
297     pages[p] = false;
298   for (row = 0; row < enc->rows; row++)
299     for (col = 0; col < enc->cols; col++) {
300       j = enc->charset2uni[row][col];
301       if (j != 0xfffd)
302         pages[j>>8] = true;
303     }
304   for (j1 = 0; j1 < 0x2000; j1++) {
305     bool all_invalid = true;
306     for (j2 = 0; j2 < 8; j2++) {
307       j = 8*j1+j2;
308       if (enc->uni2charset[j] != 0)
309         all_invalid = false;
310     }
311     if (all_invalid)
312       line[j1] = -1;
313     else
314       line[j1] = 0;
315   }
316   tableno = 0;
317   for (j1 = 0; j1 < 0x2000; j1++) {
318     if (line[j1] >= 0) {
319       if (tableno > 0
320           && ((j1 > 0 && line[j1-1] == tableno-1)
321               || ((tables[tableno-1].maxline >> 5) == (j1 >> 5)
322                   && j1 - tables[tableno-1].maxline <= 8))) {
323         line[j1] = tableno-1;
324         tables[tableno-1].maxline = j1;
325       } else {
326         tableno++;
327         line[j1] = tableno-1;
328         tables[tableno-1].minline = tables[tableno-1].maxline = j1;
329       }
330     }
331   }
332   for (t = 0; t < tableno; t++) {
333     tables[t].usecount = 0;
334     j1 = 8*tables[t].minline;
335     j2 = 8*(tables[t].maxline+1);
336     for (j = j1; j < j2; j++)
337       if (enc->uni2charset[j] != 0)
338         tables[t].usecount++;
339   }
340   {
341     p = -1;
342     for (t = 0; t < tableno; t++)
343       if (tables[t].usecount > 1) {
344         p = tables[t].minline >> 5;
345         printf("static const unsigned short %s_page%02x[%d] = {\n", name, p, 8*(tables[t].maxline-tables[t].minline+1));
346         for (j1 = tables[t].minline; j1 <= tables[t].maxline; j1++) {
347           if ((j1 % 0x20) == 0 && j1 > tables[t].minline)
348             printf("  /* 0x%04x */\n", 8*j1);
349           printf(" ");
350           for (j2 = 0; j2 < 8; j2++) {
351             j = 8*j1+j2;
352             printf(" 0x%04x,", enc->uni2charset[j]);
353           }
354           printf(" /*0x%02x-0x%02x*/\n", 8*(j1 % 0x20), 8*(j1 % 0x20)+7);
355         }
356         printf("};\n");
357       }
358     if (p >= 0)
359       printf("\n");
360   }
361   printf("static int\n%s_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)\n", name);
362   printf("{\n");
363   printf("  if (n >= 2) {\n");
364   printf("    unsigned short c = 0;\n");
365   first = true;
366   for (j1 = 0; j1 < 0x2000;) {
367     t = line[j1];
368     for (j2 = j1; j2 < 0x2000 && line[j2] == t; j2++);
369     if (t >= 0) {
370       if (j1 != tables[t].minline) abort();
371       if (j2 > tables[t].maxline+1) abort();
372       j2 = tables[t].maxline+1;
373       if (first)
374         printf("    ");
375       else
376         printf("    else ");
377       first = false;
378       if (tables[t].usecount == 0) abort();
379       if (tables[t].usecount == 1) {
380         if (j2 != j1+1) abort();
381         for (j = 8*j1; j < 8*j2; j++)
382           if (enc->uni2charset[j] != 0) {
383             printf("if (wc == 0x%04x)\n      c = 0x%02x;\n", j, enc->uni2charset[j]);
384             break;
385           }
386       } else {
387         if (j1 == 0) {
388           printf("if (wc < 0x%04x)", 8*j2);
389         } else {
390           printf("if (wc >= 0x%04x && wc < 0x%04x)", 8*j1, 8*j2);
391         }
392         printf("\n      c = %s_page%02x[wc", name, j1 >> 5);
393         if (tables[t].minline > 0)
394           printf("-0x%04x", 8*j1);
395         printf("];\n");
396       }
397     }
398     j1 = j2;
399   }
400   printf("    if (c != 0) {\n");
401   printf("      r[0] = (c >> 8); r[1] = (c & 0xff);\n");
402   printf("      return 2;\n");
403   printf("    }\n");
404   printf("    return RET_ILSEQ;\n");
405   printf("  }\n");
406   printf("  return RET_TOOSMALL;\n");
407   printf("}\n");
408 }
409 
410 /*
411  * Outputs the unicode to charset table and function, using a packed array.
412  * (Suitable if the table is sparse.)
413  */
output_uni2charset_sparse(const char * name,Encoding * enc)414 static void output_uni2charset_sparse (const char* name, Encoding* enc)
415 {
416   bool pages[0x100];
417   Block pageblocks[0x100]; int npageblocks;
418   int indx2charset[0x10000];
419   int summary_indx[0x1000];
420   int summary_used[0x1000];
421   int i, row, col, j, p, j1, j2, indx;
422 
423   /* Fill pages[0x100]. */
424   for (p = 0; p < 0x100; p++)
425     pages[p] = false;
426   for (row = 0; row < enc->rows; row++)
427     for (col = 0; col < enc->cols; col++) {
428       j = enc->charset2uni[row][col];
429       if (j != 0xfffd)
430         pages[j>>8] = true;
431     }
432 
433 #if 0
434   for (p = 0; p < 0x100; p++)
435     if (pages[p]) {
436       printf("static const unsigned short %s_page%02x[256] = {\n", name, p);
437       for (j1 = 0; j1 < 32; j1++) {
438         printf("  ");
439         for (j2 = 0; j2 < 8; j2++)
440           printf("0x%04x, ", enc->uni2charset[256*p+8*j1+j2]);
441         printf("/""*0x%02x-0x%02x*""/\n", 8*j1, 8*j1+7);
442       }
443       printf("};\n");
444     }
445   printf("\n");
446 #endif
447 
448   /* Fill summary_indx[] and summary_used[]. */
449   indx = 0;
450   for (j1 = 0; j1 < 0x1000; j1++) {
451     summary_indx[j1] = indx;
452     summary_used[j1] = 0;
453     for (j2 = 0; j2 < 16; j2++) {
454       j = 16*j1+j2;
455       if (enc->uni2charset[j] != 0) {
456         indx2charset[indx++] = enc->uni2charset[j];
457         summary_used[j1] |= (1 << j2);
458       }
459     }
460   }
461 
462   /* Fill npageblocks and pageblocks[]. */
463   npageblocks = 0;
464   for (p = 0; p < 0x100; ) {
465     if (pages[p] && (p == 0 || !pages[p-1])) {
466       pageblocks[npageblocks].start = 16*p;
467       do p++; while (p < 0x100 && pages[p]);
468       j1 = 16*p;
469       while (summary_used[j1-1] == 0) j1--;
470       pageblocks[npageblocks].end = j1;
471       npageblocks++;
472     } else
473       p++;
474   }
475 
476   printf("static const unsigned short %s_2charset[%d] = {\n", name, indx);
477   for (i = 0; i < indx; ) {
478     if ((i % 8) == 0) printf(" ");
479     printf(" 0x%04x,", indx2charset[i]);
480     i++;
481     if ((i % 8) == 0 || i == indx) printf("\n");
482   }
483   printf("};\n");
484   printf("\n");
485   for (i = 0; i < npageblocks; i++) {
486     printf("static const Summary16 %s_uni2indx_page%02x[%d] = {\n", name,
487            pageblocks[i].start/16, pageblocks[i].end-pageblocks[i].start);
488     for (j1 = pageblocks[i].start; j1 < pageblocks[i].end; ) {
489       if (((16*j1) % 0x100) == 0) printf("  /""* 0x%04x *""/\n", 16*j1);
490       if ((j1 % 4) == 0) printf(" ");
491       printf(" { %4d, 0x%04x },", summary_indx[j1], summary_used[j1]);
492       j1++;
493       if ((j1 % 4) == 0 || j1 == pageblocks[i].end) printf("\n");
494     }
495     printf("};\n");
496   }
497   printf("\n");
498 
499   printf("static int\n");
500   printf("%s_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)\n", name);
501   printf("{\n");
502   printf("  if (n >= 2) {\n");
503   printf("    const Summary16 *summary = NULL;\n");
504   for (i = 0; i < npageblocks; i++) {
505     printf("    ");
506     if (i > 0)
507       printf("else ");
508     printf("if (wc >= 0x%04x && wc < 0x%04x)\n",
509            16*pageblocks[i].start, 16*pageblocks[i].end);
510     printf("      summary = &%s_uni2indx_page%02x[(wc>>4)", name,
511            pageblocks[i].start/16);
512     if (pageblocks[i].start > 0)
513       printf("-0x%03x", pageblocks[i].start);
514     printf("];\n");
515   }
516   printf("    if (summary) {\n");
517   printf("      unsigned short used = summary->used;\n");
518   printf("      unsigned int i = wc & 0x0f;\n");
519   printf("      if (used & ((unsigned short) 1 << i)) {\n");
520   printf("        unsigned short c;\n");
521   printf("        /* Keep in `used' only the bits 0..i-1. */\n");
522   printf("        used &= ((unsigned short) 1 << i) - 1;\n");
523   printf("        /* Add `summary->indx' and the number of bits set in `used'. */\n");
524   printf("        used = (used & 0x5555) + ((used & 0xaaaa) >> 1);\n");
525   printf("        used = (used & 0x3333) + ((used & 0xcccc) >> 2);\n");
526   printf("        used = (used & 0x0f0f) + ((used & 0xf0f0) >> 4);\n");
527   printf("        used = (used & 0x00ff) + (used >> 8);\n");
528   printf("        c = %s_2charset[summary->indx + used];\n", name);
529   printf("        r[0] = (c >> 8); r[1] = (c & 0xff);\n");
530   printf("        return 2;\n");
531   printf("      }\n");
532   printf("    }\n");
533   printf("    return RET_ILSEQ;\n");
534   printf("  }\n");
535   printf("  return RET_TOOSMALL;\n");
536   printf("}\n");
537 }
538 
539 /* ISO-2022/EUC specifics */
540 
/* Returns the 1st byte value of a row; row 0 is byte 0x21. */
static int row_byte_normal (int row)
{
  return row + 0x21;
}
/* Returns the 2nd byte value of a column; column 0 is byte 0x21. */
static int col_byte_normal (int col)
{
  return col + 0x21;
}
/* Converts a 1st byte value to a row.  Valid bytes are 0x21..0x7e and map
   to rows 0..93; any other value yields -1, so that callers never receive
   an index beyond a 94-row table. */
static int byte_row_normal (int byte)
{
  if (byte >= 0x21 && byte < 0x7f)
    return byte-0x21;
  else
    return -1;
}
/* Converts a 2nd byte value to a column.  Valid bytes are 0x21..0x7e and
   map to columns 0..93; any other value yields -1, so that callers never
   receive an index beyond a 94-column table. */
static int byte_col_normal (int byte)
{
  if (byte >= 0x21 && byte < 0x7f)
    return byte-0x21;
  else
    return -1;
}
545 
do_normal(const char * name)546 static void do_normal (const char* name)
547 {
548   Encoding enc;
549 
550   enc.rows = 94;
551   enc.cols = 94;
552   enc.row_byte = row_byte_normal;
553   enc.col_byte = col_byte_normal;
554   enc.byte_row = byte_row_normal;
555   enc.byte_col = byte_col_normal;
556   enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
557   enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
558   enc.byte_row_expr = "%1$s - 0x21";
559   enc.byte_col_expr = "%1$s - 0x21";
560 
561   read_table(&enc);
562   output_charset2uni(name,&enc);
563   invert(&enc); output_uni2charset_sparse(name,&enc);
564 }
565 
566 /* Note: On first sight, the jisx0212_2charset[] table seems to be in order,
567    starting from the charset=0x3021/uni=0x4e02 pair. But it's only mostly in
568    order. There are 75 out-of-order values, scattered all throughout the table.
569  */
570 
do_normal_only_charset2uni(const char * name)571 static void do_normal_only_charset2uni (const char* name)
572 {
573   Encoding enc;
574 
575   enc.rows = 94;
576   enc.cols = 94;
577   enc.row_byte = row_byte_normal;
578   enc.col_byte = col_byte_normal;
579   enc.byte_row = byte_row_normal;
580   enc.byte_col = byte_col_normal;
581   enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
582   enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
583   enc.byte_row_expr = "%1$s - 0x21";
584   enc.byte_col_expr = "%1$s - 0x21";
585 
586   read_table(&enc);
587   output_charset2uni(name,&enc);
588 }
589 
590 /* CNS 11643 specifics - trick to put two tables into one */
591 
/* Returns the "1st byte" of a row in the combined table: every group of
   94 rows is one plane, whose number (counted from 0) is placed above the
   low 8 bits that hold the real byte value. */
static int row_byte_cns11643 (int row)
{
  int plane = row / 94;
  int row_in_plane = row % 94;

  return plane * 0x100 + row_in_plane + 0x21;
}
/* Converts a "1st byte" value of the input to a row of the combined table.
   The value carries the plane number (1, 2 or 3) above the low 8 bits and
   the real byte (0x21..0x7e) in the low 8 bits; plane p occupies rows
   (p-1)*94 .. (p-1)*94+93.  Returns -1 for any other value, in particular
   when the low byte is out of range and would otherwise land in a row
   that belongs to a different plane. */
static int byte_row_cns11643 (int byte) {
  int plane, low;

  if (byte < 0x100 || byte >= 0x400)
    return -1;
  plane = byte >> 8;
  low = byte & 0xff;
  if (low < 0x21 || low >= 0x7f)
    return -1;
  return (plane-1)*94 + (low-0x21);
}
601 
do_cns11643_only_uni2charset(const char * name)602 static void do_cns11643_only_uni2charset (const char* name)
603 {
604   Encoding enc;
605   int j, x;
606 
607   enc.rows = 3*94;
608   enc.cols = 94;
609   enc.row_byte = row_byte_cns11643;
610   enc.col_byte = col_byte_normal;
611   enc.byte_row = byte_row_cns11643;
612   enc.byte_col = byte_col_normal;
613   enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
614   enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
615   enc.byte_row_expr = "%1$s - 0x21";
616   enc.byte_col_expr = "%1$s - 0x21";
617 
618   read_table(&enc);
619   invert(&enc);
620   /* Move the 2 plane bits into the unused bits 15 and 7. */
621   for (j = 0; j < 0x10000; j++) {
622     x = enc.uni2charset[j];
623     if (x != 0) {
624       if (x & 0x8080) abort();
625       switch (x >> 16) {
626         case 0: /* plane 1 */ x = (x & 0xffff) | 0x0000; break;
627         case 1: /* plane 2 */ x = (x & 0xffff) | 0x0080; break;
628         case 2: /* plane 3 */ x = (x & 0xffff) | 0x8000; break;
629         default: abort();
630       }
631       enc.uni2charset[j] = x;
632     }
633   }
634   output_uni2charset_sparse(name,&enc);
635 }
636 
637 /* GBK specifics */
638 
/* Returns the 1st byte value of a row; row 0 is byte 0x81. */
static int row_byte_gbk1 (int row)
{
  return row + 0x81;
}
/* Returns the 2nd byte value of a column.  Columns below 0x3f start at
   byte 0x40; the remaining ones are shifted by one more, which leaves
   byte 0x7f unused. */
static int col_byte_gbk1 (int col)
{
  if (col < 0x3f)
    return col + 0x40;
  return col + 0x41;
}
/* Converts a 1st byte value to a row: 0x81..0xfe map to 0..125, anything
   else to -1. */
static int byte_row_gbk1 (int byte)
{
  if (byte < 0x81 || byte >= 0xff)
    return -1;
  return byte - 0x81;
}
/* Converts a 2nd byte value to a column: 0x40..0x7e map to 0..0x3e,
   0x80..0xfe map to 0x3f..0xbd, anything else to -1. */
static int byte_col_gbk1 (int byte)
{
  if (byte >= 0x40 && byte < 0x7f)
    return byte - 0x40;
  if (byte >= 0x80 && byte < 0xff)
    return byte - 0x41;
  return -1;
}
659 
do_gbk1(const char * name)660 static void do_gbk1 (const char* name)
661 {
662   Encoding enc;
663 
664   enc.rows = 126;
665   enc.cols = 190;
666   enc.row_byte = row_byte_gbk1;
667   enc.col_byte = col_byte_gbk1;
668   enc.byte_row = byte_row_gbk1;
669   enc.byte_col = byte_col_gbk1;
670   enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
671   enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xff)";
672   enc.byte_row_expr = "%1$s - 0x81";
673   enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
674 
675   read_table(&enc);
676   output_charset2uni(name,&enc);
677   invert(&enc); output_uni2charset_dense(name,&enc);
678 }
679 
do_gbk1_only_charset2uni(const char * name)680 static void do_gbk1_only_charset2uni (const char* name)
681 {
682   Encoding enc;
683 
684   enc.rows = 126;
685   enc.cols = 190;
686   enc.row_byte = row_byte_gbk1;
687   enc.col_byte = col_byte_gbk1;
688   enc.byte_row = byte_row_gbk1;
689   enc.byte_col = byte_col_gbk1;
690   enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
691   enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xff)";
692   enc.byte_row_expr = "%1$s - 0x81";
693   enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
694 
695   read_table(&enc);
696   output_charset2uni(name,&enc);
697 }
698 
/* Returns the 1st byte value of a row; row 0 is byte 0x81. */
static int row_byte_gbk2 (int row)
{
  return row + 0x81;
}
/* Returns the 2nd byte value of a column.  Columns below 0x3f start at
   byte 0x40; the remaining ones are shifted by one more, which leaves
   byte 0x7f unused. */
static int col_byte_gbk2 (int col)
{
  if (col < 0x3f)
    return col + 0x40;
  return col + 0x41;
}
/* Converts a 1st byte value to a row: 0x81..0xfe map to 0..125, anything
   else to -1. */
static int byte_row_gbk2 (int byte)
{
  if (byte < 0x81 || byte >= 0xff)
    return -1;
  return byte - 0x81;
}
/* Converts a 2nd byte value to a column: 0x40..0x7e map to 0..0x3e,
   0x80..0xa0 map to 0x3f..0x5f, anything else to -1. */
static int byte_col_gbk2 (int byte)
{
  if (byte >= 0x40 && byte < 0x7f)
    return byte - 0x40;
  if (byte >= 0x80 && byte < 0xa1)
    return byte - 0x41;
  return -1;
}
719 
do_gbk2_only_charset2uni(const char * name)720 static void do_gbk2_only_charset2uni (const char* name)
721 {
722   Encoding enc;
723 
724   enc.rows = 126;
725   enc.cols = 96;
726   enc.row_byte = row_byte_gbk2;
727   enc.col_byte = col_byte_gbk2;
728   enc.byte_row = byte_row_gbk2;
729   enc.byte_col = byte_col_gbk2;
730   enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
731   enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xa1)";
732   enc.byte_row_expr = "%1$s - 0x81";
733   enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
734 
735   read_table(&enc);
736   output_charset2uni(name,&enc);
737 }
738 
do_gbk1_only_uni2charset(const char * name)739 static void do_gbk1_only_uni2charset (const char* name)
740 {
741   Encoding enc;
742 
743   enc.rows = 126;
744   enc.cols = 190;
745   enc.row_byte = row_byte_gbk1;
746   enc.col_byte = col_byte_gbk1;
747   enc.byte_row = byte_row_gbk1;
748   enc.byte_col = byte_col_gbk1;
749   enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
750   enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xff)";
751   enc.byte_row_expr = "%1$s - 0x81";
752   enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
753 
754   read_table(&enc);
755   invert(&enc); output_uni2charset_sparse(name,&enc);
756 }
757 
758 /* KSC 5601 specifics */
759 
760 /*
761  * Reads the charset2uni table from standard input.
762  */
read_table_ksc5601(Encoding * enc)763 static void read_table_ksc5601 (Encoding* enc)
764 {
765   int row, col, i, i1, i2, c, j;
766 
767   enc->charset2uni = malloc(enc->rows*sizeof(int*));
768   for (row = 0; row < enc->rows; row++)
769     enc->charset2uni[row] = malloc(enc->cols*sizeof(int));
770 
771   for (row = 0; row < enc->rows; row++)
772     for (col = 0; col < enc->cols; col++)
773       enc->charset2uni[row][col] = 0xfffd;
774 
775   c = getc(stdin);
776   ungetc(c,stdin);
777   if (c == '#') {
778     /* Read a unicode.org style .TXT file. */
779     for (;;) {
780       c = getc(stdin);
781       if (c == EOF)
782         break;
783       if (c == '\n' || c == ' ' || c == '\t')
784         continue;
785       if (c == '#') {
786         do { c = getc(stdin); } while (!(c == EOF || c == '\n'));
787         continue;
788       }
789       ungetc(c,stdin);
790       if (scanf("0x%x", &j) != 1)
791         exit(1);
792       i1 = j >> 8;
793       i2 = j & 0xff;
794       if (scanf(" 0x%x", &j) != 1)
795         exit(1);
796       /* Take only the range covered by KS C 5601.1987-0 = KS C 5601.1989-0
797          = KS X 1001.1992, ignore the rest. */
798       if (!(i1 >= 128+33 && i1 < 128+127 && i2 >= 128+33 && i2 < 128+127))
799         continue;  /* KSC5601 specific */
800       i1 &= 0x7f;  /* KSC5601 specific */
801       i2 &= 0x7f;  /* KSC5601 specific */
802       row = enc->byte_row(i1);
803       col = enc->byte_col(i2);
804       if (row < 0 || col < 0) {
805         fprintf(stderr, "lost entry for %02x %02x\n", i1, i2);
806         exit(1);
807       }
808       enc->charset2uni[row][col] = j;
809     }
810   } else {
811     /* Read a table of hexadecimal Unicode values. */
812     for (i1 = 33; i1 < 127; i1++)
813       for (i2 = 33; i2 < 127; i2++) {
814         i = scanf("%x", &j);
815         if (i == EOF)
816           goto read_done;
817         if (i != 1)
818           exit(1);
819         if (j < 0 || j == 0xffff)
820           j = 0xfffd;
821         if (j != 0xfffd) {
822           if (enc->byte_row(i1) < 0 || enc->byte_col(i2) < 0) {
823             fprintf(stderr, "lost entry at %02x %02x\n", i1, i2);
824             exit (1);
825           }
826           enc->charset2uni[enc->byte_row(i1)][enc->byte_col(i2)] = j;
827         }
828       }
829    read_done: ;
830   }
831 }
832 
do_ksc5601(const char * name)833 static void do_ksc5601 (const char* name)
834 {
835   Encoding enc;
836 
837   enc.rows = 94;
838   enc.cols = 94;
839   enc.row_byte = row_byte_normal;
840   enc.col_byte = col_byte_normal;
841   enc.byte_row = byte_row_normal;
842   enc.byte_col = byte_col_normal;
843   enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
844   enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
845   enc.byte_row_expr = "%1$s - 0x21";
846   enc.byte_col_expr = "%1$s - 0x21";
847 
848   read_table_ksc5601(&enc);
849   output_charset2uni(name,&enc);
850   invert(&enc); output_uni2charset_sparse(name,&enc);
851 }
852 
853 /* Big5 specifics */
854 
/* Returns the 1st byte value of a row; row 0 is byte 0xa1. */
static int row_byte_big5 (int row)
{
  return row + 0xa1;
}
/* Returns the 2nd byte value of a column.  Columns below 0x3f start at
   byte 0x40; the remaining ones continue at byte 0xa1. */
static int col_byte_big5 (int col)
{
  if (col < 0x3f)
    return col + 0x40;
  return col + 0x62;
}
/* Converts a 1st byte value to a row: 0xa1..0xfe map to 0..93, anything
   else to -1. */
static int byte_row_big5 (int byte)
{
  if (byte < 0xa1 || byte >= 0xff)
    return -1;
  return byte - 0xa1;
}
/* Converts a 2nd byte value to a column: 0x40..0x7e map to 0..0x3e,
   0xa1..0xfe map to 0x3f..0x9c, anything else to -1. */
static int byte_col_big5 (int byte)
{
  if (byte >= 0x40 && byte < 0x7f)
    return byte - 0x40;
  if (byte >= 0xa1 && byte < 0xff)
    return byte - 0x62;
  return -1;
}
875 
do_big5(const char * name)876 static void do_big5 (const char* name)
877 {
878   Encoding enc;
879 
880   enc.rows = 94;
881   enc.cols = 157;
882   enc.row_byte = row_byte_big5;
883   enc.col_byte = col_byte_big5;
884   enc.byte_row = byte_row_big5;
885   enc.byte_col = byte_col_big5;
886   enc.check_row_expr = "%1$s >= 0xa1 && %1$s < 0xff";
887   enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0xa1 && %1$s < 0xff)";
888   enc.byte_row_expr = "%1$s - 0xa1";
889   enc.byte_col_expr = "%1$s - (%1$s >= 0xa1 ? 0x62 : 0x40)";
890 
891   read_table(&enc);
892   output_charset2uni(name,&enc);
893   invert(&enc); output_uni2charset_sparse(name,&enc);
894 }
895 
896 /* Big5-HKSCS specifics */
897 
/* Returns the 1st byte value of a row; row 0 is byte 0x81. */
static int row_byte_big5hkscs (int row)
{
  return row + 0x81;
}
/* Returns the 2nd byte value of a column.  Columns below 0x3f start at
   byte 0x40; the remaining ones continue at byte 0xa1. */
static int col_byte_big5hkscs (int col)
{
  if (col < 0x3f)
    return col + 0x40;
  return col + 0x62;
}
/* Converts a 1st byte value to a row: 0x81..0xfe map to 0..125, anything
   else to -1. */
static int byte_row_big5hkscs (int byte)
{
  if (byte < 0x81 || byte >= 0xff)
    return -1;
  return byte - 0x81;
}
/* Converts a 2nd byte value to a column: 0x40..0x7e map to 0..0x3e,
   0xa1..0xfe map to 0x3f..0x9c, anything else to -1. */
static int byte_col_big5hkscs (int byte)
{
  if (byte >= 0x40 && byte < 0x7f)
    return byte - 0x40;
  if (byte >= 0xa1 && byte < 0xff)
    return byte - 0x62;
  return -1;
}
918 
do_big5hkscs(const char * name)919 static void do_big5hkscs (const char* name)
920 {
921   Encoding enc;
922 
923   enc.rows = 126;
924   enc.cols = 157;
925   enc.row_byte = row_byte_big5hkscs;
926   enc.col_byte = col_byte_big5hkscs;
927   enc.byte_row = byte_row_big5hkscs;
928   enc.byte_col = byte_col_big5hkscs;
929   enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
930   enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0xa1 && %1$s < 0xff)";
931   enc.byte_row_expr = "%1$s - 0x81";
932   enc.byte_col_expr = "%1$s - (%1$s >= 0xa1 ? 0x62 : 0x40)";
933 
934   read_table(&enc);
935   output_charset2uni(name,&enc);
936   invert(&enc); output_uni2charset_sparse(name,&enc);
937 }
938 
939 /* Johab Hangul specifics */
940 
/* Returns the Johab Hangul 1st byte value for a given row; row 0 is 0x84. */
static int row_byte_johab_hangul (int row) {
  const int first_row_byte = 0x84;

  return row + first_row_byte;
}
/* Returns the Johab Hangul 2nd byte value for a given column.
   Columns below 0x3e start at byte 0x41; columns from 0x3e on start
   at byte 0x81 (= 0x3e + 0x43). */
static int col_byte_johab_hangul (int col) {
  if (col < 0x3e)
    return col + 0x41;
  return col + 0x43;
}
/* Converts a Johab Hangul 1st byte value (0x84..0xd3) to a row, else -1. */
static int byte_row_johab_hangul (int byte) {
  if (byte < 0x84 || byte >= 0xd4)
    return -1;
  return byte - 0x84;
}
/* Converts a Johab Hangul 2nd byte value to a column, else -1.
   Bytes 0x41..0x7e give columns 0x00..0x3d; bytes 0x81..0xfe give
   columns 0x3e..0xbb. */
static int byte_col_johab_hangul (int byte) {
  const int in_low_range = (byte >= 0x41 && byte < 0x7f);
  const int in_high_range = (byte >= 0x81 && byte < 0xff);

  return in_low_range ? byte - 0x41
       : in_high_range ? byte - 0x43
       : -1;
}
961 
do_johab_hangul(const char * name)962 static void do_johab_hangul (const char* name)
963 {
964   Encoding enc;
965 
966   enc.rows = 80;
967   enc.cols = 188;
968   enc.row_byte = row_byte_johab_hangul;
969   enc.col_byte = col_byte_johab_hangul;
970   enc.byte_row = byte_row_johab_hangul;
971   enc.byte_col = byte_col_johab_hangul;
972   enc.check_row_expr = "%1$s >= 0x84 && %1$s < 0xd4";
973   enc.check_col_expr = "(%1$s >= 0x41 && %1$s < 0x7f) || (%1$s >= 0x81 && %1$s < 0xff)";
974   enc.byte_row_expr = "%1$s - 0x84";
975   enc.byte_col_expr = "%1$s - (%1$s >= 0x81 ? 0x43 : 0x41)";
976 
977   read_table(&enc);
978   output_charset2uni(name,&enc);
979   invert(&enc); output_uni2charset_dense(name,&enc);
980 }
981 
982 /* SJIS specifics */
983 
/* Returns the SJIS 1st byte value for a given row.
   Rows below 0x1f start at byte 0x81; rows from 0x1f on start at
   byte 0xe0 (= 0x1f + 0xc1). */
static int row_byte_sjis (int row) {
  if (row < 0x1f)
    return row + 0x81;
  return row + 0xc1;
}
/* Returns the SJIS 2nd byte value for a given column.
   Columns below 0x3f start at byte 0x40; columns from 0x3f on start
   at byte 0x80 (= 0x3f + 0x41), which skips byte 0x7f. */
static int col_byte_sjis (int col) {
  if (col < 0x3f)
    return col + 0x40;
  return col + 0x41;
}
/* Converts an SJIS 1st byte value to a row, else -1.
   Bytes 0x81..0x9f give rows 0x00..0x1e; bytes from 0xe0 upwards give
   rows starting at 0x1f (no upper bound is checked here). */
static int byte_row_sjis (int byte) {
  if (byte >= 0xe0)
    return byte - 0xc1;
  if (byte >= 0x81 && byte < 0xa0)
    return byte - 0x81;
  return -1;
}
/* Converts an SJIS 2nd byte value to a column, else -1.
   Bytes 0x40..0x7e give columns 0x00..0x3e; bytes 0x80..0xfc give
   columns 0x3f..0xbb. */
static int byte_col_sjis (int byte) {
  /* Outside the overall span, or exactly the excluded byte 0x7f. */
  if (byte < 0x40 || byte >= 0xfd || byte == 0x7f)
    return -1;
  return byte < 0x7f ? byte - 0x40 : byte - 0x41;
}
1006 
do_sjis(const char * name)1007 static void do_sjis (const char* name)
1008 {
1009   Encoding enc;
1010 
1011   enc.rows = 94;
1012   enc.cols = 188;
1013   enc.row_byte = row_byte_sjis;
1014   enc.col_byte = col_byte_sjis;
1015   enc.byte_row = byte_row_sjis;
1016   enc.byte_col = byte_col_sjis;
1017   enc.check_row_expr = "(%1$s >= 0x81 && %1$s < 0xa0) || (%1$s >= 0xe0)";
1018   enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xfd)";
1019   enc.byte_row_expr = "%1$s - (%1$s >= 0xe0 ? 0xc1 : 0x81)";
1020   enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
1021 
1022   read_table(&enc);
1023   output_charset2uni(name,&enc);
1024   invert(&enc); output_uni2charset_sparse(name,&enc);
1025 }
1026 
1027 /* Main program */
1028 
/* Program entry point.
 *
 * Expects exactly two arguments:
 *   argv[1]  charset name, passed to output_title
 *   argv[2]  table name, which selects the do_* generator to run
 *
 * Returns 0 on success. Exits with status 1, after printing a diagnostic
 * on stderr, when the argument count is wrong or the table name is not
 * one of the names handled below.
 */
int main (int argc, char *argv[])
{
  const char* progname;
  const char* charsetname;
  const char* name;

  /* argv[0] may be missing in unusual environments; fall back to a
     fixed name for diagnostics. */
  progname = (argc > 0 && argv[0] != NULL ? argv[0] : "cjk_tab_to_h");

  if (argc != 3) {
    fprintf(stderr, "Usage: %s charsetname name < table.txt > table.h\n",
            progname);
    exit(1);
  }
  charsetname = argv[1];
  name = argv[2];

  output_title(charsetname);

  if (!strcmp(name,"gb2312") || !strcmp(name,"gb12345ext")
      || !strcmp(name,"jisx0208") || !strcmp(name,"jisx0212"))
    do_normal(name);
  else if (!strcmp(name,"cns11643_1") || !strcmp(name,"cns11643_2")
           || !strcmp(name,"cns11643_3"))
    do_normal_only_charset2uni(name);
  else if (!strcmp(name,"cns11643_inv"))
    do_cns11643_only_uni2charset(name);
  else if (!strcmp(name,"gbkext1"))
    do_gbk1_only_charset2uni(name);
  else if (!strcmp(name,"gbkext2"))
    do_gbk2_only_charset2uni(name);
  else if (!strcmp(name,"gbkext_inv"))
    do_gbk1_only_uni2charset(name);
  else if (!strcmp(name,"cp936ext"))
    do_gbk1(name);
  else if (!strcmp(name,"ksc5601"))
    do_ksc5601(name);
  else if (!strcmp(name,"big5") || !strcmp(name,"cp950ext"))
    do_big5(name);
  else if (!strcmp(name,"big5hkscs"))
    do_big5hkscs(name);
  else if (!strcmp(name,"johab_hangul"))
    do_johab_hangul(name);
  else if (!strcmp(name,"cp932ext"))
    do_sjis(name);
  else {
    /* Report the rejected name instead of failing silently. */
    fprintf(stderr, "%s: unknown table name \"%s\"\n", progname, name);
    exit(1);
  }

  return 0;
}
1072