1 /* Copyright (C) 1999-2004, 2006-2007 Free Software Foundation, Inc.
2    This file is part of the GNU LIBICONV Tools.
3 
4    This program is free software: you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation; either version 3 of the License, or
7    (at your option) any later version.
8 
9    This program is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12    GNU General Public License for more details.
13 
   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
17 
18 /*
19  * Generates a CJK character set table from a .TXT table as found on
20  * ftp.unicode.org or in the X nls directory.
21  * Examples:
22  *
23  *   ./cjk_tab_to_h GB2312.1980-0 gb2312 > gb2312.h < gb2312
24  *   ./cjk_tab_to_h JISX0208.1983-0 jisx0208 > jisx0208.h < jis0208
25  *   ./cjk_tab_to_h KSC5601.1987-0 ksc5601 > ksc5601.h < ksc5601
26  *
27  *   ./cjk_tab_to_h GB2312.1980-0 gb2312 > gb2312.h < GB2312.TXT
28  *   ./cjk_tab_to_h JISX0208.1983-0 jisx0208 > jisx0208.h < JIS0208.TXT
29  *   ./cjk_tab_to_h JISX0212.1990-0 jisx0212 > jisx0212.h < JIS0212.TXT
30  *   ./cjk_tab_to_h KSC5601.1987-0 ksc5601 > ksc5601.h < KSC5601.TXT
31  *   ./cjk_tab_to_h KSX1001.1992-0 ksc5601 > ksc5601.h < KSX1001.TXT
32  *
33  *   ./cjk_tab_to_h BIG5 big5 > big5.h < BIG5.TXT
34  *
35  *   ./cjk_tab_to_h JOHAB johab > johab.h < JOHAB.TXT
36  *
37  *   ./cjk_tab_to_h JISX0213:2004 jisx0213 > jisx0213.h < JISX0213.TXT
38  */
39 
40 #include <stdio.h>
41 #include <stdlib.h>
42 #include <stdbool.h>
43 #include <string.h>
44 #include <ctype.h>
45 #include <assert.h>
46 
/* A range of table indices: 'start' is the first index covered, 'end' is
   one past the last index covered. */
typedef struct {
  int start, end;
} Block;
51 
52 typedef struct {
53   int rows;    /* number of possible values for the 1st byte */
54   int cols;    /* number of possible values for the 2nd byte */
55   int (*row_byte) (int row); /* returns the 1st byte value for a given row */
56   int (*col_byte) (int col); /* returns the 2nd byte value for a given col */
57   int (*byte_row) (int byte); /* converts a 1st byte value to a row, else -1 */
58   int (*byte_col) (int byte); /* converts a 2nd byte value to a col, else -1 */
59   const char* check_row_expr; /* format string for 1st byte value checking */
60   const char* check_col_expr; /* format string for 2nd byte value checking */
61   const char* byte_row_expr; /* format string for 1st byte value to row */
62   const char* byte_col_expr; /* format string for 2nd byte value to col */
63   int** charset2uni; /* charset2uni[0..rows-1][0..cols-1] is valid */
64   /* You'll understand the terms "row" and "col" when you buy Ken Lunde's book.
65      Once a row is fixed, choosing a "col" is the same as choosing a "cell". */
66   int* charsetpage; /* charsetpage[0..rows]: how large is a page for a row */
67   int ncharsetblocks;
68   Block* charsetblocks; /* blocks[0..nblocks-1] */
69   int* uni2charset; /* uni2charset[0x0000..0xffff] */
70   int fffd;    /* uni representation of the invalid character */
71 } Encoding;
72 
/*
 * Outputs the file title: the license notice of the generated file,
 * followed by a comment that names the character set.
 */
static void output_title (const char *charsetname)
{
  /* Everything that precedes the character set name, piece by piece. */
  static const char *const preamble[] = {
    "/*\n",
    " * Copyright (C) 1999-2007 Free Software Foundation, Inc.\n",
    " * This file is part of the GNU LIBICONV Library.\n",
    " *\n",
    " * The GNU LIBICONV Library is free software; you can redistribute it\n",
    " * and/or modify it under the terms of the GNU Library General Public\n",
    " * License as published by the Free Software Foundation; either version 2\n",
    " * of the License, or (at your option) any later version.\n",
    " *\n",
    " * The GNU LIBICONV Library is distributed in the hope that it will be\n",
    " * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of\n",
    " * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU\n",
    " * Library General Public License for more details.\n",
    " *\n",
    " * You should have received a copy of the GNU Library General Public\n",
    " * License along with the GNU LIBICONV Library; see the file COPYING.LIB.\n",
    " * If not, write to the Free Software Foundation, Inc., 51 Franklin Street,\n",
    " * Fifth Floor, Boston, MA 02110-1301, USA.\n",
    " */\n",
    "\n",
    "/*\n"
  };
  size_t k;

  for (k = 0; k < sizeof(preamble) / sizeof(preamble[0]); k++)
    fputs(preamble[k], stdout);

  printf(" * %s\n", charsetname);
  fputs(" */\n", stdout);
  fputs("\n", stdout);
}
103 
104 /*
105  * Reads the charset2uni table from standard input.
106  */
read_table(Encoding * enc)107 static void read_table (Encoding* enc)
108 {
109   int row, col, i, i1, i2, c, j;
110 
111   enc->charset2uni = (int**) malloc(enc->rows*sizeof(int*));
112   for (row = 0; row < enc->rows; row++)
113     enc->charset2uni[row] = (int*) malloc(enc->cols*sizeof(int));
114 
115   for (row = 0; row < enc->rows; row++)
116     for (col = 0; col < enc->cols; col++)
117       enc->charset2uni[row][col] = 0xfffd;
118 
119   c = getc(stdin);
120   ungetc(c,stdin);
121   if (c == '#') {
122     /* Read a unicode.org style .TXT file. */
123     for (;;) {
124       c = getc(stdin);
125       if (c == EOF)
126         break;
127       if (c == '\n' || c == ' ' || c == '\t')
128         continue;
129       if (c == '#') {
130         do { c = getc(stdin); } while (!(c == EOF || c == '\n'));
131         continue;
132       }
133       ungetc(c,stdin);
134       if (scanf("0x%x", &j) != 1)
135         exit(1);
136       i1 = j >> 8;
137       i2 = j & 0xff;
138       row = enc->byte_row(i1);
139       col = enc->byte_col(i2);
140       if (row < 0 || col < 0) {
141         fprintf(stderr, "lost entry for %02x %02x\n", i1, i2);
142         exit(1);
143       }
144       if (scanf(" 0x%x", &enc->charset2uni[row][col]) != 1)
145         exit(1);
146     }
147   } else {
148     /* Read a table of hexadecimal Unicode values. */
149     for (i1 = 32; i1 < 132; i1++)
150       for (i2 = 32; i2 < 132; i2++) {
151         i = scanf("%x", &j);
152         if (i == EOF)
153           goto read_done;
154         if (i != 1)
155           exit(1);
156         if (j < 0 || j == 0xffff)
157           j = 0xfffd;
158         if (j != 0xfffd) {
159           if (enc->byte_row(i1) < 0 || enc->byte_col(i2) < 0) {
160             fprintf(stderr, "lost entry at %02x %02x\n", i1, i2);
161             exit (1);
162           }
163           enc->charset2uni[enc->byte_row(i1)][enc->byte_col(i2)] = j;
164         }
165       }
166    read_done: ;
167   }
168 }
169 
170 /*
171  * Determine whether the Unicode range goes outside the BMP.
172  */
is_charset2uni_large(Encoding * enc)173 static bool is_charset2uni_large (Encoding* enc)
174 {
175   int row, col;
176 
177   for (row = 0; row < enc->rows; row++)
178     for (col = 0; col < enc->cols; col++)
179       if (enc->charset2uni[row][col] >= 0x10000)
180         return true;
181   return false;
182 }
183 
184 /*
185  * Compactify the Unicode range by use of an auxiliary table,
186  * so 16 bits suffice to store each value.
187  */
compact_large_charset2uni(Encoding * enc,unsigned int ** urows,unsigned int * urowshift)188 static int compact_large_charset2uni (Encoding* enc, unsigned int **urows, unsigned int *urowshift)
189 {
190   unsigned int shift;
191 
192   for (shift = 8; ; shift--) {
193     int *upages = (int *) malloc((0x110000>>shift) * sizeof(int));
194     int i, row, col, nurows;
195 
196     for (i = 0; i < 0x110000>>shift; i++)
197       upages[i] = -1;
198 
199     for (row = 0; row < enc->rows; row++)
200       for (col = 0; col < enc->cols; col++)
201         upages[enc->charset2uni[row][col] >> shift] = 0;
202 
203     nurows = 0;
204     for (i = 0; i < 0x110000>>shift; i++)
205       if (upages[i] == 0)
206         nurows++;
207 
208     /* We want all table entries to fit in an 'unsigned short'. */
209     if (nurows <= 1<<(16-shift)) {
210       int** old_charset2uni;
211 
212       *urows = (unsigned int *) malloc(nurows * sizeof(unsigned int));
213       *urowshift = shift;
214 
215       nurows = 0;
216       for (i = 0; i < 0x110000>>shift; i++)
217         if (upages[i] == 0) {
218           upages[i] = nurows;
219           (*urows)[nurows] = i;
220           nurows++;
221         }
222 
223       old_charset2uni = enc->charset2uni;
224       enc->charset2uni = (int**) malloc(enc->rows*sizeof(int*));
225       for (row = 0; row < enc->rows; row++)
226         enc->charset2uni[row] = (int*) malloc(enc->cols*sizeof(int));
227       for (row = 0; row < enc->rows; row++)
228         for (col = 0; col < enc->cols; col++) {
229           int u = old_charset2uni[row][col];
230           enc->charset2uni[row][col] =
231             (upages[u >> shift] << shift) | (u & ((1 << shift) - 1));
232         }
233       enc->fffd =
234         (upages[0xfffd >> shift] << shift) | (0xfffd & ((1 << shift) - 1));
235 
236       return nurows;
237     }
238   }
239   abort();
240 }
241 
242 /*
243  * Computes the charsetpage[0..rows] array.
244  */
find_charset2uni_pages(Encoding * enc)245 static void find_charset2uni_pages (Encoding* enc)
246 {
247   int row, col;
248 
249   enc->charsetpage = (int*) malloc((enc->rows+1)*sizeof(int));
250 
251   for (row = 0; row <= enc->rows; row++)
252     enc->charsetpage[row] = 0;
253 
254   for (row = 0; row < enc->rows; row++) {
255     int used = 0;
256     for (col = 0; col < enc->cols; col++)
257       if (enc->charset2uni[row][col] != enc->fffd)
258         used = col+1;
259     enc->charsetpage[row] = used;
260   }
261 }
262 
263 /*
264  * Fills in nblocks and blocks.
265  */
find_charset2uni_blocks(Encoding * enc)266 static void find_charset2uni_blocks (Encoding* enc)
267 {
268   int n, row, lastrow;
269 
270   enc->charsetblocks = (Block*) malloc(enc->rows*sizeof(Block));
271 
272   n = 0;
273   for (row = 0; row < enc->rows; row++)
274     if (enc->charsetpage[row] > 0 && (row == 0 || enc->charsetpage[row-1] == 0)) {
275       for (lastrow = row; enc->charsetpage[lastrow+1] > 0; lastrow++);
276       enc->charsetblocks[n].start = row * enc->cols;
277       enc->charsetblocks[n].end = lastrow * enc->cols + enc->charsetpage[lastrow];
278       n++;
279     }
280   enc->ncharsetblocks = n;
281 }
282 
283 /*
284  * Outputs the charset to unicode table and function.
285  */
output_charset2uni(const char * name,Encoding * enc)286 static void output_charset2uni (const char* name, Encoding* enc)
287 {
288   int nurows, row, col, lastrow, col_max, i, i1_min, i1_max;
289   bool is_large;
290   unsigned int* urows;
291   unsigned int urowshift;
292   Encoding tmpenc;
293 
294   is_large = is_charset2uni_large(enc);
295   if (is_large) {
296     /* Use a temporary copy of enc. */
297     tmpenc = *enc;
298     enc = &tmpenc;
299     nurows = compact_large_charset2uni(enc,&urows,&urowshift);
300   } else {
301     nurows = 0; urows = NULL; urowshift = 0; enc->fffd = 0xfffd;
302   }
303 
304   find_charset2uni_pages(enc);
305 
306   find_charset2uni_blocks(enc);
307 
308   for (row = 0; row < enc->rows; row++)
309     if (enc->charsetpage[row] > 0) {
310       if (row == 0 || enc->charsetpage[row-1] == 0) {
311         /* Start a new block. */
312         for (lastrow = row; enc->charsetpage[lastrow+1] > 0; lastrow++);
313         printf("static const unsigned short %s_2uni_page%02x[%d] = {\n",
314                name, enc->row_byte(row),
315                (lastrow-row) * enc->cols + enc->charsetpage[lastrow]);
316       }
317       printf("  /""* 0x%02x *""/\n ", enc->row_byte(row));
318       col_max = (enc->charsetpage[row+1] > 0 ? enc->cols : enc->charsetpage[row]);
319       for (col = 0; col < col_max; col++) {
320         printf(" 0x%04x,", enc->charset2uni[row][col]);
321         if ((col % 8) == 7 && (col+1 < col_max)) printf("\n ");
322       }
323       printf("\n");
324       if (enc->charsetpage[row+1] == 0) {
325         /* End a block. */
326         printf("};\n");
327       }
328     }
329   printf("\n");
330 
331   if (is_large) {
332     printf("static const ucs4_t %s_2uni_upages[%d] = {\n ", name, nurows);
333     for (i = 0; i < nurows; i++) {
334       printf(" 0x%05x,", urows[i] << urowshift);
335       if ((i % 8) == 7 && (i+1 < nurows)) printf("\n ");
336     }
337     printf("\n");
338     printf("};\n");
339     printf("\n");
340   }
341 
342   printf("static int\n");
343   printf("%s_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)\n", name);
344   printf("{\n");
345   printf("  unsigned char c1 = s[0];\n");
346   printf("  if (");
347   for (i = 0; i < enc->ncharsetblocks; i++) {
348     i1_min = enc->row_byte(enc->charsetblocks[i].start / enc->cols);
349     i1_max = enc->row_byte((enc->charsetblocks[i].end-1) / enc->cols);
350     if (i > 0)
351       printf(" || ");
352     if (i1_min == i1_max)
353       printf("(c1 == 0x%02x)", i1_min);
354     else
355       printf("(c1 >= 0x%02x && c1 <= 0x%02x)", i1_min, i1_max);
356   }
357   printf(") {\n");
358   printf("    if (n >= 2) {\n");
359   printf("      unsigned char c2 = s[1];\n");
360   printf("      if (");
361   printf(enc->check_col_expr, "c2");
362   printf(") {\n");
363   printf("        unsigned int i = %d * (", enc->cols);
364   printf(enc->byte_row_expr, "c1");
365   printf(") + (");
366   printf(enc->byte_col_expr, "c2");
367   printf(");\n");
368   printf("        %s wc = 0xfffd;\n", is_large ? "ucs4_t" : "unsigned short");
369   if (is_large) printf("        unsigned short swc;\n");
370   for (i = 0; i < enc->ncharsetblocks; i++) {
371     printf("        ");
372     if (i > 0)
373       printf("} else ");
374     if (i < enc->ncharsetblocks-1)
375       printf("if (i < %d) ", enc->charsetblocks[i+1].start);
376     printf("{\n");
377     printf("          if (i < %d)\n", enc->charsetblocks[i].end);
378     printf("            %s = ", is_large ? "swc" : "wc");
379     printf("%s_2uni_page%02x[i", name, enc->row_byte(enc->charsetblocks[i].start / enc->cols));
380     if (enc->charsetblocks[i].start > 0)
381       printf("-%d", enc->charsetblocks[i].start);
382     printf("]");
383     if (is_large) printf(",\n            wc = %s_2uni_upages[swc>>%d] | (swc & 0x%x)", name, urowshift, (1 << urowshift) - 1);
384     printf(";\n");
385   }
386   printf("        }\n");
387   printf("        if (wc != 0xfffd) {\n");
388   printf("          *pwc = %swc;\n", is_large ? "" : "(ucs4_t) ");
389   printf("          return 2;\n");
390   printf("        }\n");
391   printf("      }\n");
392   printf("      return RET_ILSEQ;\n");
393   printf("    }\n");
394   printf("    return RET_TOOFEW(0);\n");
395   printf("  }\n");
396   printf("  return RET_ILSEQ;\n");
397   printf("}\n");
398   printf("\n");
399 }
400 
401 /*
402  * Outputs the charset to unicode table and function.
403  * (Suitable if the mapping function is well defined, i.e. has no holes, and
404  * is monotonically increasing with small gaps only.)
405  */
output_charset2uni_noholes_monotonic(const char * name,Encoding * enc)406 static void output_charset2uni_noholes_monotonic (const char* name, Encoding* enc)
407 {
408   int row, col, lastrow, r, col_max, i, i1_min, i1_max;
409 
410   /* Choose stepsize so that stepsize*steps_per_row >= enc->cols, and
411      enc->charset2uni[row][col] - enc->charset2uni[row][col/stepsize*stepsize]
412      is always < 0x100. */
413   int steps_per_row = 2;
414   int stepsize = (enc->cols + steps_per_row-1) / steps_per_row;
415 
416   find_charset2uni_pages(enc);
417 
418   find_charset2uni_blocks(enc);
419 
420   for (row = 0; row < enc->rows; row++)
421     if (enc->charsetpage[row] > 0) {
422       if (row == 0 || enc->charsetpage[row-1] == 0) {
423         /* Start a new block. */
424         for (lastrow = row; enc->charsetpage[lastrow+1] > 0; lastrow++);
425         printf("static const unsigned short %s_2uni_main_page%02x[%d] = {\n ",
426                name, enc->row_byte(row),
427                steps_per_row*(lastrow-row+1));
428         for (r = row; r <= lastrow; r++) {
429           for (i = 0; i < steps_per_row; i++)
430             printf(" 0x%04x,", enc->charset2uni[r][i*stepsize]);
431           if (((r-row) % 4) == 3 && (r < lastrow)) printf("\n ");
432         }
433         printf("\n");
434         printf("};\n");
435         printf("static const unsigned char %s_2uni_page%02x[%d] = {\n",
436                name, enc->row_byte(row),
437                (lastrow-row) * enc->cols + enc->charsetpage[lastrow]);
438       }
439       printf("  /""* 0x%02x *""/\n ", enc->row_byte(row));
440       col_max = (enc->charsetpage[row+1] > 0 ? enc->cols : enc->charsetpage[row]);
441       for (col = 0; col < col_max; col++) {
442         printf(" 0x%02x,", enc->charset2uni[row][col] - enc->charset2uni[row][col/stepsize*stepsize]);
443         if ((col % 8) == 7 && (col+1 < col_max)) printf("\n ");
444       }
445       printf("\n");
446       if (enc->charsetpage[row+1] == 0) {
447         /* End a block. */
448         printf("};\n");
449       }
450     }
451   printf("\n");
452 
453   printf("static int\n");
454   printf("%s_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)\n", name);
455   printf("{\n");
456   printf("  unsigned char c1 = s[0];\n");
457   printf("  if (");
458   for (i = 0; i < enc->ncharsetblocks; i++) {
459     i1_min = enc->row_byte(enc->charsetblocks[i].start / enc->cols);
460     i1_max = enc->row_byte((enc->charsetblocks[i].end-1) / enc->cols);
461     if (i > 0)
462       printf(" || ");
463     if (i1_min == i1_max)
464       printf("(c1 == 0x%02x)", i1_min);
465     else
466       printf("(c1 >= 0x%02x && c1 <= 0x%02x)", i1_min, i1_max);
467   }
468   printf(") {\n");
469   printf("    if (n >= 2) {\n");
470   printf("      unsigned char c2 = s[1];\n");
471   printf("      if (");
472   printf(enc->check_col_expr, "c2");
473   printf(") {\n");
474   printf("        unsigned int row = ");
475   printf(enc->byte_row_expr, "c1");
476   printf(";\n");
477   printf("        unsigned int col = ");
478   printf(enc->byte_col_expr, "c2");
479   printf(";\n");
480   printf("        unsigned int i = %d * row + col;\n", enc->cols);
481   printf("        unsigned short wc = 0xfffd;\n");
482   for (i = 0; i < enc->ncharsetblocks; i++) {
483     printf("        ");
484     if (i > 0)
485       printf("} else ");
486     if (i < enc->ncharsetblocks-1)
487       printf("if (i < %d) ", enc->charsetblocks[i+1].start);
488     printf("{\n");
489     printf("          if (i < %d)\n", enc->charsetblocks[i].end);
490     printf("            wc = %s_2uni_main_page%02x[%d*", name, enc->row_byte(enc->charsetblocks[i].start / enc->cols), steps_per_row);
491     if (enc->charsetblocks[i].start > 0)
492       printf("(row-%d)", enc->charsetblocks[i].start / enc->cols);
493     else
494       printf("row");
495     printf("+");
496     if (steps_per_row == 2)
497       printf("(col>=%d?1:0)", stepsize);
498     else
499       printf("col/%d", stepsize);
500     printf("] + %s_2uni_page%02x[i", name, enc->row_byte(enc->charsetblocks[i].start / enc->cols));
501     if (enc->charsetblocks[i].start > 0)
502       printf("-%d", enc->charsetblocks[i].start);
503     printf("];\n");
504   }
505   printf("        }\n");
506   printf("        if (wc != 0xfffd) {\n");
507   printf("          *pwc = (ucs4_t) wc;\n");
508   printf("          return 2;\n");
509   printf("        }\n");
510   printf("      }\n");
511   printf("      return RET_ILSEQ;\n");
512   printf("    }\n");
513   printf("    return RET_TOOFEW(0);\n");
514   printf("  }\n");
515   printf("  return RET_ILSEQ;\n");
516   printf("}\n");
517   printf("\n");
518 }
519 
520 /*
521  * Computes the uni2charset[0x0000..0x2ffff] array.
522  */
invert(Encoding * enc)523 static void invert (Encoding* enc)
524 {
525   int row, col, j;
526 
527   enc->uni2charset = (int*) malloc(0x30000*sizeof(int));
528 
529   for (j = 0; j < 0x30000; j++)
530     enc->uni2charset[j] = 0;
531 
532   for (row = 0; row < enc->rows; row++)
533     for (col = 0; col < enc->cols; col++) {
534       j = enc->charset2uni[row][col];
535       if (j != 0xfffd)
536         enc->uni2charset[j] = 0x100 * enc->row_byte(row) + enc->col_byte(col);
537     }
538 }
539 
540 /*
541  * Outputs the unicode to charset table and function, using a linear array.
542  * (Suitable if the table is dense.)
543  */
output_uni2charset_dense(const char * name,Encoding * enc)544 static void output_uni2charset_dense (const char* name, Encoding* enc)
545 {
546   /* Like in 8bit_tab_to_h.c */
547   bool pages[0x300];
548   int line[0x6000];
549   int tableno;
550   struct { int minline; int maxline; int usecount; } tables[0x6000];
551   bool first;
552   int row, col, j, p, j1, j2, t;
553 
554   for (p = 0; p < 0x300; p++)
555     pages[p] = false;
556   for (row = 0; row < enc->rows; row++)
557     for (col = 0; col < enc->cols; col++) {
558       j = enc->charset2uni[row][col];
559       if (j != 0xfffd)
560         pages[j>>8] = true;
561     }
562   for (j1 = 0; j1 < 0x6000; j1++) {
563     bool all_invalid = true;
564     for (j2 = 0; j2 < 8; j2++) {
565       j = 8*j1+j2;
566       if (enc->uni2charset[j] != 0)
567         all_invalid = false;
568     }
569     if (all_invalid)
570       line[j1] = -1;
571     else
572       line[j1] = 0;
573   }
574   tableno = 0;
575   for (j1 = 0; j1 < 0x6000; j1++) {
576     if (line[j1] >= 0) {
577       if (tableno > 0
578           && ((j1 > 0 && line[j1-1] == tableno-1)
579               || ((tables[tableno-1].maxline >> 5) == (j1 >> 5)
580                   && j1 - tables[tableno-1].maxline <= 8))) {
581         line[j1] = tableno-1;
582         tables[tableno-1].maxline = j1;
583       } else {
584         tableno++;
585         line[j1] = tableno-1;
586         tables[tableno-1].minline = tables[tableno-1].maxline = j1;
587       }
588     }
589   }
590   for (t = 0; t < tableno; t++) {
591     tables[t].usecount = 0;
592     j1 = 8*tables[t].minline;
593     j2 = 8*(tables[t].maxline+1);
594     for (j = j1; j < j2; j++)
595       if (enc->uni2charset[j] != 0)
596         tables[t].usecount++;
597   }
598   {
599     p = -1;
600     for (t = 0; t < tableno; t++)
601       if (tables[t].usecount > 1) {
602         p = tables[t].minline >> 5;
603         printf("static const unsigned short %s_page%02x[%d] = {\n", name, p, 8*(tables[t].maxline-tables[t].minline+1));
604         for (j1 = tables[t].minline; j1 <= tables[t].maxline; j1++) {
605           if ((j1 % 0x20) == 0 && j1 > tables[t].minline)
606             printf("  /* 0x%04x */\n", 8*j1);
607           printf(" ");
608           for (j2 = 0; j2 < 8; j2++) {
609             j = 8*j1+j2;
610             printf(" 0x%04x,", enc->uni2charset[j]);
611           }
612           printf(" /*0x%02x-0x%02x*/\n", 8*(j1 % 0x20), 8*(j1 % 0x20)+7);
613         }
614         printf("};\n");
615       }
616     if (p >= 0)
617       printf("\n");
618   }
619   printf("static int\n%s_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)\n", name);
620   printf("{\n");
621   printf("  if (n >= 2) {\n");
622   printf("    unsigned short c = 0;\n");
623   first = true;
624   for (j1 = 0; j1 < 0x6000;) {
625     t = line[j1];
626     for (j2 = j1; j2 < 0x6000 && line[j2] == t; j2++);
627     if (t >= 0) {
628       if (j1 != tables[t].minline) abort();
629       if (j2 > tables[t].maxline+1) abort();
630       j2 = tables[t].maxline+1;
631       if (first)
632         printf("    ");
633       else
634         printf("    else ");
635       first = false;
636       if (tables[t].usecount == 0) abort();
637       if (tables[t].usecount == 1) {
638         if (j2 != j1+1) abort();
639         for (j = 8*j1; j < 8*j2; j++)
640           if (enc->uni2charset[j] != 0) {
641             printf("if (wc == 0x%04x)\n      c = 0x%02x;\n", j, enc->uni2charset[j]);
642             break;
643           }
644       } else {
645         if (j1 == 0) {
646           printf("if (wc < 0x%04x)", 8*j2);
647         } else {
648           printf("if (wc >= 0x%04x && wc < 0x%04x)", 8*j1, 8*j2);
649         }
650         printf("\n      c = %s_page%02x[wc", name, j1 >> 5);
651         if (tables[t].minline > 0)
652           printf("-0x%04x", 8*j1);
653         printf("];\n");
654       }
655     }
656     j1 = j2;
657   }
658   printf("    if (c != 0) {\n");
659   printf("      r[0] = (c >> 8); r[1] = (c & 0xff);\n");
660   printf("      return 2;\n");
661   printf("    }\n");
662   printf("    return RET_ILUNI;\n");
663   printf("  }\n");
664   printf("  return RET_TOOSMALL;\n");
665   printf("}\n");
666 }
667 
668 /*
669  * Outputs the unicode to charset table and function, using a packed array.
670  * (Suitable if the table is sparse.)
671  * The argument 'monotonic' may be set to true if the mapping is monotonically
672  * increasing with small gaps only.
673  */
output_uni2charset_sparse(const char * name,Encoding * enc,bool monotonic)674 static void output_uni2charset_sparse (const char* name, Encoding* enc, bool monotonic)
675 {
676   bool pages[0x300];
677   Block pageblocks[0x300]; int npageblocks;
678   int indx2charset[0x30000];
679   int summary_indx[0x3000];
680   int summary_used[0x3000];
681   int i, row, col, j, p, j1, j2, indx;
682   bool is_large;
683   /* for monotonic: */
684   int log2_stepsize = (!strcmp(name,"uhc_2") ? 6 : 7);
685   int stepsize = 1 << log2_stepsize;
686   int indxsteps;
687 
688   /* Fill pages[0x300]. */
689   for (p = 0; p < 0x300; p++)
690     pages[p] = false;
691   for (row = 0; row < enc->rows; row++)
692     for (col = 0; col < enc->cols; col++) {
693       j = enc->charset2uni[row][col];
694       if (j != 0xfffd)
695         pages[j>>8] = true;
696     }
697 
698   /* Determine whether two or three bytes are needed for each character. */
699   is_large = false;
700   for (j = 0; j < 0x30000; j++)
701     if (enc->uni2charset[j] >= 0x10000)
702       is_large = true;
703 
704 #if 0
705   for (p = 0; p < 0x300; p++)
706     if (pages[p]) {
707       printf("static const unsigned short %s_page%02x[256] = {\n", name, p);
708       for (j1 = 0; j1 < 32; j1++) {
709         printf("  ");
710         for (j2 = 0; j2 < 8; j2++)
711           printf("0x%04x, ", enc->uni2charset[256*p+8*j1+j2]);
712         printf("/""*0x%02x-0x%02x*""/\n", 8*j1, 8*j1+7);
713       }
714       printf("};\n");
715     }
716   printf("\n");
717 #endif
718 
719   /* Fill summary_indx[] and summary_used[]. */
720   indx = 0;
721   for (j1 = 0; j1 < 0x3000; j1++) {
722     summary_indx[j1] = indx;
723     summary_used[j1] = 0;
724     for (j2 = 0; j2 < 16; j2++) {
725       j = 16*j1+j2;
726       if (enc->uni2charset[j] != 0) {
727         indx2charset[indx++] = enc->uni2charset[j];
728         summary_used[j1] |= (1 << j2);
729       }
730     }
731   }
732 
733   /* Fill npageblocks and pageblocks[]. */
734   npageblocks = 0;
735   for (p = 0; p < 0x300; ) {
736     if (pages[p] && (p == 0 || !pages[p-1])) {
737       pageblocks[npageblocks].start = 16*p;
738       do p++; while (p < 0x300 && pages[p]);
739       j1 = 16*p;
740       while (summary_used[j1-1] == 0) j1--;
741       pageblocks[npageblocks].end = j1;
742       npageblocks++;
743     } else
744       p++;
745   }
746 
747   if (monotonic) {
748     indxsteps = (indx + stepsize-1) / stepsize;
749     printf("static const unsigned short %s_2charset_main[%d] = {\n", name, indxsteps);
750     for (i = 0; i < indxsteps; ) {
751       if ((i % 8) == 0) printf(" ");
752       printf(" 0x%04x,", indx2charset[i*stepsize]);
753       i++;
754       if ((i % 8) == 0 || i == indxsteps) printf("\n");
755     }
756     printf("};\n");
757     printf("static const unsigned char %s_2charset[%d] = {\n", name, indx);
758     for (i = 0; i < indx; ) {
759       if ((i % 8) == 0) printf(" ");
760       printf(" 0x%02x,", indx2charset[i] - indx2charset[i/stepsize*stepsize]);
761       i++;
762       if ((i % 8) == 0 || i == indx) printf("\n");
763     }
764     printf("};\n");
765   } else {
766     if (is_large) {
767       printf("static const unsigned char %s_2charset[3*%d] = {\n", name, indx);
768       for (i = 0; i < indx; ) {
769         if ((i % 4) == 0) printf(" ");
770         printf(" 0x%1x,0x%02x,0x%02x,", indx2charset[i] >> 16,
771                (indx2charset[i] >> 8) & 0xff, indx2charset[i] & 0xff);
772         i++;
773         if ((i % 4) == 0 || i == indx) printf("\n");
774       }
775       printf("};\n");
776     } else {
777       printf("static const unsigned short %s_2charset[%d] = {\n", name, indx);
778       for (i = 0; i < indx; ) {
779         if ((i % 8) == 0) printf(" ");
780         printf(" 0x%04x,", indx2charset[i]);
781         i++;
782         if ((i % 8) == 0 || i == indx) printf("\n");
783       }
784       printf("};\n");
785     }
786   }
787   printf("\n");
788   for (i = 0; i < npageblocks; i++) {
789     printf("static const Summary16 %s_uni2indx_page%02x[%d] = {\n", name,
790            pageblocks[i].start/16, pageblocks[i].end-pageblocks[i].start);
791     for (j1 = pageblocks[i].start; j1 < pageblocks[i].end; ) {
792       if (((16*j1) % 0x100) == 0) printf("  /""* 0x%04x *""/\n", 16*j1);
793       if ((j1 % 4) == 0) printf(" ");
794       printf(" { %4d, 0x%04x },", summary_indx[j1], summary_used[j1]);
795       j1++;
796       if ((j1 % 4) == 0 || j1 == pageblocks[i].end) printf("\n");
797     }
798     printf("};\n");
799   }
800   printf("\n");
801 
802   printf("static int\n");
803   printf("%s_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)\n", name);
804   printf("{\n");
805   printf("  if (n >= 2) {\n");
806   printf("    const Summary16 *summary = NULL;\n");
807   for (i = 0; i < npageblocks; i++) {
808     printf("    ");
809     if (i > 0)
810       printf("else ");
811     printf("if (wc >= 0x%04x && wc < 0x%04x)\n",
812            16*pageblocks[i].start, 16*pageblocks[i].end);
813     printf("      summary = &%s_uni2indx_page%02x[(wc>>4)", name,
814            pageblocks[i].start/16);
815     if (pageblocks[i].start > 0)
816       printf("-0x%03x", pageblocks[i].start);
817     printf("];\n");
818   }
819   printf("    if (summary) {\n");
820   printf("      unsigned short used = summary->used;\n");
821   printf("      unsigned int i = wc & 0x0f;\n");
822   printf("      if (used & ((unsigned short) 1 << i)) {\n");
823   if (monotonic || !is_large)
824     printf("        unsigned short c;\n");
825   printf("        /* Keep in `used' only the bits 0..i-1. */\n");
826   printf("        used &= ((unsigned short) 1 << i) - 1;\n");
827   printf("        /* Add `summary->indx' and the number of bits set in `used'. */\n");
828   printf("        used = (used & 0x5555) + ((used & 0xaaaa) >> 1);\n");
829   printf("        used = (used & 0x3333) + ((used & 0xcccc) >> 2);\n");
830   printf("        used = (used & 0x0f0f) + ((used & 0xf0f0) >> 4);\n");
831   printf("        used = (used & 0x00ff) + (used >> 8);\n");
832   if (monotonic) {
833     printf("        used += summary->indx;\n");
834     printf("        c = %s_2charset_main[used>>%d] + %s_2charset[used];\n", name, log2_stepsize, name);
835     printf("        r[0] = (c >> 8); r[1] = (c & 0xff);\n");
836     printf("        return 2;\n");
837   } else {
838     if (is_large) {
839       printf("        used += summary->indx;\n");
840       printf("        r[0] = %s_2charset[3*used];\n", name);
841       printf("        r[1] = %s_2charset[3*used+1];\n", name);
842       printf("        r[2] = %s_2charset[3*used+2];\n", name);
843       printf("        return 3;\n");
844     } else {
845       printf("        c = %s_2charset[summary->indx + used];\n", name);
846       printf("        r[0] = (c >> 8); r[1] = (c & 0xff);\n");
847       printf("        return 2;\n");
848     }
849   }
850   printf("      }\n");
851   printf("    }\n");
852   printf("    return RET_ILUNI;\n");
853   printf("  }\n");
854   printf("  return RET_TOOSMALL;\n");
855   printf("}\n");
856 }
857 
858 /* ISO-2022/EUC specifics */
859 
/* Row index -> first byte: row 0 corresponds to byte 0x21. */
static int row_byte_normal (int row)
{
  return row + 0x21;
}
/* Column index -> second byte: column 0 corresponds to byte 0x21. */
static int col_byte_normal (int col)
{
  return col + 0x21;
}
/* First byte -> row index.  No range check is performed: bytes below 0x21
   simply yield a negative result. */
static int byte_row_normal (int byte)
{
  const int first_byte = 0x21;

  return byte - first_byte;
}
/* Second byte -> column index.  No range check is performed: bytes below
   0x21 simply yield a negative result. */
static int byte_col_normal (int byte)
{
  const int first_byte = 0x21;

  return byte - first_byte;
}
864 
do_normal(const char * name)865 static void do_normal (const char* name)
866 {
867   Encoding enc;
868 
869   enc.rows = 94;
870   enc.cols = 94;
871   enc.row_byte = row_byte_normal;
872   enc.col_byte = col_byte_normal;
873   enc.byte_row = byte_row_normal;
874   enc.byte_col = byte_col_normal;
875   enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
876   enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
877   enc.byte_row_expr = "%1$s - 0x21";
878   enc.byte_col_expr = "%1$s - 0x21";
879 
880   read_table(&enc);
881   output_charset2uni(name,&enc);
882   invert(&enc); output_uni2charset_sparse(name,&enc,false);
883 }
884 
/* Note: At first sight, the jisx0212_2charset[] table seems to be in order,
   starting from the charset=0x3021/uni=0x4e02 pair. But it's only mostly in
   order. There are 75 out-of-order values, scattered all throughout the table.
 */
889 
do_normal_only_charset2uni(const char * name)890 static void do_normal_only_charset2uni (const char* name)
891 {
892   Encoding enc;
893 
894   enc.rows = 94;
895   enc.cols = 94;
896   enc.row_byte = row_byte_normal;
897   enc.col_byte = col_byte_normal;
898   enc.byte_row = byte_row_normal;
899   enc.byte_col = byte_col_normal;
900   enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
901   enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
902   enc.byte_row_expr = "%1$s - 0x21";
903   enc.byte_col_expr = "%1$s - 0x21";
904 
905   read_table(&enc);
906   output_charset2uni(name,&enc);
907 }
908 
909 /* CNS 11643 specifics - trick to put two tables into one */
910 
/* Row index -> "row byte" value.  Each group of 94 consecutive rows gets its
   own multiple of 0x100; within a group the row is offset by 0x21. */
static int row_byte_cns11643 (int row)
{
  int group = row / 94;
  int within = row % 94;

  return 0x100 * group + within + 0x21;
}
/* Inverse of row_byte_cns11643: the bits above the low 8 select a group of
   94 rows, the low 8 bits (minus 0x21) select the row within the group.
   No range check is performed. */
static int byte_row_cns11643 (int byte)
{
  int group = byte >> 8;
  int low = byte & 0xff;

  return group * 94 + low - 0x21;
}
917 
do_cns11643_only_uni2charset(const char * name)918 static void do_cns11643_only_uni2charset (const char* name)
919 {
920   Encoding enc;
921 
922   enc.rows = 16*94;
923   enc.cols = 94;
924   enc.row_byte = row_byte_cns11643;
925   enc.col_byte = col_byte_normal;
926   enc.byte_row = byte_row_cns11643;
927   enc.byte_col = byte_col_normal;
928   enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
929   enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
930   enc.byte_row_expr = "%1$s - 0x21";
931   enc.byte_col_expr = "%1$s - 0x21";
932 
933   read_table(&enc);
934   invert(&enc);
935   output_uni2charset_sparse(name,&enc,false);
936 }
937 
938 /* GBK specifics */
939 
/* Row index -> first byte: row 0 corresponds to byte 0x81. */
static int row_byte_gbk1 (int row)
{
  return row + 0x81;
}
/* Column index -> second byte.  Columns below 0x3f map to 0x40..0x7e; from
   column 0x3f on the result is one higher, so byte 0x7f is never produced. */
static int col_byte_gbk1 (int col)
{
  if (col < 0x3f)
    return col + 0x40;
  return col + 0x41;
}
/* First byte -> row index, or -1 if the byte is outside 0x81..0xfe. */
static int byte_row_gbk1 (int byte)
{
  if (byte < 0x81 || byte >= 0xff)
    return -1;
  return byte - 0x81;
}
/* Second byte -> column index.  Valid bytes are 0x40..0x7e and 0x80..0xfe;
   anything else (including 0x7f) yields -1. */
static int byte_col_gbk1 (int byte)
{
  if (byte >= 0x80)
    return byte < 0xff ? byte - 0x41 : -1;
  if (byte >= 0x40 && byte < 0x7f)
    return byte - 0x40;
  return -1;
}
960 
do_gbk1(const char * name)961 static void do_gbk1 (const char* name)
962 {
963   Encoding enc;
964 
965   enc.rows = 126;
966   enc.cols = 190;
967   enc.row_byte = row_byte_gbk1;
968   enc.col_byte = col_byte_gbk1;
969   enc.byte_row = byte_row_gbk1;
970   enc.byte_col = byte_col_gbk1;
971   enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
972   enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xff)";
973   enc.byte_row_expr = "%1$s - 0x81";
974   enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
975 
976   read_table(&enc);
977   output_charset2uni(name,&enc);
978   invert(&enc); output_uni2charset_dense(name,&enc);
979 }
980 
do_gbk1_only_charset2uni(const char * name)981 static void do_gbk1_only_charset2uni (const char* name)
982 {
983   Encoding enc;
984 
985   enc.rows = 126;
986   enc.cols = 190;
987   enc.row_byte = row_byte_gbk1;
988   enc.col_byte = col_byte_gbk1;
989   enc.byte_row = byte_row_gbk1;
990   enc.byte_col = byte_col_gbk1;
991   enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
992   enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xff)";
993   enc.byte_row_expr = "%1$s - 0x81";
994   enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
995 
996   read_table(&enc);
997   output_charset2uni(name,&enc);
998 }
999 
/* Row index -> first byte: row 0 corresponds to byte 0x81. */
static int row_byte_gbk2 (int row)
{
  return row + 0x81;
}
/* Column index -> second byte.  Columns below 0x3f map to 0x40..0x7e; from
   column 0x3f on the result is one higher, so byte 0x7f is never produced. */
static int col_byte_gbk2 (int col)
{
  if (col < 0x3f)
    return col + 0x40;
  return col + 0x41;
}
/* First byte -> row index, or -1 if the byte is outside 0x81..0xfe. */
static int byte_row_gbk2 (int byte)
{
  if (byte < 0x81 || byte >= 0xff)
    return -1;
  return byte - 0x81;
}
/* Second byte -> column index.  Valid bytes are 0x40..0x7e and 0x80..0xa0;
   anything else yields -1. */
static int byte_col_gbk2 (int byte)
{
  if (byte >= 0x80)
    return byte < 0xa1 ? byte - 0x41 : -1;
  if (byte >= 0x40 && byte < 0x7f)
    return byte - 0x40;
  return -1;
}
1020 
do_gbk2_only_charset2uni(const char * name)1021 static void do_gbk2_only_charset2uni (const char* name)
1022 {
1023   Encoding enc;
1024 
1025   enc.rows = 126;
1026   enc.cols = 96;
1027   enc.row_byte = row_byte_gbk2;
1028   enc.col_byte = col_byte_gbk2;
1029   enc.byte_row = byte_row_gbk2;
1030   enc.byte_col = byte_col_gbk2;
1031   enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
1032   enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xa1)";
1033   enc.byte_row_expr = "%1$s - 0x81";
1034   enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
1035 
1036   read_table(&enc);
1037   output_charset2uni(name,&enc);
1038 }
1039 
do_gbk1_only_uni2charset(const char * name)1040 static void do_gbk1_only_uni2charset (const char* name)
1041 {
1042   Encoding enc;
1043 
1044   enc.rows = 126;
1045   enc.cols = 190;
1046   enc.row_byte = row_byte_gbk1;
1047   enc.col_byte = col_byte_gbk1;
1048   enc.byte_row = byte_row_gbk1;
1049   enc.byte_col = byte_col_gbk1;
1050   enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
1051   enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xff)";
1052   enc.byte_row_expr = "%1$s - 0x81";
1053   enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
1054 
1055   read_table(&enc);
1056   invert(&enc); output_uni2charset_sparse(name,&enc,false);
1057 }
1058 
1059 /* KSC 5601 specifics */
1060 
1061 /*
1062  * Reads the charset2uni table from standard input.
1063  */
read_table_ksc5601(Encoding * enc)1064 static void read_table_ksc5601 (Encoding* enc)
1065 {
1066   int row, col, i, i1, i2, c, j;
1067 
1068   enc->charset2uni = (int**) malloc(enc->rows*sizeof(int*));
1069   for (row = 0; row < enc->rows; row++)
1070     enc->charset2uni[row] = (int*) malloc(enc->cols*sizeof(int));
1071 
1072   for (row = 0; row < enc->rows; row++)
1073     for (col = 0; col < enc->cols; col++)
1074       enc->charset2uni[row][col] = 0xfffd;
1075 
1076   c = getc(stdin);
1077   ungetc(c,stdin);
1078   if (c == '#') {
1079     /* Read a unicode.org style .TXT file. */
1080     for (;;) {
1081       c = getc(stdin);
1082       if (c == EOF)
1083         break;
1084       if (c == '\n' || c == ' ' || c == '\t')
1085         continue;
1086       if (c == '#') {
1087         do { c = getc(stdin); } while (!(c == EOF || c == '\n'));
1088         continue;
1089       }
1090       ungetc(c,stdin);
1091       if (scanf("0x%x", &j) != 1)
1092         exit(1);
1093       i1 = j >> 8;
1094       i2 = j & 0xff;
1095       if (scanf(" 0x%x", &j) != 1)
1096         exit(1);
1097       /* Take only the range covered by KS C 5601.1987-0 = KS C 5601.1989-0
1098          = KS X 1001.1992, ignore the rest. */
1099       if (!(i1 >= 128+33 && i1 < 128+127 && i2 >= 128+33 && i2 < 128+127))
1100         continue;  /* KSC5601 specific */
1101       i1 &= 0x7f;  /* KSC5601 specific */
1102       i2 &= 0x7f;  /* KSC5601 specific */
1103       row = enc->byte_row(i1);
1104       col = enc->byte_col(i2);
1105       if (row < 0 || col < 0) {
1106         fprintf(stderr, "lost entry for %02x %02x\n", i1, i2);
1107         exit(1);
1108       }
1109       enc->charset2uni[row][col] = j;
1110     }
1111   } else {
1112     /* Read a table of hexadecimal Unicode values. */
1113     for (i1 = 33; i1 < 127; i1++)
1114       for (i2 = 33; i2 < 127; i2++) {
1115         i = scanf("%x", &j);
1116         if (i == EOF)
1117           goto read_done;
1118         if (i != 1)
1119           exit(1);
1120         if (j < 0 || j == 0xffff)
1121           j = 0xfffd;
1122         if (j != 0xfffd) {
1123           if (enc->byte_row(i1) < 0 || enc->byte_col(i2) < 0) {
1124             fprintf(stderr, "lost entry at %02x %02x\n", i1, i2);
1125             exit (1);
1126           }
1127           enc->charset2uni[enc->byte_row(i1)][enc->byte_col(i2)] = j;
1128         }
1129       }
1130    read_done: ;
1131   }
1132 }
1133 
do_ksc5601(const char * name)1134 static void do_ksc5601 (const char* name)
1135 {
1136   Encoding enc;
1137 
1138   enc.rows = 94;
1139   enc.cols = 94;
1140   enc.row_byte = row_byte_normal;
1141   enc.col_byte = col_byte_normal;
1142   enc.byte_row = byte_row_normal;
1143   enc.byte_col = byte_col_normal;
1144   enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
1145   enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
1146   enc.byte_row_expr = "%1$s - 0x21";
1147   enc.byte_col_expr = "%1$s - 0x21";
1148 
1149   read_table_ksc5601(&enc);
1150   output_charset2uni(name,&enc);
1151   invert(&enc); output_uni2charset_sparse(name,&enc,false);
1152 }
1153 
1154 /* UHC specifics */
1155 
1156 /* UHC part 1: 0x{81..A0}{41..5A,61..7A,81..FE} */
1157 
/* Row index -> first byte: row 0 corresponds to byte 0x81. */
static int row_byte_uhc_1 (int row)
{
  return row + 0x81;
}
/* Column index -> second byte.  Columns 0..0x19 start at 0x41, columns
   0x1a..0x33 start at 0x61, and columns from 0x34 on start at 0x81. */
static int col_byte_uhc_1 (int col)
{
  if (col < 0x1a)
    return col + 0x41;
  if (col < 0x34)
    return col + 0x47;
  return col + 0x4d;
}
/* First byte -> row index, or -1 if the byte is outside 0x81..0xa0. */
static int byte_row_uhc_1 (int byte)
{
  if (byte < 0x81 || byte >= 0xa1)
    return -1;
  return byte - 0x81;
}
/* Second byte -> column index.  Valid bytes are 0x41..0x5a, 0x61..0x7a and
   0x81..0xfe; anything else yields -1. */
static int byte_col_uhc_1 (int byte)
{
  if (byte >= 0x81)
    return byte < 0xff ? byte - 0x4d : -1;
  if (byte >= 0x61)
    return byte < 0x7b ? byte - 0x47 : -1;
  if (byte >= 0x41)
    return byte < 0x5b ? byte - 0x41 : -1;
  return -1;
}
1180 
do_uhc_1(const char * name)1181 static void do_uhc_1 (const char* name)
1182 {
1183   Encoding enc;
1184 
1185   enc.rows = 32;
1186   enc.cols = 178;
1187   enc.row_byte = row_byte_uhc_1;
1188   enc.col_byte = col_byte_uhc_1;
1189   enc.byte_row = byte_row_uhc_1;
1190   enc.byte_col = byte_col_uhc_1;
1191   enc.check_row_expr = "(%1$s >= 0x81 && %1$s < 0xa1)";
1192   enc.check_col_expr = "(%1$s >= 0x41 && %1$s < 0x5b) || (%1$s >= 0x61 && %1$s < 0x7b) || (%1$s >= 0x81 && %1$s < 0xff)";
1193   enc.byte_row_expr = "%1$s - 0x81";
1194   enc.byte_col_expr = "%1$s - (%1$s >= 0x81 ? 0x4d : %1$s >= 0x61 ? 0x47 : 0x41)";
1195 
1196   read_table(&enc);
1197   output_charset2uni_noholes_monotonic(name,&enc);
1198   invert(&enc); output_uni2charset_sparse(name,&enc,true);
1199 }
1200 
1201 /* UHC part 2: 0x{A1..C6}{41..5A,61..7A,81..A0} */
1202 
/* Row index -> first byte: row 0 corresponds to byte 0xa1. */
static int row_byte_uhc_2 (int row)
{
  return row + 0xa1;
}
/* Column index -> second byte.  Columns 0..0x19 start at 0x41, columns
   0x1a..0x33 start at 0x61, and columns from 0x34 on start at 0x81. */
static int col_byte_uhc_2 (int col)
{
  if (col < 0x1a)
    return col + 0x41;
  if (col < 0x34)
    return col + 0x47;
  return col + 0x4d;
}
/* First byte -> row index, or -1 if the byte is outside 0xa1..0xfe. */
static int byte_row_uhc_2 (int byte)
{
  if (byte < 0xa1 || byte >= 0xff)
    return -1;
  return byte - 0xa1;
}
/* Second byte -> column index.  Valid bytes are 0x41..0x5a, 0x61..0x7a and
   0x81..0xa0; anything else yields -1. */
static int byte_col_uhc_2 (int byte)
{
  if (byte >= 0x81)
    return byte < 0xa1 ? byte - 0x4d : -1;
  if (byte >= 0x61)
    return byte < 0x7b ? byte - 0x47 : -1;
  if (byte >= 0x41)
    return byte < 0x5b ? byte - 0x41 : -1;
  return -1;
}
1225 
do_uhc_2(const char * name)1226 static void do_uhc_2 (const char* name)
1227 {
1228   Encoding enc;
1229 
1230   enc.rows = 94;
1231   enc.cols = 84;
1232   enc.row_byte = row_byte_uhc_2;
1233   enc.col_byte = col_byte_uhc_2;
1234   enc.byte_row = byte_row_uhc_2;
1235   enc.byte_col = byte_col_uhc_2;
1236   enc.check_row_expr = "(%1$s >= 0xa1 && %1$s < 0xff)";
1237   enc.check_col_expr = "(%1$s >= 0x41 && %1$s < 0x5b) || (%1$s >= 0x61 && %1$s < 0x7b) || (%1$s >= 0x81 && %1$s < 0xa1)";
1238   enc.byte_row_expr = "%1$s - 0xa1";
1239   enc.byte_col_expr = "%1$s - (%1$s >= 0x81 ? 0x4d : %1$s >= 0x61 ? 0x47 : 0x41)";
1240 
1241   read_table(&enc);
1242   output_charset2uni_noholes_monotonic(name,&enc);
1243   invert(&enc); output_uni2charset_sparse(name,&enc,true);
1244 }
1245 
1246 /* Big5 specifics */
1247 
/* Row index -> first byte: row 0 corresponds to byte 0xa1. */
static int row_byte_big5 (int row)
{
  return row + 0xa1;
}
/* Column index -> second byte.  Columns below 0x3f map to 0x40..0x7e;
   columns from 0x3f on map to bytes starting at 0xa1. */
static int col_byte_big5 (int col)
{
  if (col < 0x3f)
    return col + 0x40;
  return col + 0x62;
}
/* First byte -> row index, or -1 if the byte is outside 0xa1..0xfe. */
static int byte_row_big5 (int byte)
{
  if (byte < 0xa1 || byte >= 0xff)
    return -1;
  return byte - 0xa1;
}
/* Second byte -> column index.  Valid bytes are 0x40..0x7e and 0xa1..0xfe;
   anything else yields -1. */
static int byte_col_big5 (int byte)
{
  if (byte >= 0xa1)
    return byte < 0xff ? byte - 0x62 : -1;
  if (byte >= 0x40 && byte < 0x7f)
    return byte - 0x40;
  return -1;
}
1268 
do_big5(const char * name)1269 static void do_big5 (const char* name)
1270 {
1271   Encoding enc;
1272 
1273   enc.rows = 94;
1274   enc.cols = 157;
1275   enc.row_byte = row_byte_big5;
1276   enc.col_byte = col_byte_big5;
1277   enc.byte_row = byte_row_big5;
1278   enc.byte_col = byte_col_big5;
1279   enc.check_row_expr = "%1$s >= 0xa1 && %1$s < 0xff";
1280   enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0xa1 && %1$s < 0xff)";
1281   enc.byte_row_expr = "%1$s - 0xa1";
1282   enc.byte_col_expr = "%1$s - (%1$s >= 0xa1 ? 0x62 : 0x40)";
1283 
1284   read_table(&enc);
1285   output_charset2uni(name,&enc);
1286   invert(&enc); output_uni2charset_sparse(name,&enc,false);
1287 }
1288 
1289 /* HKSCS specifics */
1290 
/* Row index -> first byte: row 0 corresponds to byte 0x80. */
static int row_byte_hkscs (int row)
{
  return row + 0x80;
}
/* First byte -> row index, or -1 if the byte is outside 0x80..0xfe. */
static int byte_row_hkscs (int byte)
{
  if (byte < 0x80 || byte >= 0xff)
    return -1;
  return byte - 0x80;
}
1300 
do_hkscs(const char * name)1301 static void do_hkscs (const char* name)
1302 {
1303   Encoding enc;
1304 
1305   enc.rows = 128;
1306   enc.cols = 157;
1307   enc.row_byte = row_byte_hkscs;
1308   enc.col_byte = col_byte_big5;
1309   enc.byte_row = byte_row_hkscs;
1310   enc.byte_col = byte_col_big5;
1311   enc.check_row_expr = "%1$s >= 0x80 && %1$s < 0xff";
1312   enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0xa1 && %1$s < 0xff)";
1313   enc.byte_row_expr = "%1$s - 0x80";
1314   enc.byte_col_expr = "%1$s - (%1$s >= 0xa1 ? 0x62 : 0x40)";
1315 
1316   read_table(&enc);
1317   output_charset2uni(name,&enc);
1318   invert(&enc); output_uni2charset_sparse(name,&enc,false);
1319 }
1320 
1321 /* Johab Hangul specifics */
1322 
/* Row index -> first byte: row 0 corresponds to byte 0x84. */
static int row_byte_johab_hangul (int row)
{
  return row + 0x84;
}
/* Column index -> second byte.  Columns below 0x3e map to 0x41..0x7e;
   columns from 0x3e on map to bytes starting at 0x81. */
static int col_byte_johab_hangul (int col)
{
  if (col < 0x3e)
    return col + 0x41;
  return col + 0x43;
}
/* First byte -> row index, or -1 if the byte is outside 0x84..0xd3. */
static int byte_row_johab_hangul (int byte)
{
  if (byte < 0x84 || byte >= 0xd4)
    return -1;
  return byte - 0x84;
}
/* Second byte -> column index.  Valid bytes are 0x41..0x7e and 0x81..0xfe;
   anything else yields -1. */
static int byte_col_johab_hangul (int byte)
{
  if (byte >= 0x81)
    return byte < 0xff ? byte - 0x43 : -1;
  if (byte >= 0x41 && byte < 0x7f)
    return byte - 0x41;
  return -1;
}
1343 
do_johab_hangul(const char * name)1344 static void do_johab_hangul (const char* name)
1345 {
1346   Encoding enc;
1347 
1348   enc.rows = 80;
1349   enc.cols = 188;
1350   enc.row_byte = row_byte_johab_hangul;
1351   enc.col_byte = col_byte_johab_hangul;
1352   enc.byte_row = byte_row_johab_hangul;
1353   enc.byte_col = byte_col_johab_hangul;
1354   enc.check_row_expr = "%1$s >= 0x84 && %1$s < 0xd4";
1355   enc.check_col_expr = "(%1$s >= 0x41 && %1$s < 0x7f) || (%1$s >= 0x81 && %1$s < 0xff)";
1356   enc.byte_row_expr = "%1$s - 0x84";
1357   enc.byte_col_expr = "%1$s - (%1$s >= 0x81 ? 0x43 : 0x41)";
1358 
1359   read_table(&enc);
1360   output_charset2uni(name,&enc);
1361   invert(&enc); output_uni2charset_dense(name,&enc);
1362 }
1363 
1364 /* SJIS specifics */
1365 
/* Row index -> first byte.  Rows below 0x1f map to 0x81..0x9f; rows from
   0x1f on map to bytes starting at 0xe0. */
static int row_byte_sjis (int row)
{
  if (row < 0x1f)
    return row + 0x81;
  return row + 0xc1;
}
/* Column index -> second byte.  Columns below 0x3f map to 0x40..0x7e; from
   column 0x3f on the result is one higher, so byte 0x7f is never produced. */
static int col_byte_sjis (int col)
{
  if (col < 0x3f)
    return col + 0x40;
  return col + 0x41;
}
/* First byte -> row index.  Bytes 0x81..0x9f give rows 0..0x1e; bytes from
   0xe0 upward (no upper bound is checked) give rows from 0x1f; anything
   else yields -1. */
static int byte_row_sjis (int byte)
{
  if (byte >= 0xe0)
    return byte - 0xc1;
  if (byte < 0x81 || byte >= 0xa0)
    return -1;
  return byte - 0x81;
}
/* Second byte -> column index.  Valid bytes are 0x40..0x7e and 0x80..0xfc;
   anything else yields -1. */
static int byte_col_sjis (int byte)
{
  if (byte >= 0x80)
    return byte < 0xfd ? byte - 0x41 : -1;
  if (byte >= 0x40 && byte < 0x7f)
    return byte - 0x40;
  return -1;
}
1388 
do_sjis(const char * name)1389 static void do_sjis (const char* name)
1390 {
1391   Encoding enc;
1392 
1393   enc.rows = 94;
1394   enc.cols = 188;
1395   enc.row_byte = row_byte_sjis;
1396   enc.col_byte = col_byte_sjis;
1397   enc.byte_row = byte_row_sjis;
1398   enc.byte_col = byte_col_sjis;
1399   enc.check_row_expr = "(%1$s >= 0x81 && %1$s < 0xa0) || (%1$s >= 0xe0)";
1400   enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xfd)";
1401   enc.byte_row_expr = "%1$s - (%1$s >= 0xe0 ? 0xc1 : 0x81)";
1402   enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
1403 
1404   read_table(&enc);
1405   output_charset2uni(name,&enc);
1406   invert(&enc); output_uni2charset_sparse(name,&enc,false);
1407 }
1408 
1409 /* GB18030 Unicode specifics */
1410 
/*
 * Generates, on standard output, the range tables, the bitmap and the
 * NAME_mbtowc / NAME_wctomb functions for four-byte codes.
 *
 * Standard input is a unicode.org style .TXT file: lines of the form
 * "0x<4-byte code> 0x<unicode>", with '#' starting a comment.  The four
 * bytes b1..b4 must lie in 0x81..0x84, 0x30..0x39, 0x81..0xfe, 0x30..0x39
 * and are flattened into the linear index
 *   i = (((b1-0x81)*10 + (b2-0x30))*126 + (b3-0x81))*10 + (b4-0x30).
 * The Unicode value must be below 0x10000.  A value of 0 is indistinguishable
 * from "no entry", because 0 marks unused slots of charset2uni[].
 *
 * The entries are grouped into at most 256 ranges of constant difference
 * (unicode - index); slots inside a range that have no entry are recorded
 * as 0 bits in the emitted bitmap.
 *
 * Exits with status 1 on unparsable input, out-of-range bytes or Unicode
 * values, a mapping that is not monotonically increasing, more than 256
 * ranges, or input without any entry.
 */
static void do_gb18030uni (const char* name)
{
  int c;
  unsigned int bytes;
  unsigned int ucs;  /* the %x conversion of scanf stores an unsigned int */
  int i1, i2, i3, i4, i, j, k;
  int charset2uni[4*10*126*10];
  struct { int low; int high; int diff; int total; } ranges[256];
  int ranges_count, ranges_total;

  for (i = 0; i < 4*10*126*10; i++)
    charset2uni[i] = 0;

  /* Read a unicode.org style .TXT file. */
  for (;;) {
    c = getc(stdin);
    if (c == EOF)
      break;
    if (c == '\n' || c == ' ' || c == '\t')
      continue;
    if (c == '#') {
      /* Skip the comment up to the end of the line. */
      do { c = getc(stdin); } while (!(c == EOF || c == '\n'));
      continue;
    }
    ungetc(c,stdin);
    if (scanf("0x%x", &bytes) != 1)
      exit(1);
    i1 = (bytes >> 24) & 0xff;
    i2 = (bytes >> 16) & 0xff;
    i3 = (bytes >> 8) & 0xff;
    i4 = bytes & 0xff;
    if (!(i1 >= 0x81 && i1 <= 0x84
          && i2 >= 0x30 && i2 <= 0x39
          && i3 >= 0x81 && i3 <= 0xfe
          && i4 >= 0x30 && i4 <= 0x39)) {
      fprintf(stderr, "lost entry for %02x %02x %02x %02x\n", i1, i2, i3, i4);
      exit(1);
    }
    i = (((i1-0x81) * 10 + (i2-0x30)) * 126 + (i3-0x81)) * 10 + (i4-0x30);
    if (scanf(" 0x%x", &ucs) != 1)
      exit(1);
    if (!(ucs < 0x10000))
      exit(1);
    charset2uni[i] = (int) ucs;
  }

  /* Verify that the mapping i -> j is monotonically increasing and
     of the form
        low[k] <= i <= high[k]  =>  j = diff[k] + i
     with a set of disjoint intervals (low[k], high[k]). */
  ranges_count = 0;
  for (i = 0; i < 4*10*126*10; i++)
    if (charset2uni[i] != 0) {
      int diff;
      j = charset2uni[i];
      diff = j - i;
      if (ranges_count > 0) {
        if (!(i > ranges[ranges_count-1].high))
          exit(1);
        if (!(j > ranges[ranges_count-1].high + ranges[ranges_count-1].diff))
          exit(1);
        /* Additional property: The diffs are also increasing. */
        if (!(diff >= ranges[ranges_count-1].diff))
          exit(1);
      }
      if (ranges_count > 0 && diff == ranges[ranges_count-1].diff)
        ranges[ranges_count-1].high = i;
      else {
        if (ranges_count == 256)
          exit(1);
        ranges[ranges_count].low = i;
        ranges[ranges_count].high = i;
        ranges[ranges_count].diff = diff;
        ranges_count++;
      }
    }

  /* The generated bounds checks below are built from ranges[0] and
     ranges[ranges_count-1]; with no entries neither element exists. */
  if (ranges_count == 0) {
    fprintf(stderr, "no mapping entries found in input\n");
    exit(1);
  }

  /* Determine size of bitmap. */
  ranges_total = 0;
  for (k = 0; k < ranges_count; k++) {
    ranges[k].total = ranges_total;
    ranges_total += ranges[k].high - ranges[k].low + 1;
  }

  printf("static const unsigned short %s_charset2uni_ranges[%d] = {\n", name, 2*ranges_count);
  for (k = 0; k < ranges_count; k++) {
    printf("  0x%04x, 0x%04x", ranges[k].low, ranges[k].high);
    if (k+1 < ranges_count) printf(",");
    if ((k % 4) == 3 && k+1 < ranges_count) printf("\n");
  }
  printf("\n");
  printf("};\n");

  printf("\n");

  printf("static const unsigned short %s_uni2charset_ranges[%d] = {\n", name, 2*ranges_count);
  for (k = 0; k < ranges_count; k++) {
    printf("  0x%04x, 0x%04x", ranges[k].low + ranges[k].diff, ranges[k].high + ranges[k].diff);
    if (k+1 < ranges_count) printf(",");
    if ((k % 4) == 3 && k+1 < ranges_count) printf("\n");
  }
  printf("\n");
  printf("};\n");

  printf("\n");

  printf("static const struct { unsigned short diff; unsigned short bitmap_offset; } %s_ranges[%d] = {\n ", name, ranges_count);
  for (k = 0; k < ranges_count; k++) {
    printf(" { %5d, 0x%04x }", ranges[k].diff, ranges[k].total);
    if (k+1 < ranges_count) printf(",");
    if ((k % 4) == 3 && k+1 < ranges_count) printf("\n ");
  }
  printf("\n");
  printf("};\n");

  printf("\n");

  printf("static const unsigned char %s_bitmap[%d] = {\n ", name, (ranges_total + 7) / 8);
  {
    /* Bits are collected LSB first; a byte is printed each time 8 bits
       have been accumulated, and any partial last byte after the loop. */
    int accu = 0;
    for (k = 0; k < ranges_count; k++) {
      for (i = ranges[k].total; i <= ranges[k].total + (ranges[k].high - ranges[k].low);) {
        if (charset2uni[i - ranges[k].total + ranges[k].low] != 0)
          accu |= (1 << (i % 8));
        i++;
        if ((i % 8) == 0) {
          printf(" 0x%02x", accu);
          if ((i / 8) < (ranges_total + 7) / 8) printf(",");
          if (((i / 8) % 12) == 0)
            printf("\n ");
          accu = 0;
        }
      }
      /* Consistency check: the bit position must now be where the next
         range's bits start. */
      if (i != (k+1 < ranges_count ? ranges[k+1].total : ranges_total)) abort();
    }
    if ((ranges_total % 8) != 0)
      printf(" 0x%02x", accu);
    printf("\n");
  }
  printf("};\n");

  printf("\n");

  printf("static int\n");
  printf("%s_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)\n", name);
  printf("{\n");
  printf("  unsigned char c1 = s[0];\n");
  printf("  if (c1 >= 0x81 && c1 <= 0x84) {\n");
  printf("    if (n >= 2) {\n");
  printf("      unsigned char c2 = s[1];\n");
  printf("      if (c2 >= 0x30 && c2 <= 0x39) {\n");
  printf("        if (n >= 3) {\n");
  printf("          unsigned char c3 = s[2];\n");
  printf("          if (c3 >= 0x81 && c3 <= 0xfe) {\n");
  printf("            if (n >= 4) {\n");
  printf("              unsigned char c4 = s[3];\n");
  printf("              if (c4 >= 0x30 && c4 <= 0x39) {\n");
  printf("                unsigned int i = (((c1 - 0x81) * 10 + (c2 - 0x30)) * 126 + (c3 - 0x81)) * 10 + (c4 - 0x30);\n");
  printf("                if (i >= %d && i <= %d) {\n", ranges[0].low, ranges[ranges_count-1].high);
  printf("                  unsigned int k1 = 0;\n");
  printf("                  unsigned int k2 = %d;\n", ranges_count-1);
  printf("                  while (k1 < k2) {\n");
  printf("                    unsigned int k = (k1 + k2) / 2;\n");
  printf("                    if (i <= %s_charset2uni_ranges[2*k+1])\n", name);
  printf("                      k2 = k;\n");
  printf("                    else if (i >= %s_charset2uni_ranges[2*k+2])\n", name);
  printf("                      k1 = k + 1;\n");
  printf("                    else\n");
  printf("                      return RET_ILSEQ;\n");
  printf("                  }\n");
  printf("                  {\n");
  printf("                    unsigned int bitmap_index = i - %s_charset2uni_ranges[2*k1] + %s_ranges[k1].bitmap_offset;\n", name, name);
  printf("                    if ((%s_bitmap[bitmap_index >> 3] >> (bitmap_index & 7)) & 1) {\n", name);
  printf("                      unsigned int diff = %s_ranges[k1].diff;\n", name);
  printf("                      *pwc = (ucs4_t) (i + diff);\n");
  printf("                      return 4;\n");
  printf("                    }\n");
  printf("                  }\n");
  printf("                }\n");
  printf("              }\n");
  printf("              return RET_ILSEQ;\n");
  printf("            }\n");
  printf("            return RET_TOOFEW(0);\n");
  printf("          }\n");
  printf("          return RET_ILSEQ;\n");
  printf("        }\n");
  printf("        return RET_TOOFEW(0);\n");
  printf("      }\n");
  printf("      return RET_ILSEQ;\n");
  printf("    }\n");
  printf("    return RET_TOOFEW(0);\n");
  printf("  }\n");
  printf("  return RET_ILSEQ;\n");
  printf("}\n");

  printf("\n");

  printf("static int\n");
  printf("%s_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)\n", name);
  printf("{\n");
  printf("  if (n >= 4) {\n");
  printf("    unsigned int i = wc;\n");
  printf("    if (i >= 0x%04x && i <= 0x%04x) {\n", ranges[0].low + ranges[0].diff, ranges[ranges_count-1].high + ranges[ranges_count-1].diff);
  printf("      unsigned int k1 = 0;\n");
  printf("      unsigned int k2 = %d;\n", ranges_count-1);
  printf("      while (k1 < k2) {\n");
  printf("        unsigned int k = (k1 + k2) / 2;\n");
  printf("        if (i <= %s_uni2charset_ranges[2*k+1])\n", name);
  printf("          k2 = k;\n");
  printf("        else if (i >= %s_uni2charset_ranges[2*k+2])\n", name);
  printf("          k1 = k + 1;\n");
  printf("        else\n");
  printf("          return RET_ILUNI;\n");
  printf("      }\n");
  printf("      {\n");
  printf("        unsigned int bitmap_index = i - %s_uni2charset_ranges[2*k1] + %s_ranges[k1].bitmap_offset;\n", name, name);
  printf("        if ((%s_bitmap[bitmap_index >> 3] >> (bitmap_index & 7)) & 1) {\n", name);
  printf("          unsigned int diff = %s_ranges[k1].diff;\n", name);
  printf("          i -= diff;\n");
  printf("          r[3] = (i %% 10) + 0x30; i = i / 10;\n");
  printf("          r[2] = (i %% 126) + 0x81; i = i / 126;\n");
  printf("          r[1] = (i %% 10) + 0x30; i = i / 10;\n");
  printf("          r[0] = i + 0x81;\n");
  printf("          return 4;\n");
  printf("        }\n");
  printf("      }\n");
  printf("    }\n");
  printf("    return RET_ILUNI;\n");
  printf("  }\n");
  printf("  return RET_TOOSMALL;\n");
  printf("}\n");
}
1646 
1647 /* JISX0213 specifics */
1648 
do_jisx0213(const char * name)1649 static void do_jisx0213 (const char* name)
1650 {
1651   printf("#ifndef _JISX0213_H\n");
1652   printf("#define _JISX0213_H\n");
1653   printf("\n");
1654   printf("/* JISX0213 plane 1 (= ISO-IR-233) characters are in the range\n");
1655   printf("   0x{21..7E}{21..7E}.\n");
1656   printf("   JISX0213 plane 2 (= ISO-IR-229) characters are in the range\n");
1657   printf("   0x{21,23..25,28,2C..2F,6E..7E}{21..7E}.\n");
1658   printf("   Together this makes 120 rows of 94 characters.\n");
1659   printf("*/\n");
1660   printf("\n");
1661   {
1662 #define row_convert(row) \
1663       ((row) >= 0x121 && (row) <= 0x17E ? row-289 : /* 0..93 */    \
1664        (row) == 0x221                   ? row-451 : /* 94 */       \
1665        (row) >= 0x223 && (row) <= 0x225 ? row-452 : /* 95..97 */   \
1666        (row) == 0x228                   ? row-454 : /* 98 */       \
1667        (row) >= 0x22C && (row) <= 0x22F ? row-457 : /* 99..102 */  \
1668        (row) >= 0x26E && (row) <= 0x27E ? row-519 : /* 103..119 */ \
1669        -1)
1670     unsigned int table[120][94];
1671     int pagemin[0x1100];
1672     int pagemax[0x1100];
1673     int pageidx[0x1100];
1674     unsigned int pagestart[0x1100];
1675     unsigned int pagestart_len = 0;
1676     {
1677       unsigned int rowc, colc;
1678       for (rowc = 0; rowc < 120; rowc++)
1679         for (colc = 0; colc < 94; colc++)
1680           table[rowc][colc] = 0;
1681     }
1682     {
1683       unsigned int page;
1684       for (page = 0; page < 0x1100; page++)
1685         pagemin[page] = -1;
1686       for (page = 0; page < 0x1100; page++)
1687         pagemax[page] = -1;
1688       for (page = 0; page < 0x1100; page++)
1689         pageidx[page] = -1;
1690     }
1691     printf("static const unsigned short jisx0213_to_ucs_combining[][2] = {\n");
1692     {
1693       int private_use = 0x0001;
1694       for (;;) {
1695         char line[30];
1696         unsigned int row, col;
1697         unsigned int ucs;
1698         memset(line,0,sizeof(line));
1699         if (scanf("%[^\n]\n",line) < 1)
1700           break;
1701         assert(line[0]=='0');
1702         assert(line[1]=='x');
1703         assert(isxdigit(line[2]));
1704         assert(isxdigit(line[3]));
1705         assert(isxdigit(line[4]));
1706         assert(isxdigit(line[5]));
1707         assert(isxdigit(line[6]));
1708         assert(line[7]=='\t');
1709         line[7] = '\0';
1710         col = strtoul(&line[5],NULL,16);
1711         line[5] = '\0';
1712         row = strtoul(&line[2],NULL,16);
1713         if (line[20] != '\0' && line[21] == '\0') {
1714           unsigned int u1, u2;
1715           assert(line[8]=='0');
1716           assert(line[9]=='x');
1717           assert(isxdigit(line[10]));
1718           assert(isxdigit(line[11]));
1719           assert(isxdigit(line[12]));
1720           assert(isxdigit(line[13]));
1721           assert(line[14]==' ');
1722           assert(line[15]=='0');
1723           assert(line[16]=='x');
1724           assert(isxdigit(line[17]));
1725           assert(isxdigit(line[18]));
1726           assert(isxdigit(line[19]));
1727           assert(isxdigit(line[20]));
1728           u2 = strtoul(&line[17],NULL,16);
1729           line[14] = '\0';
1730           u1 = strtoul(&line[10],NULL,16);
1731           printf("  { 0x%04x, 0x%04x },\n", u1, u2);
1732           ucs = private_use++;
1733         } else {
1734           assert(line[8]=='0');
1735           assert(line[9]=='x');
1736           assert(isxdigit(line[10]));
1737           assert(isxdigit(line[11]));
1738           assert(isxdigit(line[12]));
1739           assert(isxdigit(line[13]));
1740           ucs = strtoul(&line[10],NULL,16);
1741         }
1742         assert((unsigned int) row_convert(row) < 120);
1743         assert((unsigned int) (col-0x21) < 94);
1744         table[row_convert(row)][col-0x21] = ucs;
1745       }
1746     }
1747     printf("};\n");
1748     printf("\n");
1749     {
1750       unsigned int rowc, colc;
1751       for (rowc = 0; rowc < 120; rowc++) {
1752         for (colc = 0; colc < 94; colc++) {
1753           unsigned int value = table[rowc][colc];
1754           unsigned int page = value >> 8;
1755           unsigned int rest = value & 0xff;
1756           if (pagemin[page] < 0 || pagemin[page] > rest) pagemin[page] = rest;
1757           if (pagemax[page] < 0 || pagemax[page] < rest) pagemax[page] = rest;
1758         }
1759       }
1760     }
1761     {
1762       unsigned int index = 0;
1763       unsigned int i;
1764       for (i = 0; i < 0x1100; ) {
1765         if (pagemin[i] >= 0) {
1766           if (pagemin[i+1] >= 0 && pagemin[i] >= 0x80 && pagemax[i+1] < 0x80) {
1767             /* Combine two pages into a single one. */
1768             assert(pagestart_len < sizeof(pagestart)/sizeof(pagestart[0]));
1769             pagestart[pagestart_len++] = (i<<8)+0x80;
1770             pageidx[i] = index;
1771             pageidx[i+1] = index;
1772             index++;
1773             i += 2;
1774           } else {
1775             /* A single page. */
1776             assert(pagestart_len < sizeof(pagestart)/sizeof(pagestart[0]));
1777             pagestart[pagestart_len++] = i<<8;
1778             pageidx[i] = index;
1779             index++;
1780             i += 1;
1781           }
1782         } else
1783           i++;
1784       }
1785     }
1786     printf("static const unsigned short jisx0213_to_ucs_main[120 * 94] = {\n");
1787     {
1788       unsigned int row;
1789       for (row = 0; row < 0x300; row++) {
1790         unsigned int rowc = row_convert(row);
1791         if (rowc != (unsigned int) (-1)) {
1792           printf("  /* 0x%X21..0x%X7E */\n",row,row);
1793           {
1794             unsigned int count = 0;
1795             unsigned int colc;
1796             for (colc = 0; colc < 94; colc++) {
1797               if ((count % 8) == 0) printf(" ");
1798               {
1799                 unsigned int value = table[rowc][colc];
1800                 unsigned int page = value >> 8;
1801                 unsigned int index = pageidx[page];
1802                 assert(value-pagestart[index] < 0x100);
1803                 printf(" 0x%04x,",(index<<8)|(value-pagestart[index]));
1804               }
1805               count++;
1806               if ((count % 8) == 0) printf("\n");
1807             }
1808           }
1809           printf("\n");
1810         }
1811       }
1812     }
1813     printf("};\n");
1814     printf("\n");
1815     printf("static const ucs4_t jisx0213_to_ucs_pagestart[] = {\n");
1816     {
1817       unsigned int count = 0;
1818       unsigned int i;
1819       for (i = 0; i < pagestart_len; i++) {
1820         char buf[10];
1821         if ((count % 8) == 0) printf(" ");
1822         printf(" ");
1823         sprintf(buf,"0x%04x",pagestart[i]);
1824         if (strlen(buf) < 7) printf("%*s",7-strlen(buf),"");
1825         printf("%s,",buf);
1826         count++;
1827         if ((count % 8) == 0) printf("\n");
1828       }
1829     }
1830     printf("\n");
1831     printf("};\n");
1832 #undef row_convert
1833   }
1834   rewind(stdin);
1835   printf("\n");
1836   {
1837     int table[0x110000];
1838     bool pages[0x4400];
1839     int maxpage = -1;
1840     unsigned int combining_prefixes[100];
1841     unsigned int combining_prefixes_len = 0;
1842     {
1843       unsigned int i;
1844       for (i = 0; i < 0x110000; i++)
1845         table[i] = -1;
1846       for (i = 0; i < 0x4400; i++)
1847         pages[i] = false;
1848     }
1849     for (;;) {
1850       char line[30];
1851       unsigned int plane, row, col;
1852       memset(line,0,sizeof(line));
1853       if (scanf("%[^\n]\n",line) < 1)
1854         break;
1855       assert(line[0]=='0');
1856       assert(line[1]=='x');
1857       assert(isxdigit(line[2]));
1858       assert(isxdigit(line[3]));
1859       assert(isxdigit(line[4]));
1860       assert(isxdigit(line[5]));
1861       assert(isxdigit(line[6]));
1862       assert(line[7]=='\t');
1863       line[7] = '\0';
1864       col = strtoul(&line[5],NULL,16);
1865       line[5] = '\0';
1866       row = strtoul(&line[3],NULL,16);
1867       line[3] = '\0';
1868       plane = strtoul(&line[2],NULL,16) - 1;
1869       if (line[20] != '\0' && line[21] == '\0') {
1870         unsigned int u1, u2;
1871         assert(line[8]=='0');
1872         assert(line[9]=='x');
1873         assert(isxdigit(line[10]));
1874         assert(isxdigit(line[11]));
1875         assert(isxdigit(line[12]));
1876         assert(isxdigit(line[13]));
1877         assert(line[14]==' ');
1878         assert(line[15]=='0');
1879         assert(line[16]=='x');
1880         assert(isxdigit(line[17]));
1881         assert(isxdigit(line[18]));
1882         assert(isxdigit(line[19]));
1883         assert(isxdigit(line[20]));
1884         u2 = strtoul(&line[17],NULL,16);
1885         line[14] = '\0';
1886         u1 = strtoul(&line[10],NULL,16);
1887         assert(u2 == 0x02E5 || u2 == 0x02E9 || u2 == 0x0300 || u2 == 0x0301
1888                || u2 == 0x309A);
1889         assert(combining_prefixes_len < sizeof(combining_prefixes)/sizeof(combining_prefixes[0]));
1890         combining_prefixes[combining_prefixes_len++] = u1;
1891       } else {
1892         unsigned int ucs;
1893         assert(line[8]=='0');
1894         assert(line[9]=='x');
1895         assert(isxdigit(line[10]));
1896         assert(isxdigit(line[11]));
1897         assert(isxdigit(line[12]));
1898         assert(isxdigit(line[13]));
1899         ucs = strtoul(&line[10],NULL,16);
1900         /* Add an entry. */
1901         assert(plane <= 1);
1902         assert(row <= 0x7f);
1903         assert(col <= 0x7f);
1904         table[ucs] = (plane << 15) | (row << 8) | col;
1905         pages[ucs>>6] = true;
1906         if (maxpage < 0 || (ucs>>6) > maxpage) maxpage = ucs>>6;
1907       }
1908     }
1909     {
1910       unsigned int i;
1911       for (i = 0; i < combining_prefixes_len; i++) {
1912         unsigned int u1 = combining_prefixes[i];
1913         assert(table[u1] >= 0);
1914         table[u1] |= 0x0080;
1915       }
1916     }
1917     printf("static const short jisx0213_from_ucs_level1[%d] = {\n",maxpage+1);
1918     {
1919       unsigned int index = 0;
1920       unsigned int i;
1921       for (i = 0; i <= maxpage; i++) {
1922         if ((i % 8) == 0) printf(" ");
1923         if (pages[i]) {
1924           printf(" %3u,",index);
1925           index++;
1926         } else {
1927           printf(" %3d,",-1);
1928         }
1929         if (((i+1) % 8) == 0) printf("\n");
1930       }
1931     }
1932     printf("\n");
1933     printf("};\n");
1934     printf("\n");
1935     #if 0 /* Dense array */
1936     printf("static const unsigned short jisx0213_from_ucs_level2[] = {\n");
1937     {
1938       unsigned int i;
1939       for (i = 0; i <= maxpage; i++) {
1940         if (pages[i]) {
1941           printf("  /* 0x%04X */\n",i<<6);
1942           {
1943             unsigned int j;
1944             for (j = 0; j < 0x40; ) {
1945               unsigned int ucs = (i<<6)+j;
1946               int value = table[ucs];
1947               if (value < 0) value = 0;
1948               if ((j % 8) == 0) printf(" ");
1949               printf(" 0x%04x,",value);
1950               j++;
1951               if ((j % 8) == 0) printf("\n");
1952             }
1953           }
1954         }
1955       }
1956     }
1957     printf("};\n");
1958     #else /* Sparse array */
1959     {
1960       int summary_indx[0x11000];
1961       int summary_used[0x11000];
1962       unsigned int i, k, indx;
1963       printf("static const unsigned short jisx0213_from_ucs_level2_data[] = {\n");
1964       /* Fill summary_indx[] and summary_used[]. */
1965       indx = 0;
1966       for (i = 0, k = 0; i <= maxpage; i++) {
1967         if (pages[i]) {
1968           unsigned int j1, j2;
1969           unsigned int count = 0;
1970           printf("  /* 0x%04X */\n",i<<6);
1971           for (j1 = 0; j1 < 4; j1++) {
1972             summary_indx[4*k+j1] = indx;
1973             summary_used[4*k+j1] = 0;
1974             for (j2 = 0; j2 < 16; j2++) {
1975               unsigned int j = 16*j1+j2;
1976               unsigned int ucs = (i<<6)+j;
1977               int value = table[ucs];
1978               if (value < 0) value = 0;
1979               if (value > 0) {
1980                 summary_used[4*k+j1] |= (1 << j2);
1981                 if ((count % 8) == 0) printf(" ");
1982                 printf(" 0x%04x,",value);
1983                 count++;
1984                 if ((count % 8) == 0) printf("\n");
1985                 indx++;
1986               }
1987             }
1988           }
1989           if ((count % 8) > 0)
1990             printf("\n");
1991           k++;
1992         }
1993       }
1994       printf("};\n");
1995       printf("\n");
1996       printf("static const Summary16 jisx0213_from_ucs_level2_2indx[] = {\n");
1997       for (i = 0, k = 0; i <= maxpage; i++) {
1998         if (pages[i]) {
1999           unsigned int j1;
2000           printf("  /* 0x%04X */\n",i<<6);
2001           printf(" ");
2002           for (j1 = 0; j1 < 4; j1++) {
2003             printf(" { %4d, 0x%04x },", summary_indx[4*k+j1], summary_used[4*k+j1]);
2004           }
2005           printf("\n");
2006           k++;
2007         }
2008       }
2009       printf("};\n");
2010     }
2011     #endif
2012     printf("\n");
2013   }
2014   printf("#ifdef __GNUC__\n");
2015   printf("__inline\n");
2016   printf("#else\n");
2017   printf("#ifdef __cplusplus\n");
2018   printf("inline\n");
2019   printf("#endif\n");
2020   printf("#endif\n");
2021   printf("static ucs4_t jisx0213_to_ucs4 (unsigned int row, unsigned int col)\n");
2022   printf("{\n");
2023   printf("  ucs4_t val;\n");
2024   printf("\n");
2025   printf("  if (row >= 0x121 && row <= 0x17e)\n");
2026   printf("    row -= 289;\n");
2027   printf("  else if (row == 0x221)\n");
2028   printf("    row -= 451;\n");
2029   printf("  else if (row >= 0x223 && row <= 0x225)\n");
2030   printf("    row -= 452;\n");
2031   printf("  else if (row == 0x228)\n");
2032   printf("    row -= 454;\n");
2033   printf("  else if (row >= 0x22c && row <= 0x22f)\n");
2034   printf("    row -= 457;\n");
2035   printf("  else if (row >= 0x26e && row <= 0x27e)\n");
2036   printf("    row -= 519;\n");
2037   printf("  else\n");
2038   printf("    return 0x0000;\n");
2039   printf("\n");
2040   printf("  if (col >= 0x21 && col <= 0x7e)\n");
2041   printf("    col -= 0x21;\n");
2042   printf("  else\n");
2043   printf("    return 0x0000;\n");
2044   printf("\n");
2045   printf("  val = jisx0213_to_ucs_main[row * 94 + col];\n");
2046   printf("  val = jisx0213_to_ucs_pagestart[val >> 8] + (val & 0xff);\n");
2047   printf("  if (val == 0xfffd)\n");
2048   printf("    val = 0x0000;\n");
2049   printf("  return val;\n");
2050   printf("}\n");
2051   printf("\n");
2052   printf("#ifdef __GNUC__\n");
2053   printf("__inline\n");
2054   printf("#else\n");
2055   printf("#ifdef __cplusplus\n");
2056   printf("inline\n");
2057   printf("#endif\n");
2058   printf("#endif\n");
2059   printf("static unsigned short ucs4_to_jisx0213 (ucs4_t ucs)\n");
2060   printf("{\n");
2061   printf("  if (ucs < (sizeof(jisx0213_from_ucs_level1)/sizeof(jisx0213_from_ucs_level1[0])) << 6) {\n");
2062   printf("    int index1 = jisx0213_from_ucs_level1[ucs >> 6];\n");
2063   printf("    if (index1 >= 0)");
2064   #if 0 /* Dense array */
2065   printf("\n");
2066   printf("      return jisx0213_from_ucs_level2[(index1 << 6) + (ucs & 0x3f)];\n");
2067   #else /* Sparse array */
2068   printf(" {\n");
2069   printf("      const Summary16 *summary = &jisx0213_from_ucs_level2_2indx[((index1 << 6) + (ucs & 0x3f)) >> 4];\n");
2070   printf("      unsigned short used = summary->used;\n");
2071   printf("      unsigned int i = ucs & 0x0f;\n");
2072   printf("      if (used & ((unsigned short) 1 << i)) {\n");
2073   printf("        /* Keep in `used' only the bits 0..i-1. */\n");
2074   printf("        used &= ((unsigned short) 1 << i) - 1;\n");
2075   printf("        /* Add `summary->indx' and the number of bits set in `used'. */\n");
2076   printf("        used = (used & 0x5555) + ((used & 0xaaaa) >> 1);\n");
2077   printf("        used = (used & 0x3333) + ((used & 0xcccc) >> 2);\n");
2078   printf("        used = (used & 0x0f0f) + ((used & 0xf0f0) >> 4);\n");
2079   printf("        used = (used & 0x00ff) + (used >> 8);\n");
2080   printf("        return jisx0213_from_ucs_level2_data[summary->indx + used];\n");
2081   printf("      };\n");
2082   printf("    };\n");
2083   #endif
2084   printf("  }\n");
2085   printf("  return 0x0000;\n");
2086   printf("}\n");
2087   printf("\n");
2088   printf("#endif /* _JISX0213_H */\n");
2089 }
2090 
2091 /* Main program */
2092 
/* Program entry point.

   Expects exactly two command-line arguments:
     argv[1]  the charset name, handed to output_title();
     argv[2]  the table name, which selects the do_*() generator to run.
   (The usage examples in the header comment of this file invoke the tool
   with the input table redirected to stdin and the generated header
   redirected from stdout.)

   Returns 0 after the selected generator has run.  Exits with status 1,
   after printing a diagnostic on stderr, when the argument count is wrong
   or the table name is not one of the names tested below.  */
int main (int argc, char *argv[])
{
  const char* progname;
  const char* charsetname;
  const char* name;

  /* argv[0] may be missing when argc is 0; fall back to the tool's name as
     used in the usage examples so that diagnostics stay readable.  */
  progname = (argc > 0 && argv[0] != NULL ? argv[0] : "cjk_tab_to_h");

  if (argc != 3) {
    /* stdout is normally redirected into the generated header, so report
       on stderr instead of failing silently.  */
    fprintf(stderr,"Usage: %s CHARSETNAME NAME < TABLE.TXT > NAME.h\n",
            progname);
    exit(1);
  }
  charsetname = argv[1];
  name = argv[2];

  /* The title is emitted before the table name is validated; this order is
     kept so that the bytes written to stdout are unchanged.  */
  output_title(charsetname);

  if (!strcmp(name,"gb2312")
      || !strcmp(name,"isoir165ext") || !strcmp(name,"gb12345ext")
      || !strcmp(name,"jisx0208") || !strcmp(name,"jisx0212"))
    do_normal(name);
  else if (!strcmp(name,"cns11643_1") || !strcmp(name,"cns11643_2")
           || !strcmp(name,"cns11643_3") || !strcmp(name,"cns11643_4a")
           || !strcmp(name,"cns11643_4b") || !strcmp(name,"cns11643_5")
           || !strcmp(name,"cns11643_6") || !strcmp(name,"cns11643_7")
           || !strcmp(name,"cns11643_15"))
    do_normal_only_charset2uni(name);
  else if (!strcmp(name,"cns11643_inv"))
    do_cns11643_only_uni2charset(name);
  else if (!strcmp(name,"gbkext1"))
    do_gbk1_only_charset2uni(name);
  else if (!strcmp(name,"gbkext2"))
    do_gbk2_only_charset2uni(name);
  else if (!strcmp(name,"gbkext_inv"))
    do_gbk1_only_uni2charset(name);
  else if (!strcmp(name,"cp936ext") || !strcmp(name,"gb18030ext"))
    do_gbk1(name);
  else if (!strcmp(name,"ksc5601"))
    do_ksc5601(name);
  else if (!strcmp(name,"uhc_1"))
    do_uhc_1(name);
  else if (!strcmp(name,"uhc_2"))
    do_uhc_2(name);
  else if (!strcmp(name,"big5") || !strcmp(name,"cp950ext"))
    do_big5(name);
  else if (!strcmp(name,"hkscs1999") || !strcmp(name,"hkscs2001")
           || !strcmp(name,"hkscs2004"))
    do_hkscs(name);
  else if (!strcmp(name,"johab_hangul"))
    do_johab_hangul(name);
  else if (!strcmp(name,"cp932ext"))
    do_sjis(name);
  else if (!strcmp(name,"gb18030uni"))
    do_gb18030uni(name);
  else if (!strcmp(name,"jisx0213"))
    do_jisx0213(name);
  else {
    /* Unknown table name: say so instead of exiting without a trace.  */
    fprintf(stderr,"%s: unknown table name '%s'\n",progname,name);
    exit(1);
  }

  return 0;
}
2149