1 /* Copyright (C) 1999-2003, 2005, 2011 Free Software Foundation, Inc.
2    This file is part of the GNU LIBICONV Library.
3 
4    The GNU LIBICONV Library is free software; you can redistribute it
5    and/or modify it under the terms of the GNU Library General Public
6    License as published by the Free Software Foundation; either version 2
7    of the License, or (at your option) any later version.
8 
9    The GNU LIBICONV Library is distributed in the hope that it will be
10    useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12    Library General Public License for more details.
13 
14    You should have received a copy of the GNU Library General Public
15    License along with the GNU LIBICONV Library; see the file COPYING.LIB.
16    If not, write to the Free Software Foundation, Inc., 51 Franklin Street,
17    Fifth Floor, Boston, MA 02110-1301, USA.  */
18 
19 /*
20  * Generates a table of small strings, used for transliteration, from a table
21  * containing lines of the form
22  *   Unicode <tab> utf-8 replacement <tab> # comment
23  */
24 
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <stdbool.h>
28 
/*
 * Reads the transliteration source table from standard input and writes the
 * generated C source (the translit_data array, the translit_pageXX arrays and
 * the translit_index macro) to standard output.
 *
 * Takes no command-line arguments.  Exits with status 1 on a usage error, on
 * malformed or oversized input, on memory exhaustion or on an I/O error;
 * calls abort() if an internal consistency check fails; exits with status 0
 * otherwise.
 */
int main (int argc, char *argv[])
{
  enum {
    UNICODE_LIMIT = 0x110000,        /* size of uni2index[] */
    DATA_CAPACITY = 0x100000,        /* size of data[] */
    LINE_COUNT = UNICODE_LIMIT / 8,  /* a "line" is 8 consecutive code points */
    TABLE_CAPACITY = 0x2000,         /* size of tables[] */
    SUFFIX_SIZE = 32                 /* room for "%02x_%d" with any int values */
  };
  /* With 4-byte ints these two arrays alone take more than 8 MiB, which can
     exceed the default stack limit; hence static storage. */
  static unsigned int data[DATA_CAPACITY];
  static int uni2index[UNICODE_LIMIT];
  int index;

  (void) argv;

  if (argc != 1)
    exit(1);

  printf("/*\n");
  printf(" * Copyright (C) 1999-2003 Free Software Foundation, Inc.\n");
  printf(" * This file is part of the GNU LIBICONV Library.\n");
  printf(" *\n");
  printf(" * The GNU LIBICONV Library is free software; you can redistribute it\n");
  printf(" * and/or modify it under the terms of the GNU Library General Public\n");
  printf(" * License as published by the Free Software Foundation; either version 2\n");
  printf(" * of the License, or (at your option) any later version.\n");
  printf(" *\n");
  printf(" * The GNU LIBICONV Library is distributed in the hope that it will be\n");
  printf(" * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
  printf(" * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU\n");
  printf(" * Library General Public License for more details.\n");
  printf(" *\n");
  printf(" * You should have received a copy of the GNU Library General Public\n");
  printf(" * License along with the GNU LIBICONV Library; see the file COPYING.LIB.\n");
  printf(" * If not, write to the Free Software Foundation, Inc., 51 Franklin Street,\n");
  printf(" * Fifth Floor, Boston, MA 02110-1301, USA.\n");
  printf(" */\n");
  printf("\n");
  printf("/*\n");
  printf(" * Transliteration table\n");
  printf(" */\n");
  printf("\n");

  /* Parse the input.  For every code point that has a replacement, data[]
     receives one length slot followed by the replacement's code points, and
     uni2index[] records the position of that length slot. */
  {
    int c;
    int j;
    for (j = 0; j < UNICODE_LIMIT; j++)
      uni2index[j] = -1;
    index = 0;
    for (;;) {
      unsigned int code;

      c = getc(stdin);
      if (c == EOF)
        break;
      if (c == '#') {
        /* Comment line: skip up to and including the newline. */
        do { c = getc(stdin); } while (!(c == EOF || c == '\n'));
        continue;
      }
      ungetc(c,stdin);
      /* %x stores through a pointer to unsigned int. */
      if (scanf("%x",&code) != 1)
        exit(1);
      /* Reject values that would index past the end of uni2index[]. */
      if (code >= UNICODE_LIMIT)
        exit(1);
      j = (int) code;
      c = getc(stdin);
      if (c != '\t')
        exit(1);
      /* Read the replacement string, which must be terminated by a tab. */
      for (;;) {
        c = getc(stdin);
        if (c == EOF || c == '\n')
          exit(1);
        if (c == '\t')
          break;
        if (uni2index[j] < 0) {
          /* First replacement character: reserve the length slot. */
          if (index >= DATA_CAPACITY)
            exit(1);
          uni2index[j] = index;
          data[index++] = 0;
        }
        if (c >= 0x80) {
          /* Finish reading an UTF-8 character. */
          if (c < 0xc0)
            exit(1);
          else {
            unsigned int i = (c < 0xe0 ? 2 : c < 0xf0 ? 3 : c < 0xf8 ? 4 : c < 0xfc ? 5 : 6);
            c &= (1 << (8-i)) - 1;
            while (--i > 0) {
              int cc = getc(stdin);
              /* Also catches EOF, which lies outside 0x80..0xbf. */
              if (!(cc >= 0x80 && cc < 0xc0))
                exit(1);
              c <<= 6; c |= (cc & 0x3f);
            }
          }
        }
        if (index >= DATA_CAPACITY)
          exit(1);
        data[index++] = (unsigned int) c;
      }
      /* Fill in the length slot now that the replacement is complete. */
      if (uni2index[j] >= 0)
        data[uni2index[j]] = (unsigned int) (index - uni2index[j] - 1);
      /* Ignore the rest of the line (the trailing comment field). */
      do { c = getc(stdin); } while (!(c == EOF || c == '\n'));
    }
    /* getc() returns EOF for read errors too; do not emit a truncated table. */
    if (ferror(stdin))
      exit(1);
  }

  /* Emit translit_data.  Values below 32 start a new output row. */
  printf("static const unsigned int translit_data[%d] = {",index);
  {
    int i;
    for (i = 0; i < index; i++) {
      if (data[i] < 32)
        printf("\n %3d,",(int) data[i]);
      else if (data[i] == '\'')
        printf("'\\'',");
      else if (data[i] == '\\')
        printf("'\\\\',");
      else if (data[i] < 127)
        printf(" '%c',",(int) data[i]);
      else if (data[i] < 256)
        printf("0x%02X,",data[i]);
      else
        printf("0x%04X,",data[i]);
    }
    printf("\n};\n");
  }
  printf("\n");

  /* Group the used lines into tables and emit the page arrays and the
     translit_index macro. */
  {
    /* line[j1] is -1 if none of the 8 code points of line j1 is mapped,
       otherwise the number of the table the line belongs to. */
    static int line[LINE_COUNT];
    static struct { int minline; int maxline; int usecount; const char *suffix; } tables[TABLE_CAPACITY];
    int tableno;
    int i, j, p, j1, j2, t;
    bool any_page;

    for (j1 = 0; j1 < LINE_COUNT; j1++) {
      bool all_invalid = true;
      for (j2 = 0; j2 < 8; j2++) {
        j = 8*j1+j2;
        if (uni2index[j] >= 0)
          all_invalid = false;
      }
      if (all_invalid)
        line[j1] = -1;
      else
        line[j1] = 0;
    }

    /* A used line joins the most recent table if the line just before it
       belongs to that table, or if it lies in the same group of 32 lines as
       the table's last line and at most 8 lines after it.  Otherwise it
       starts a new table. */
    tableno = 0;
    for (j1 = 0; j1 < LINE_COUNT; j1++) {
      if (line[j1] >= 0) {
        if (tableno > 0
            && ((j1 > 0 && line[j1-1] == tableno-1)
                || ((tables[tableno-1].maxline >> 5) == (j1 >> 5)
                    && j1 - tables[tableno-1].maxline <= 8))) {
          line[j1] = tableno-1;
          tables[tableno-1].maxline = j1;
        } else {
          /* Refuse input that needs more tables than tables[] can hold. */
          if (tableno >= TABLE_CAPACITY)
            exit(1);
          tableno++;
          line[j1] = tableno-1;
          tables[tableno-1].minline = tables[tableno-1].maxline = j1;
        }
      }
    }

    /* Count the mapped code points covered by each table. */
    for (t = 0; t < tableno; t++) {
      tables[t].usecount = 0;
      j1 = 8*tables[t].minline;
      j2 = 8*(tables[t].maxline+1);
      for (j = j1; j < j2; j++)
        if (uni2index[j] >= 0)
          tables[t].usecount++;
    }

    /* Name the tables that get their own array: the suffix is minline >> 5
       in hex, with "_<n>" appended when the previous named table has the
       same value.  The counter i is never reset, so the buffer is sized for
       any int rather than for two digits. */
    for (t = 0, p = -1, i = 0; t < tableno; t++) {
      if (tables[t].usecount > 1) {
        char *s = malloc(SUFFIX_SIZE);
        if (s == NULL)
          exit(1);
        if (p == tables[t].minline >> 5) {
          snprintf(s, SUFFIX_SIZE, "%02x_%d", (unsigned int) p, ++i);
        } else {
          p = tables[t].minline >> 5;
          snprintf(s, SUFFIX_SIZE, "%02x", (unsigned int) p);
        }
        tables[t].suffix = s;
      } else
        tables[t].suffix = NULL;
    }

    /* Emit one array per table that covers more than one mapped code point. */
    any_page = false;
    for (t = 0; t < tableno; t++)
      if (tables[t].usecount > 1) {
        any_page = true;
        printf("static const short translit_page%s[%d] = {\n", tables[t].suffix, 8*(tables[t].maxline-tables[t].minline+1));
        for (j1 = tables[t].minline; j1 <= tables[t].maxline; j1++) {
          if ((j1 % 0x20) == 0 && j1 > tables[t].minline)
            printf("  /* 0x%04x */\n", 8*j1);
          printf(" ");
          for (j2 = 0; j2 < 8; j2++) {
            j = 8*j1+j2;
            printf(" %4d,", uni2index[j]);
          }
          printf(" /* 0x%02x-0x%02x */\n", 8*(j1 % 0x20), 8*(j1 % 0x20)+7);
        }
        printf("};\n");
      }
    if (any_page)
      printf("\n");

    /* Emit the translit_index macro as a chain of conditional expressions,
       one per table, ending in -1. */
    printf("#define translit_index(wc) \\\n  (");
    for (j1 = 0; j1 < LINE_COUNT;) {
      t = line[j1];
      for (j2 = j1; j2 < LINE_COUNT && line[j2] == t; j2++);
      /* line[] holds only -1 or a table number, so t >= 0 means "in a table". */
      if (t >= 0) {
        if (j1 != tables[t].minline) abort();
        if (j2 > tables[t].maxline+1) abort();
        /* Jump to the table's end; this also skips unused lines inside it. */
        j2 = tables[t].maxline+1;

        if (tables[t].usecount == 0) abort();
        if (tables[t].usecount == 1) {
          /* A single mapped code point is tested directly, without an array. */
          if (j2 != j1+1) abort();
          for (j = 8*j1; j < 8*j2; j++)
            if (uni2index[j] >= 0) {
              printf("wc == 0x%04x ? %d", j, uni2index[j]);
              break;
            }
        } else {
          if (j1 == 0) {
            printf("wc < 0x%04x", 8*j2);
          } else {
            printf("wc >= 0x%04x && wc < 0x%04x", 8*j1, 8*j2);
          }
          printf(" ? translit_page%s[wc", tables[t].suffix);
          if (tables[t].minline > 0)
            printf("-0x%04x", 8*j1);
          printf("]");
        }
        printf(" : \\\n   ");
      }
      j1 = j2;
    }
    printf("-1)\n");
  }

  if (ferror(stdout) || fclose(stdout))
    exit(1);
  exit(0);
}
259