1 /*
2   make_hash.c v2003-01-24
3   make encodings.c from encodings.dat
4 
5   Copyright (C) 2000-2003 David Necas (Yeti) <yeti@physics.muni.cz>
6 
7   This program is free software; you can redistribute it and/or modify it
8   under the terms of version 2 of the GNU General Public License as published
9   by the Free Software Foundation.
10 
11   This program is distributed in the hope that it will be useful, but WITHOUT
12   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
14   more details.
15 
16   You should have received a copy of the GNU General Public License along
17   with this program; if not, write to the Free Software Foundation, Inc.,
18   59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
19 */
20 #ifdef HAVE_CONFIG_H
21 #  include "config.h"
22 #endif /* HAVE_CONFIG_H */
23 
24 #include <stdlib.h>
25 #include <stdio.h>
26 
27 #ifdef HAVE_STRING_H
28 #  include <string.h>
29 #else /* HAVE_STRING_H */
30 #  ifdef HAVE_STRINGS_H
31 #    include <strings.h>
32 #  endif /* HAVE_STRINGS_H */
33 #endif /* HAVE_STRING_H */
34 
35 #ifdef HAVE_MEMORY_H
36 #  include <memory.h>
37 #endif /* HAVE_MEMORY_H */
38 
39 #include <unistd.h>
40 #include <ctype.h>
41 
/* PARR {{{ */
/*
 * Debugging helpers.  They are defined only when __GNUC__ is defined;
 * nothing is defined for other compilers.
 *
 * PVAR(f, v) prints `file:line function(): v == value' to stderr.  f is the
 * printf conversion used for v, written without the percent sign.
 *
 * PARR(f, v, n) prints the elements v[0] .. v[n-1] to stderr as a
 * brace-enclosed list, each element formatted with the conversion f.
 */
#ifdef __GNUC__
# define PVAR(f, v) fprintf(stderr, "%s:%u %s(): " \
                            #v " == %" #f "\n", __FILE__, __LINE__, __FUNCTION__, v)
# define PARR(f, v, n) ( { int _i; \
  fprintf(stderr, "%s:%u %s(): " #v " == { ", __FILE__, __LINE__, __FUNCTION__); \
  for (_i = 0; _i < n; _i++) fprintf(stderr, "%" #f ", ", (v)[_i]); \
  fputs("}\n", stderr); \
} )
#else /* __GNUC__ */
/* FIXME: PVAR and PARR have no definition for other compilers. */
#endif /* __GNUC__ */
/* }}} */
55 
/* Size in bytes of the buffer one input line is read into. */
#define LEN 4096
57 
/*
 * One charset record as read from the input.
 *
 * Each string holds the whitespace-normalized value of the input line with
 * the matching key.  A field is NULL until its line has been seen.
 */
typedef struct {
  char *enca;      /* value of the `enca:' line; must not be empty */
  char *rfc1345;   /* value of the `rfc:' line; must not be empty */
  char *cstocs;    /* value of the `cstocs:' line; NULL when it was empty */
  char *iconv;     /* value of the `iconv:' line; NULL when it was empty */
  char *mime;      /* value of the `mime:' line; NULL when it was empty */
  int naliases;    /* number of entries in aliases */
  char **aliases;  /* names from the `aliases:' line, extended by the names
                      above that were not already listed */
  char *human;     /* value of the `human:' line */
  char *flags;     /* value of the `flags:' line */
  char *nsurface;  /* value of the `nsurface:' line; "0" when it was empty */
} EncaCharsetRaw;
70 
/*
 * One charset record in the form that is written to the generated table.
 *
 * The integer fields are positions in the sorted alias list; cstocs, iconv
 * and mime are -1 when the charset has no such name.  The string fields are
 * taken over from the raw record unchanged.
 */
typedef struct {
  int enca;        /* position of the enca name in the alias list */
  int rfc1345;     /* position of the RFC-1345 name in the alias list */
  int cstocs;      /* position of the cstocs name, or -1 */
  int iconv;       /* position of the iconv name, or -1 */
  int mime;        /* position of the MIME name, or -1 */
  char *human;     /* printed as a quoted string */
  char *flags;     /* printed verbatim, without quotes */
  char *nsurface;  /* printed verbatim, without quotes */
} EncaCharsetFine;
81 
/* An empty record; it is copied into every new record before its fields
   are read, so that unset fields can be recognized as NULL. */
static EncaCharsetRaw RawNULL = {
  NULL, NULL, NULL, NULL, NULL, 0, NULL, NULL, NULL, NULL
};
85 
/*
 * Normalize whitespace in a string, in place.
 *
 * Leading and trailing whitespace is removed and every inner run of
 * whitespace characters is replaced by one space.  The result is never
 * longer than the input, so it is built inside the caller's buffer.
 *
 * line: NUL-terminated, writable string.
 *
 * Returns line itself.
 */
static char*
fixspaces(char *line)
{
  const char *src = line;
  char *dst = line;
  int pending = 0;

  /* The <ctype.h> functions are only defined for unsigned char values and
     EOF, hence the casts: plain char may be negative for bytes >= 0x80. */
  while (isspace((unsigned char)*src))
    src++;

  for ( ; *src != '\0'; src++) {
    if (isspace((unsigned char)*src)) {
      /* Remember the gap, emit it only if more text follows. */
      pending = 1;
      continue;
    }
    if (pending) {
      *dst++ = ' ';
      pending = 0;
    }
    *dst++ = *src;
  }
  *dst = '\0';

  return line;
}
109 
/*
 * Store the value of a `name value' input line, if it is such a line.
 *
 * line: whitespace-normalized input line.
 * name: key the line has to start with, including the colon.
 * item: where to store the value; it is left untouched when it already
 *       holds a value or when the line does not start with name.
 *
 * On a match *item receives a newly allocated, whitespace-normalized copy
 * of the text following name.  The program exits with status 1 when the
 * copy cannot be allocated.
 *
 * Returns 1 when the value was stored, 0 otherwise.
 */
static int
add_item(const char *line,
         const char *name,
         char **item)
{
  const size_t len = strlen(name);
  char *value;

  if (*item != NULL) return 0;
  if (strncmp(line, name, len) != 0) return 0;

  value = strdup(line + len);
  if (value == NULL) {
    fprintf(stderr, "Out of memory\n");
    exit(1);
  }
  *item = fixspaces(value);

  return 1;
}
123 
/*
 * Make sure a name is present in an alias array.
 *
 * aliases: array of *n strings; may be NULL when *n is 0.
 * n:       number of entries, incremented when the name is appended.
 * string:  name to look for; NULL and the empty string are ignored.
 *
 * When string is not yet listed, a copy of it is appended.  The program
 * exits with status 1 when memory cannot be allocated.
 *
 * Returns the (possibly moved) array; the caller must use the returned
 * pointer instead of aliases.
 */
static char**
check_alias(char **aliases,
            int *n,
            char *string)
{
  char **grown;
  char *copy;
  int i;

  if (string == NULL || string[0] == '\0') return aliases;
  for (i = 0; i < *n; i++) {
    if (strcmp(aliases[i], string) == 0) return aliases;
  }

  /* Keep the old pointer until realloc() is known to have succeeded. */
  grown = (char**)realloc(aliases, ((size_t)*n + 1)*sizeof(char*));
  copy = strdup(string);
  if (grown == NULL || copy == NULL) {
    fprintf(stderr, "Out of memory\n");
    exit(1);
  }
  grown[(*n)++] = copy;

  return grown;
}
140 
141 static EncaCharsetRaw*
read_raw_charset_data(FILE * stream,int * rsize)142 read_raw_charset_data(FILE *stream,
143                       int *rsize)
144 {
145   char *line;
146   EncaCharsetRaw *r, *raw;
147   int rs;
148   char *gl;
149 
150   line = (char*)malloc(LEN);
151   r = raw = (EncaCharsetRaw*)malloc(sizeof(EncaCharsetRaw));
152   *r = RawNULL;
153   rs = 1;
154   while (1) {
155     gl = fgets(line, LEN, stream);
156     if (r->enca && r->rfc1345 && r->cstocs && r->human && r->iconv && r->mime
157         && r->flags && r->nsurface && r->aliases) {
158       if (r->enca[0] == '\0') {
159         fprintf(stderr, "Enca's charset name #%d empty\n", (int)(r - raw + 1));
160         exit(1);
161       }
162       if (r->rfc1345[0] == '\0') {
163         fprintf(stderr, "RFC-1345 charset name #%d empty\n", (int)(r - raw + 1));
164         exit(1);
165       }
166       if (r->iconv[0] == '\0') r->iconv = NULL;
167       if (r->cstocs[0] == '\0') r->cstocs = NULL;
168       if (r->mime[0] == '\0') r->mime = NULL;
169       if (r->nsurface[0] == '\0') r->nsurface = strdup("0");
170       r->aliases = check_alias(r->aliases, &r->naliases, r->enca);
171       r->aliases = check_alias(r->aliases, &r->naliases, r->iconv);
172       r->aliases = check_alias(r->aliases, &r->naliases, r->rfc1345);
173       r->aliases = check_alias(r->aliases, &r->naliases, r->mime);
174       r->aliases = check_alias(r->aliases, &r->naliases, r->cstocs);
175       if (!gl) break;
176       rs++;
177       {
178         int d = r - raw;
179         raw = (EncaCharsetRaw*)realloc(raw, rs*sizeof(EncaCharsetRaw));
180         r = raw + d + 1;
181       }
182       *r = RawNULL;
183     }
184     line[LEN-1] = '\0';
185     fixspaces(line);
186     if (line[0] == '\0' || line[0] == '#') continue;
187     if (add_item(line, "enca:", &r->enca)) continue;
188     if (add_item(line, "rfc:", &r->rfc1345)) continue;
189     if (add_item(line, "iconv:", &r->iconv)) continue;
190     if (add_item(line, "mime:", &r->mime)) continue;
191     if (add_item(line, "cstocs:", &r->cstocs)) continue;
192     if (add_item(line, "human:", &r->human)) continue;
193     if (add_item(line, "flags:", &r->flags)) continue;
194     if (add_item(line, "nsurface:", &r->nsurface)) continue;
195     if (strncmp(line, "aliases:", 8) == 0 && !r->aliases) {
196       int i;
197       char *next, *l = fixspaces(line+8);
198       r->naliases = 1;
199       while ((l = strchr(l, ' ')) != NULL) {
200         r->naliases++;
201         l++;
202       }
203       r->aliases = (char**)malloc((r->naliases)*sizeof(char*));
204       l = line+8;
205       for (i = 0; i < r->naliases; i++) {
206         next = strchr(l, ' ');
207         if (next) *next = '\0';
208         r->aliases[i] = strdup(l);
209         l = next+1;
210       }
211       continue;
212     }
213     fprintf(stderr, "Unexpected `%s'\n", line);
214     exit(1);
215   }
216 
217   *rsize = rs;
218   return raw;
219 }
220 
/*
 * Compare two names, looking at letters and digits only.
 *
 * All characters that are not alphanumeric are skipped and letters are
 * compared without regard to case, so `ISO-8859-2' and `iso88592' compare
 * equal.
 *
 * Returns a negative, zero or positive value when x sorts before, equal to
 * or after y.
 */
static int
squeeze_compare(const char *x, const char *y)
{
  /* The <ctype.h> functions are only defined for unsigned char values and
     EOF, so the strings are walked as unsigned bytes. */
  const unsigned char *a = (const unsigned char*)x;
  const unsigned char *b = (const unsigned char*)y;

  while (*a != '\0' || *b != '\0') {
    int ca, cb;

    while (*a != '\0' && !isalnum(*a)) a++;
    while (*b != '\0' && !isalnum(*b)) b++;
    ca = tolower(*a);
    cb = tolower(*b);
    if (ca != cb)
      return ca - cb;
    if (*a != '\0') a++;
    if (*b != '\0') b++;
  }
  return 0;
}
234 
/*
 * qsort()-style comparison of two strings given by pointers to them.
 *
 * The strings are ordered by squeeze_compare(); strings it considers equal
 * are ordered by strcmp(), so that only identical strings compare equal and
 * the sort order is fully determined.
 */
static int
stable_compare(const void *p, const void *q)
{
  const char *lhs = *(const char *const *)p;
  const char *rhs = *(const char *const *)q;
  const int loose = squeeze_compare(lhs, rhs);

  return loose != 0 ? loose : strcmp(lhs, rhs);
}
247 
/*
 * Find a string in a list sorted with stable_compare().
 *
 * alist: sorted list of n strings.
 * n:     number of strings in alist.
 * s:     string to find; it has to match exactly.
 *
 * A string that is not in the list is an error: a message is printed to
 * stderr and the program exits with status 1.
 *
 * Returns the position of s in alist.
 */
static int
bin_search(char **alist, const int n, const char *s)
{
  int lo = 0;
  int hi = n-1;
  int cmp;

  if (n <= 0) {
    fprintf(stderr, "Out of search range: `%s'\n", s);
    exit(1);
  }

  cmp = stable_compare(&s, &alist[lo]);
  if (cmp < 0) {
    fprintf(stderr, "Out of search range: `%s'\n", s);
    exit(1);
  }
  if (cmp == 0) return lo;

  cmp = stable_compare(&s, &alist[hi]);
  if (cmp > 0) {
    fprintf(stderr, "Out of search range: `%s'\n", s);
    exit(1);
  }
  if (cmp == 0) return hi;

  /* Here alist[lo] < s < alist[hi] holds and is kept by every step, so s
     is absent once lo and hi are neighbours. */
  while (lo+1 < hi) {
    const int mid = lo + (hi - lo)/2;

    cmp = stable_compare(&s, &alist[mid]);
    if (cmp == 0) return mid;
    if (cmp > 0) lo = mid; else hi = mid;
  }

  fprintf(stderr, "Not found: `%s'\n", s);
  exit(1);
}
280 
281 static char**
build_alias_list(EncaCharsetRaw * raw,const int ncs,int * total)282 build_alias_list(EncaCharsetRaw *raw, const int ncs, int *total)
283 {
284   char **alist;
285   int nn, i, j, k;
286 
287   for (i = nn = 0; i < ncs; i++) nn += raw[i].naliases;
288   alist = (char**)malloc(nn*sizeof(char*));
289   for (i = j = 0; i < ncs; i++) {
290     for (k = 0; k < raw[i].naliases; k++)
291       alist[j++] = raw[i].aliases[k];
292   }
293   qsort(alist, nn, sizeof(char*), &stable_compare);
294   for (i = 1; i < nn; ) {
295     if (squeeze_compare(alist[i], alist[i-1]) == 0) {
296       if (strcmp(alist[i], alist[i-1]) == 0) {
297         fprintf(stderr, "Removing duplicate `%s'\n", alist[i]);
298         memmove(alist+i-1, alist+i, (nn-i)*sizeof(char*));
299         nn--;
300       }
301       else {
302         fprintf(stderr, "Keeping equvialent `%s' and `%s'\n",
303                 alist[i], alist[i-1]);
304         i++;
305       }
306     }
307     else i++;
308   }
309 
310   *total = nn;
311   return alist;
312 }
313 
314 static EncaCharsetFine*
refine_data(EncaCharsetRaw * raw,const int ncs,char ** alist,const int nn)315 refine_data(EncaCharsetRaw *raw, const int ncs, char **alist, const int nn)
316 {
317   int i;
318   EncaCharsetFine *fine;
319 
320   fine = (EncaCharsetFine*)malloc(ncs*sizeof(EncaCharsetFine));
321 
322   for (i = 0; i < ncs; i++) {
323     fine[i].enca = bin_search(alist, nn, raw[i].enca);
324     fine[i].rfc1345 = bin_search(alist, nn, raw[i].rfc1345);
325     fine[i].iconv = raw[i].iconv ? bin_search(alist, nn, raw[i].iconv) : -1;
326     fine[i].cstocs = raw[i].cstocs ? bin_search(alist, nn, raw[i].cstocs) : -1;
327     fine[i].mime = raw[i].mime ? bin_search(alist, nn, raw[i].mime) : -1;
328     fine[i].human = raw[i].human;
329     fine[i].flags = raw[i].flags;
330     fine[i].nsurface = raw[i].nsurface;
331   }
332 
333   return fine;
334 }
335 
336 static int*
create_index_list(EncaCharsetRaw * raw,const int ncs,char ** alist,const int nn)337 create_index_list(EncaCharsetRaw *raw, const int ncs,
338                   char **alist, const int nn)
339 {
340   int i, k;
341   int *ilist;
342 
343   ilist = (int*)malloc(nn*sizeof(int));
344 
345   for (i = 0; i < ncs; i++) {
346     for (k = 0; k < raw[i].naliases; k++) {
347       ilist[bin_search(alist, nn, raw[i].aliases[k])] = i;
348     }
349   }
350 
351   return ilist;
352 }
353 
354 static void
print_fine_data(EncaCharsetFine * fine,const int ncs,int * ilist,char ** alist,const int nn)355 print_fine_data(EncaCharsetFine *fine, const int ncs,
356                 int *ilist, char **alist, const int nn)
357 {
358   int i;
359 
360   puts("/****  THIS IS A GENERATED FILE.  DO NOT TOUCH!  *****/");
361 
362   puts("/* THIS IS A GENERATED TABLE, see tools/make_hash.c. */");
363   puts("static const EncaCharsetInfo CHARSET_INFO[] = {");
364   for (i = 0; i < ncs; i++) {
365     printf("  {\n"
366            "     %d, %d, %d, %d, %d,\n"
367            "     \"%s\",\n"
368            "     %s,\n"
369            "     %s\n"
370            "  },\n",
371            fine[i].enca,
372            fine[i].rfc1345,
373            fine[i].cstocs,
374            fine[i].iconv,
375            fine[i].mime,
376            fine[i].human,
377            fine[i].flags,
378            fine[i].nsurface);
379   }
380   puts("};\n");
381 
382   puts("/* THIS IS A GENERATED TABLE, see tools/make_hash.c. */");
383   puts("static const char *ALIAS_LIST[] = {");
384   for (i = 0; i < nn; i++) printf("  \"%s\",\n", alist[i]);
385   puts("};\n");
386 
387   puts("/* THIS IS A GENERATED TABLE, see tools/make_hash.c. */");
388   puts("static const int INDEX_LIST[] = {");
389   for (i = 0; i < nn; i++) {
390     if (i%16 == 0) printf("  ");
391     printf("%2d, ", ilist[i]);
392     if (i%16 == 15 || i == nn-1) printf("\n");
393   }
394   puts("};\n");
395 }
396 
397 int
main(void)398 main(void)
399 {
400   EncaCharsetRaw *raw;
401   EncaCharsetFine *fine;
402   char **alist;
403   int *ilist;
404   int ncs, nn;
405 
406   raw = read_raw_charset_data(stdin, &ncs);
407   alist = build_alias_list(raw, ncs, &nn);
408   fine = refine_data(raw, ncs, alist, nn);
409   ilist = create_index_list(raw, ncs, alist, nn);
410   print_fine_data(fine, ncs, ilist, alist, nn);
411 
412   free(fine);
413   free(ilist);
414 
415   return 0;
416 }
417