1 /*
2 make_hash.c v2003-01-24
3 make encodings.c from encodings.dat
4
5 Copyright (C) 2000-2003 David Necas (Yeti) <yeti@physics.muni.cz>
6
7 This program is free software; you can redistribute it and/or modify it
8 under the terms of version 2 of the GNU General Public License as published
9 by the Free Software Foundation.
10
11 This program is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 more details.
15
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation, Inc.,
18 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
19 */
20 #ifdef HAVE_CONFIG_H
21 # include "config.h"
22 #endif /* HAVE_CONFIG_H */
23
24 #include <stdlib.h>
25 #include <stdio.h>
26
27 #ifdef HAVE_STRING_H
28 # include <string.h>
29 #else /* HAVE_STRING_H */
30 # ifdef HAVE_STRINGS_H
31 # include <strings.h>
32 # endif /* HAVE_STRINGS_H */
33 #endif /* HAVE_STRING_H */
34
35 #ifdef HAVE_MEMORY_H
36 # include <memory.h>
37 #endif /* HAVE_MEMORY_H */
38
39 #include <unistd.h>
40 #include <ctype.h>
41
/* PARR {{{ */
/*
 * Debugging helpers writing to stderr, prefixed with file, line and function.
 *
 * PVAR(f, v)     prints the expression v using printf conversion f (given
 *                without the percent sign and without quotes, e.g. d or s).
 * PARR(f, v, n)  prints the first n elements of array v, each with
 *                conversion f.
 *
 * All macro arguments are parenthesized in the expansion so that compound
 * expressions (e.g. a conditional passed as n) keep their meaning.
 * PARR relies on GNU statement expressions, hence the __GNUC__ guard.
 */
#ifdef __GNUC__
# define PVAR(f, v) fprintf(stderr, "%s:%d %s(): " \
    #v " == %" #f "\n", __FILE__, __LINE__, __FUNCTION__, (v))
# define PARR(f, v, n) ( { int _i; \
    fprintf(stderr, "%s:%d %s(): " #v " == { ", __FILE__, __LINE__, __FUNCTION__); \
    for (_i = 0; _i < (n); _i++) fprintf(stderr, "%" #f ", ", (v)[_i]); \
    fputs("}\n", stderr); \
  } )
#else /* __GNUC__ */
/* FIXME */
#endif /* __GNUC__ */
/* }}} */
55
/* Size of the buffer used for reading one input line. */
#define LEN 4096

/*
 * One charset record as read from the input, all values kept as strings.
 */
typedef struct {
  char *enca;       /* value of the "enca:" line */
  char *rfc1345;    /* value of the "rfc:" line */
  char *cstocs;     /* value of the "cstocs:" line, NULL when empty */
  char *iconv;      /* value of the "iconv:" line, NULL when empty */
  char *mime;       /* value of the "mime:" line, NULL when empty */
  int naliases;     /* number of items in aliases */
  char **aliases;   /* names from the "aliases:" line plus the names above */
  char *human;      /* value of the "human:" line */
  char *flags;      /* value of the "flags:" line, printed verbatim */
  char *nsurface;   /* value of the "nsurface:" line, "0" when empty */
} EncaCharsetRaw;

/*
 * One charset record with names replaced by indices into the sorted alias
 * list; -1 stands for a missing cstocs, iconv or mime name.
 */
typedef struct {
  int enca;
  int rfc1345;
  int cstocs;
  int iconv;
  int mime;
  char *human;
  char *flags;
  char *nsurface;
} EncaCharsetFine;

/* Template of an empty record; it is only ever copied, never modified. */
static const EncaCharsetRaw RawNULL = {
  NULL, NULL, NULL, NULL, NULL, 0, NULL, NULL, NULL, NULL
};
85
/*
 * Normalizes whitespace in line, in place.
 *
 * Leading and trailing whitespace is dropped and each inner run of
 * whitespace characters becomes exactly one space.
 *
 * Returns line, i.e. the very pointer it was given.
 */
static char*
fixspaces(char *line)
{
  char *p, *q;
  int pending = 0;   /* a whitespace run was seen since the last copied char */

  /* <ctype.h> functions are only defined for unsigned char values (and EOF),
   * so plain char must be converted before the call. */
  for (p = line; isspace((unsigned char)*p); p++)
    ;
  for (q = line; *p != '\0'; p++) {
    if (isspace((unsigned char)*p)) {
      pending = 1;
      continue;
    }
    /* The separator is emitted only when another word follows, which is
     * what removes trailing whitespace.  q never overtakes p. */
    if (pending) *q++ = ' ';
    *q++ = *p;
    pending = 0;
  }
  *q = '\0';

  return line;
}
109
/*
 * Stores the value of a `name value' line into *item.
 *
 * When *item is still NULL and line starts with name, the rest of the line
 * is duplicated, whitespace-normalized with fixspaces() and stored in *item.
 *
 * Returns 1 when the item was stored, 0 when *item was already set or the
 * line does not start with name.  Terminates the program when the copy
 * cannot be allocated.
 */
static int
add_item(const char *line,
         const char *name,
         char **item)
{
  const size_t len = strlen(name);
  char *value;

  if (*item != NULL) return 0;
  if (strncmp(line, name, len) != 0) return 0;

  value = strdup(line + len);
  if (value == NULL) {
    /* fixspaces() would dereference the NULL pointer */
    fprintf(stderr, "Out of memory\n");
    exit(1);
  }
  *item = fixspaces(value);

  return 1;
}
123
/*
 * Makes sure string is present in the alias array.
 *
 * aliases holds *n strings.  A NULL or empty string, or one that is already
 * present (exact strcmp() match), leaves everything untouched.  Otherwise
 * a private copy of string is appended and *n is incremented.
 *
 * Returns the (possibly reallocated) array; the caller must use the returned
 * pointer instead of aliases.  Terminates the program when memory runs out.
 */
static char**
check_alias(char **aliases,
            int *n,
            char *string)
{
  int i;
  size_t size;
  char *copy;
  char **grown;

  if (string == NULL || string[0] == '\0') return aliases;
  for (i = 0; i < *n; i++)
    if (strcmp(aliases[i], string) == 0) return aliases;

  /* Allocate everything first, so the array and the count are only
   * touched once nothing can fail any more. */
  size = strlen(string) + 1;
  copy = (char*)malloc(size);
  grown = (char**)realloc(aliases, (*n + 1)*sizeof(char*));
  if (copy == NULL || grown == NULL) {
    fprintf(stderr, "Out of memory\n");
    exit(1);
  }
  memcpy(copy, string, size);
  grown[*n] = copy;
  (*n)++;

  return grown;
}
140
141 static EncaCharsetRaw*
read_raw_charset_data(FILE * stream,int * rsize)142 read_raw_charset_data(FILE *stream,
143 int *rsize)
144 {
145 char *line;
146 EncaCharsetRaw *r, *raw;
147 int rs;
148 char *gl;
149
150 line = (char*)malloc(LEN);
151 r = raw = (EncaCharsetRaw*)malloc(sizeof(EncaCharsetRaw));
152 *r = RawNULL;
153 rs = 1;
154 while (1) {
155 gl = fgets(line, LEN, stream);
156 if (r->enca && r->rfc1345 && r->cstocs && r->human && r->iconv && r->mime
157 && r->flags && r->nsurface && r->aliases) {
158 if (r->enca[0] == '\0') {
159 fprintf(stderr, "Enca's charset name #%d empty\n", (int)(r - raw + 1));
160 exit(1);
161 }
162 if (r->rfc1345[0] == '\0') {
163 fprintf(stderr, "RFC-1345 charset name #%d empty\n", (int)(r - raw + 1));
164 exit(1);
165 }
166 if (r->iconv[0] == '\0') r->iconv = NULL;
167 if (r->cstocs[0] == '\0') r->cstocs = NULL;
168 if (r->mime[0] == '\0') r->mime = NULL;
169 if (r->nsurface[0] == '\0') r->nsurface = strdup("0");
170 r->aliases = check_alias(r->aliases, &r->naliases, r->enca);
171 r->aliases = check_alias(r->aliases, &r->naliases, r->iconv);
172 r->aliases = check_alias(r->aliases, &r->naliases, r->rfc1345);
173 r->aliases = check_alias(r->aliases, &r->naliases, r->mime);
174 r->aliases = check_alias(r->aliases, &r->naliases, r->cstocs);
175 if (!gl) break;
176 rs++;
177 {
178 int d = r - raw;
179 raw = (EncaCharsetRaw*)realloc(raw, rs*sizeof(EncaCharsetRaw));
180 r = raw + d + 1;
181 }
182 *r = RawNULL;
183 }
184 line[LEN-1] = '\0';
185 fixspaces(line);
186 if (line[0] == '\0' || line[0] == '#') continue;
187 if (add_item(line, "enca:", &r->enca)) continue;
188 if (add_item(line, "rfc:", &r->rfc1345)) continue;
189 if (add_item(line, "iconv:", &r->iconv)) continue;
190 if (add_item(line, "mime:", &r->mime)) continue;
191 if (add_item(line, "cstocs:", &r->cstocs)) continue;
192 if (add_item(line, "human:", &r->human)) continue;
193 if (add_item(line, "flags:", &r->flags)) continue;
194 if (add_item(line, "nsurface:", &r->nsurface)) continue;
195 if (strncmp(line, "aliases:", 8) == 0 && !r->aliases) {
196 int i;
197 char *next, *l = fixspaces(line+8);
198 r->naliases = 1;
199 while ((l = strchr(l, ' ')) != NULL) {
200 r->naliases++;
201 l++;
202 }
203 r->aliases = (char**)malloc((r->naliases)*sizeof(char*));
204 l = line+8;
205 for (i = 0; i < r->naliases; i++) {
206 next = strchr(l, ' ');
207 if (next) *next = '\0';
208 r->aliases[i] = strdup(l);
209 l = next+1;
210 }
211 continue;
212 }
213 fprintf(stderr, "Unexpected `%s'\n", line);
214 exit(1);
215 }
216
217 *rsize = rs;
218 return raw;
219 }
220
/*
 * Compares two names ignoring case and all non-alphanumeric characters.
 *
 * Returns zero when the names are equivalent, otherwise the difference of
 * the first pair of lowercased alphanumeric characters that differ (a name
 * running out of such characters counts as '\0' there).
 */
static int
squeeze_compare(const char *x, const char *y)
{
  /* <ctype.h> functions are only defined for unsigned char values (and EOF),
   * so the strings are walked as unsigned char. */
  const unsigned char *a = (const unsigned char*)x;
  const unsigned char *b = (const unsigned char*)y;

  while (*a != '\0' || *b != '\0') {
    while (*a != '\0' && !isalnum(*a)) a++;
    while (*b != '\0' && !isalnum(*b)) b++;
    if (tolower(*a) != tolower(*b))
      return tolower(*a) - tolower(*b);
    if (*a != '\0') a++;
    if (*b != '\0') b++;
  }
  return 0;
}
234
/*
 * qsort()-style comparison of two array elements of type char*.
 *
 * The strings are ordered by squeeze_compare(); strings it considers
 * equivalent are further ordered by strcmp(), so that only identical
 * strings compare equal.
 */
static int
stable_compare(const void *p, const void *q)
{
  const char *lhs = *(char* const*)p;
  const char *rhs = *(char* const*)q;
  const int loose = squeeze_compare(lhs, rhs);

  return (loose != 0) ? loose : strcmp(lhs, rhs);
}
247
/*
 * Finds string s in alist, an array of n strings sorted with
 * stable_compare().
 *
 * Returns the index of s in alist.  A string that is not there is a fatal
 * inconsistency: a message is printed to stderr and the program terminates
 * with a nonzero status, so that the build notices the failure.
 */
static int
bin_search(char **alist, const int n, const char *s)
{
  int lo, hi;

  /* Also covers the empty list, which has no first and last item. */
  if (n <= 0
      || stable_compare(&s, &alist[0]) < 0
      || stable_compare(&s, &alist[n-1]) > 0) {
    fprintf(stderr, "Out of search range: `%s'\n", s);
    exit(1);
  }

  lo = 0;
  hi = n-1;
  while (lo <= hi) {
    const int mid = lo + (hi - lo)/2;
    const int cmp = stable_compare(&s, &alist[mid]);

    if (cmp == 0) return mid;
    if (cmp > 0) lo = mid + 1; else hi = mid - 1;
  }

  fprintf(stderr, "Not found: `%s'\n", s);
  exit(1);
}
280
281 static char**
build_alias_list(EncaCharsetRaw * raw,const int ncs,int * total)282 build_alias_list(EncaCharsetRaw *raw, const int ncs, int *total)
283 {
284 char **alist;
285 int nn, i, j, k;
286
287 for (i = nn = 0; i < ncs; i++) nn += raw[i].naliases;
288 alist = (char**)malloc(nn*sizeof(char*));
289 for (i = j = 0; i < ncs; i++) {
290 for (k = 0; k < raw[i].naliases; k++)
291 alist[j++] = raw[i].aliases[k];
292 }
293 qsort(alist, nn, sizeof(char*), &stable_compare);
294 for (i = 1; i < nn; ) {
295 if (squeeze_compare(alist[i], alist[i-1]) == 0) {
296 if (strcmp(alist[i], alist[i-1]) == 0) {
297 fprintf(stderr, "Removing duplicate `%s'\n", alist[i]);
298 memmove(alist+i-1, alist+i, (nn-i)*sizeof(char*));
299 nn--;
300 }
301 else {
302 fprintf(stderr, "Keeping equvialent `%s' and `%s'\n",
303 alist[i], alist[i-1]);
304 i++;
305 }
306 }
307 else i++;
308 }
309
310 *total = nn;
311 return alist;
312 }
313
314 static EncaCharsetFine*
refine_data(EncaCharsetRaw * raw,const int ncs,char ** alist,const int nn)315 refine_data(EncaCharsetRaw *raw, const int ncs, char **alist, const int nn)
316 {
317 int i;
318 EncaCharsetFine *fine;
319
320 fine = (EncaCharsetFine*)malloc(ncs*sizeof(EncaCharsetFine));
321
322 for (i = 0; i < ncs; i++) {
323 fine[i].enca = bin_search(alist, nn, raw[i].enca);
324 fine[i].rfc1345 = bin_search(alist, nn, raw[i].rfc1345);
325 fine[i].iconv = raw[i].iconv ? bin_search(alist, nn, raw[i].iconv) : -1;
326 fine[i].cstocs = raw[i].cstocs ? bin_search(alist, nn, raw[i].cstocs) : -1;
327 fine[i].mime = raw[i].mime ? bin_search(alist, nn, raw[i].mime) : -1;
328 fine[i].human = raw[i].human;
329 fine[i].flags = raw[i].flags;
330 fine[i].nsurface = raw[i].nsurface;
331 }
332
333 return fine;
334 }
335
336 static int*
create_index_list(EncaCharsetRaw * raw,const int ncs,char ** alist,const int nn)337 create_index_list(EncaCharsetRaw *raw, const int ncs,
338 char **alist, const int nn)
339 {
340 int i, k;
341 int *ilist;
342
343 ilist = (int*)malloc(nn*sizeof(int));
344
345 for (i = 0; i < ncs; i++) {
346 for (k = 0; k < raw[i].naliases; k++) {
347 ilist[bin_search(alist, nn, raw[i].aliases[k])] = i;
348 }
349 }
350
351 return ilist;
352 }
353
354 static void
print_fine_data(EncaCharsetFine * fine,const int ncs,int * ilist,char ** alist,const int nn)355 print_fine_data(EncaCharsetFine *fine, const int ncs,
356 int *ilist, char **alist, const int nn)
357 {
358 int i;
359
360 puts("/**** THIS IS A GENERATED FILE. DO NOT TOUCH! *****/");
361
362 puts("/* THIS IS A GENERATED TABLE, see tools/make_hash.c. */");
363 puts("static const EncaCharsetInfo CHARSET_INFO[] = {");
364 for (i = 0; i < ncs; i++) {
365 printf(" {\n"
366 " %d, %d, %d, %d, %d,\n"
367 " \"%s\",\n"
368 " %s,\n"
369 " %s\n"
370 " },\n",
371 fine[i].enca,
372 fine[i].rfc1345,
373 fine[i].cstocs,
374 fine[i].iconv,
375 fine[i].mime,
376 fine[i].human,
377 fine[i].flags,
378 fine[i].nsurface);
379 }
380 puts("};\n");
381
382 puts("/* THIS IS A GENERATED TABLE, see tools/make_hash.c. */");
383 puts("static const char *ALIAS_LIST[] = {");
384 for (i = 0; i < nn; i++) printf(" \"%s\",\n", alist[i]);
385 puts("};\n");
386
387 puts("/* THIS IS A GENERATED TABLE, see tools/make_hash.c. */");
388 puts("static const int INDEX_LIST[] = {");
389 for (i = 0; i < nn; i++) {
390 if (i%16 == 0) printf(" ");
391 printf("%2d, ", ilist[i]);
392 if (i%16 == 15 || i == nn-1) printf("\n");
393 }
394 puts("};\n");
395 }
396
397 int
main(void)398 main(void)
399 {
400 EncaCharsetRaw *raw;
401 EncaCharsetFine *fine;
402 char **alist;
403 int *ilist;
404 int ncs, nn;
405
406 raw = read_raw_charset_data(stdin, &ncs);
407 alist = build_alias_list(raw, ncs, &nn);
408 fine = refine_data(raw, ncs, alist, nn);
409 ilist = create_index_list(raw, ncs, alist, nn);
410 print_fine_data(fine, ncs, ilist, alist, nn);
411
412 free(fine);
413 free(ilist);
414
415 return 0;
416 }
417