/* dictfmt.c --
 * Created: Sun Jul 20 20:17:11 1997 by faith@acm.org
 * Revised: Sun Jul  5 19:25:18 1998 by faith@acm.org
 * Copyright 1997, 1998 Rickard E. Faith (faith@acm.org)
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation; either version 1, or (at your option) any
 * later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * $Id: dictfmt.c,v 1.6 1998/07/06 01:33:40 faith Exp $
 *
 * Sun Jul  5 18:48:33 1998: added patches for Gutenberg's '1995 CIA World
 * Factbook' from David Frey <david@eos.lugs.ch>.
 *
 */
26
#include "config.h"
#include <ctype.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#if HAVE_GETOPT_H
#include <getopt.h>
#endif
37
/* Initial value of the wrap column used by the text formatter. */
#define FMT_MAXPOS 65

/* Extra indentation applied after a headword that is itself printed. */
#define FMT_INDENT 0

/* Input formats selectable on the command line.  Zero is reserved for
   "no format chosen", so the values start at 1. */
enum {
   JARGON    = 1,		/* -j  */
   FOLDOC    = 2,		/* -f  */
   EASTON    = 3,		/* -e  */
   PERIODIC  = 4,		/* -p  */
   HITCHCOCK = 5,		/* -h  */
   CIA1995   = 6		/* -c5 */
};

/* Size limit used when reading one line of input. */
#define BSIZE 10240
49
/* Incremented once per -D option; when nonzero the formatted database is
   written to stdout instead of the .dict file. */
static int  Debug;

/* Stream that receives the formatted database text. */
static FILE *str;
52
/* Digits of the base 64 alphabet, indexed by their 6-bit value. */
static const unsigned char b64_list[] =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

/* |b64_encode| encodes |val| in a printable base 64 format.  A MSB-first
   encoding is generated.  Leading zero digits ('A') are dropped, so the
   result has at least one digit and zero is encoded as "A".

   Every bit of |val| is encoded, whatever the width of unsigned long; for
   values below 2^32 the result is the classic encoding of at most six
   digits.

   The returned string lives in a static buffer that is overwritten by the
   next call, so it has to be consumed (or copied) before calling again. */

static const char *b64_encode( unsigned long val )
{
				/* one digit per 6 bits, rounded up, + NUL */
   static char result[(sizeof(unsigned long) * CHAR_BIT + 5) / 6 + 1];
   char        *pt = result + sizeof(result) - 1;

   *pt = '\0';
   do {				/* build right to left, low digit first */
      *--pt = (char)b64_list[ val & 0x3f ];
      val >>= 6;
   } while (val);

   return pt;
}
75
76 static FILE *fmt_str;
77 static int fmt_indent;
78 static int fmt_pos;
79 static int fmt_pending;
80 static int fmt_hwcount;
81 static int fmt_maxpos = FMT_MAXPOS;
82
fmt_openindex(const char * filename)83 static void fmt_openindex( const char *filename )
84 {
85 char buffer[1024];
86
87 if (!filename) return;
88
89 sprintf( buffer, "sort -df > %s\n", filename );
90
91 if (!(fmt_str = popen( buffer, "w" ))) {
92 fprintf( stderr, "Cannot open %s for write\n", buffer );
93 exit(1);
94 }
95 }
96
fmt_newline(void)97 static void fmt_newline( void )
98 {
99 int i;
100
101 fputc('\n', str);
102 for (i = 0; i < fmt_indent; i++) fputc(' ', str);
103 fmt_pos = fmt_indent;
104 fmt_pending = 0;
105 }
106
fmt_string(const char * s)107 static void fmt_string( const char *s )
108 {
109 char *sdup = malloc( strlen(s) + 1 );
110 char *pt = sdup;
111 char *p = sdup;
112 #if 0
113 char *t;
114 #endif
115 int len;
116
117 #if 1
118 strcpy( sdup, s );
119 #else
120 for (t = sdup; *s; s++) {
121 if (*s == '_') *t++ = ' ';
122 else *t++ = *s;
123 }
124 *t = '\0';
125 #endif
126
127 while ((pt = strchr(pt, ' '))) {
128 *pt++ = '\0';
129 len = strlen(p);
130 if (fmt_pending && fmt_pos + len > fmt_maxpos) {
131 fmt_newline();
132 }
133 if (fmt_pending) {
134 fputc(' ', str);
135 ++fmt_pos;
136 fmt_pending = 0;
137 }
138 fprintf( str, "%s", p );
139 fmt_pos += len;
140 p = pt;
141 fmt_pending = 1;
142 }
143
144 len = strlen(p);
145 if (fmt_pending && fmt_pos + len > fmt_maxpos) {
146 fmt_newline();
147 }
148 if (len && fmt_pending) {
149 fputc(' ', str);
150 ++fmt_pos;
151 fmt_pending = 0;
152 }
153 if (!len) {
154 fmt_pending = 1;
155 } else {
156 fprintf( str, "%s", p );
157 fmt_pos += len;
158 }
159
160 free(sdup);
161 }
162
fmt_newheadword(const char * word,int flag)163 static void fmt_newheadword( const char *word, int flag )
164 {
165 static char prev[1024] = "";
166 static int start = 0;
167 static int end;
168
169 fmt_indent = 0;
170 if (*prev) fmt_newline();
171 fflush(stdout);
172 end = ftell(str);
173
174 if (fmt_str && *prev) {
175 fprintf( fmt_str, "%s\t%s\t", prev, b64_encode(start) );
176 fprintf( fmt_str, "%s\n", b64_encode(end-start) );
177 }
178 if (word) {
179 strcpy(prev,word);
180 start = end;
181 if (flag) {
182 fmt_string(word);
183 fmt_indent += FMT_INDENT;
184 fmt_newline();
185 }
186 }
187
188 if (fmt_hwcount && !(fmt_hwcount % 100)) {
189 fprintf( stderr, "%10d headwords\r", fmt_hwcount );
190 }
191 ++fmt_hwcount;
192 }
193
fmt_closeindex(void)194 static void fmt_closeindex( void )
195 {
196 fmt_newheadword(NULL,0);
197 if (fmt_str) pclose( fmt_str );
198 fprintf( stderr, "%12d headwords\n", fmt_hwcount );
199 }
200
main(int argc,char ** argv)201 int main( int argc, char **argv )
202 {
203 int c;
204 int type = 0;
205 char buffer[1024];
206 char buffer2[1024];
207 char indexname[1024];
208 char dataname[1024];
209 const char *url = "unknown";
210 const char *sname = "unknown";
211 int header = 0;
212 time_t t;
213 char *pt;
214 char *s, *d;
215 char *buf;
216
217 while ((c = getopt(argc, argv, "jfephDu:s:c:")) != EOF)
218 switch (c) {
219 case 'j': type = JARGON; break;
220 case 'f': type = FOLDOC; break;
221 case 'e': type = EASTON; break;
222 case 'p': type = PERIODIC; break;
223 case 'h': type = HITCHCOCK; break;
224 case 'D': ++Debug; break;
225 case 'u': url = optarg; break;
226 case 's': sname = optarg; break;
227 case 'c':
228 switch (*optarg) {
229 case '5': type = CIA1995; break;
230 default: fprintf( stderr,
231 "Only CIA 1995 (-c5) currently supported\n" );
232 exit(1);
233 }
234 break;
235 default:
236 fprintf( stderr,
237 "usage: dictfmt [-jfephD] [-c5] -u url -s short basename\n");
238 exit(1);
239 }
240
241 if (optind + 1 != argc) {
242 fprintf( stderr,
243 "usage: dictfmt [-jfephD] [-c5] -u url -s short basename\n" );
244 exit(1);
245 }
246
247 sprintf( indexname, "%s.index", argv[optind] );
248 sprintf( dataname, "%s.dict", argv[optind] );
249
250 fmt_openindex( indexname );
251 if (Debug) {
252 str = stdout;
253 } else {
254 if (!(str = fopen(dataname, "w"))) {
255 fprintf(stderr, "Cannot open %s for write\n", dataname);
256 exit(1);
257 }
258 }
259
260 fmt_newheadword("00-database-url",1);
261 fmt_string( " " );
262 fmt_string( url );
263
264 fmt_newheadword("00-database-short",1);
265 fmt_string( " " );
266 fmt_string( sname );
267
268 fmt_newheadword("00-database-info",1);
269 fmt_string("This file was converted from the original database on:" );
270 fmt_newline();
271 time(&t);
272 sprintf( buffer, " %25.25s", ctime(&t) );
273 fmt_string( buffer );
274 fmt_newline();
275 fmt_newline();
276 fmt_string( "The original data is available from:" );
277 fmt_newline();
278 fmt_string( " " );
279 fmt_string( url );
280 fmt_newline();
281 fmt_newline();
282 fmt_string(
283 "The original data was distributed with the notice shown below."
284 " No additional restrictions are claimed. Please redistribute"
285 " this changed version under the same conditions and restriction"
286 " that apply to the original version." );
287 fmt_newline();
288 fmt_indent += 3;
289 fmt_newline();
290 fmt_maxpos = 200; /* Don't wrap */
291
292 while (fgets(buf = buffer,BSIZE-1,stdin)) {
293 if (strlen(buffer))
294 buffer[strlen(buffer)-1] = '\0'; /* remove newline */
295
296 switch (type) {
297 case HITCHCOCK:
298 if (strlen(buffer) == 1) {
299 header = 1;
300 continue;;
301 }
302 if (header) {
303 strcpy( buffer2, buffer );
304 if ((pt = strchr( buffer2, ','))) {
305 *pt = '\0';
306 fmt_newheadword(buffer2, 0);
307 }
308 }
309 break;
310 case EASTON:
311 strcpy( buffer2, buffer );
312 for (s = buffer2, d = buffer; *s; ++s) {
313 if (*s == '<') {
314 header = 1;
315 switch (s[1]) {
316 case 'I': *d++ = '_'; break;
317 case 'A':
318 if (s[3] == 'N') goto skip;
319 *d++ = '{';
320 break;
321 case 'P': goto skip;
322 case 'B': goto copy;
323 case '/':
324 switch(s[2]) {
325 case 'I': *d++ = '_'; break;
326 case 'A': *d++ = '}'; break;
327 case 'B': goto copy;
328 default:
329 fprintf( stderr,
330 "Unknown tag: %s (%c%c)\n",
331 buffer2, s[1], s[2] );
332 exit(1);
333 }
334 break;
335 default:
336 fprintf( stderr, "Unknown tag: %s (%c)\n", buffer2, s[1] );
337 exit(1);
338 }
339 while (*s && *s != '>') s++;
340 continue;
341 }
342 copy:
343 *d++ = *s;
344 }
345 *d = '\0';
346 #if 0
347 printf( "BEFORE: %s\n", buffer2 );
348 printf( "AFTER: %s\n", buffer );
349 #endif
350
351 if (*buffer == '<') {
352 switch (buffer[1]) {
353 case 'B':
354 if ((pt = strstr( buffer+3, " - </B>" ))) {
355 *pt = '\0';
356 fmt_newheadword(buffer+3, 0);
357 fmt_indent += 3;
358 memmove( buf, buffer+3, strlen(buffer+3)+1 );
359 } else {
360 fprintf( stderr, "No end: %s\n", buffer );
361 exit(1);
362 }
363 break;
364 default:
365 fprintf( stderr, "Unknown: %s\n", buffer );
366 exit(1);
367 }
368 } else {
369 if (buffer[0] == ' ' && buffer[1] == ' ') fmt_newline();
370 }
371 break;
372 case JARGON:
373 switch (*buffer) {
374 case ':':
375 header = 1;
376 if ((pt = strchr( buffer+1, ':' ))) {
377 s = pt + 1;
378 if (*s == ':') ++s;
379
380 *pt = '\0';
381 fmt_newheadword(buffer+1, 0);
382
383 memmove( buf, buffer+1, strlen(buffer+1));
384 memmove( pt-1, s, strlen(s)+1 ); /* move \0 also */
385 }
386 break;
387 case '*':
388 case '=':
389 case '-':
390 if (buffer[0] == buffer[1]
391 && buffer[0] == buffer[2]
392 && buffer[0] == buffer[3])
393 continue; /* Skip lines with *'s and ='s */
394 }
395 break;
396 case PERIODIC:
397 switch (*buffer) {
398 case '%':
399 if (buffer[1] == 'h') {
400 if (!header) {
401 header = 1;
402 continue;
403 } else {
404 fmt_newheadword(buffer+3,1);
405 continue;
406 }
407 } else if (buffer[1] == 'd') {
408 continue;
409 }
410 break;
411 }
412 break;
413 case FOLDOC:
414 if (*buffer && *buffer != ' ' && *buffer != '\t') {
415 if (header < 2) {
416 ++header;
417 } else {
418 fmt_newheadword(buffer,1);
419 continue;
420 }
421 }
422 if (*buf == '\t') {
423 memmove( buf+2, buf, strlen(buf)+1 ); /* move \0 */
424 buf[0] = buf[1] = buf[2] = ' ';
425 }
426 break;
427 case CIA1995:
428 if (*buffer == '@') {
429 buf++;
430 } else if (strncmp(buffer, "_____",5) == 0) {
431 fgets(buf = buffer,BSIZE-1,stdin); /* empty line */
432
433 fgets(buf = buffer,BSIZE-1,stdin);
434 if (strlen(buffer))
435 buffer[strlen(buffer)-1] = '\0'; /* remove newline */
436
437 while (isspace(*buf)) buf++;
438 if (*buf != '\0') {
439 char *p;
440 int l; /* flag: downcase the letter */
441
442 header=1;
443
444 /* Downcase `buf' sensibly */
445
446 /* Leave first character in upper case,
447 but downcase the rest */
448 p=buf; p++; l=1;
449 while (*p != '\0') {
450 if (isspace(*p)) l=0;
451 else {
452 if (l) *p=tolower(*p);
453 l=1;
454 }
455 p++;
456 }
457 fmt_newheadword(buf,0);
458 }
459 }
460 break;
461 default:
462 fprintf(stderr, "Unknown input format type %d\n", type );
463 exit(2);
464 }
465 fmt_string(buf);
466 fmt_newline();
467 skip:
468 }
469
470 fmt_newheadword(NULL,0);
471
472 fmt_closeindex();
473 fclose(str);
474 return 0;
475 }
476