1 /* dictfmt.c --
2  * Created: Sun Jul 20 20:17:11 1997 by faith@acm.org
3  * Revised: Sun Jul  5 19:25:18 1998 by faith@acm.org
4  * Copyright 1997, 1998 Rickard E. Faith (faith@acm.org)
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms of the GNU General Public License as published by the
8  * Free Software Foundation; either version 1, or (at your option) any
9  * later version.
10  *
11  * This program is distributed in the hope that it will be useful, but
12  * WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License along
17  * with this program; if not, write to the Free Software Foundation, Inc.,
18  * 675 Mass Ave, Cambridge, MA 02139, USA.
19  *
20  * $Id: dictfmt.c,v 1.6 1998/07/06 01:33:40 faith Exp $
21  *
22  * Sun Jul 5 18:48:33 1998: added patches for Gutenberg's '1995 CIA World
23  * Factbook' from David Frey <david@eos.lugs.ch>.
24  *
25  */
26 
27 #include "config.h"
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <time.h>
32 #include <ctype.h>
33 
34 #if HAVE_GETOPT_H
35 #include <getopt.h>
36 #endif
37 
/* Output layout: the column beyond which fmt_string() starts a new line,
   and the extra indentation applied after a printed headword. */
enum {
   FMT_MAXPOS = 65,
   FMT_INDENT = 0
};

/* Input formats that can be selected on the command line. */
enum {
   JARGON    = 1,
   FOLDOC    = 2,
   EASTON    = 3,
   PERIODIC  = 4,
   HITCHCOCK = 5,
   CIA1995   = 6
};

/* Line-length limit handed to fgets() when reading the input. */
enum { BSIZE = 10240 };

static int  Debug;              /* incremented once per -D option */
static FILE *str;               /* stream receiving the formatted data */
52 
/* The 64 digits of the encoding, indexed by their 6-bit value. */
static const unsigned char b64_list[] =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

/* |b64_encode| encodes |val| in a printable base 64 format.  A MSB-first
   encoding is generated, without leading zero digits (the value 0 is
   encoded as the single digit "A").

   Every bit of |val| is encoded, so values that do not fit in 32 bits
   simply produce more digits instead of being truncated.

   The result points into a static buffer that is overwritten by the next
   call; copy or print it before calling |b64_encode| again. */

static const char *b64_encode( unsigned long val )
{
   /* One digit per 6 bits (assuming 8-bit bytes), plus the terminator. */
   static char   result[sizeof(unsigned long) * 8 / 6 + 2];
   char          *pt = result + sizeof result - 1;

   *pt = '\0';

   /* Generate digits from the least significant end, filling the buffer
      backwards.  The do/while guarantees at least one digit; the bound
      check only matters on platforms with bytes wider than 8 bits. */
   do {
      *--pt = (char)b64_list[ val & 0x3f ];
      val >>= 6;
   } while (val && pt > result);

   return pt;
}
75 
76 static FILE *fmt_str;
77 static int  fmt_indent;
78 static int  fmt_pos;
79 static int  fmt_pending;
80 static int  fmt_hwcount;
81 static int  fmt_maxpos = FMT_MAXPOS;
82 
fmt_openindex(const char * filename)83 static void fmt_openindex( const char *filename )
84 {
85    char buffer[1024];
86 
87    if (!filename) return;
88 
89    sprintf( buffer, "sort -df > %s\n", filename );
90 
91    if (!(fmt_str = popen( buffer, "w" ))) {
92       fprintf( stderr, "Cannot open %s for write\n", buffer );
93       exit(1);
94    }
95 }
96 
fmt_newline(void)97 static void fmt_newline( void )
98 {
99    int i;
100 
101    fputc('\n', str);
102    for (i = 0; i < fmt_indent; i++) fputc(' ', str);
103    fmt_pos = fmt_indent;
104    fmt_pending = 0;
105 }
106 
fmt_string(const char * s)107 static void fmt_string( const char *s )
108 {
109    char *sdup = malloc( strlen(s) + 1 );
110    char *pt = sdup;
111    char *p = sdup;
112 #if 0
113    char *t;
114 #endif
115    int  len;
116 
117 #if 1
118    strcpy( sdup, s );
119 #else
120    for (t = sdup; *s; s++) {
121       if (*s == '_') *t++ = ' ';
122       else *t++ = *s;
123    }
124    *t = '\0';
125 #endif
126 
127    while ((pt = strchr(pt, ' '))) {
128       *pt++ = '\0';
129       len = strlen(p);
130       if (fmt_pending && fmt_pos + len > fmt_maxpos) {
131 	 fmt_newline();
132       }
133       if (fmt_pending) {
134 	 fputc(' ', str);
135 	 ++fmt_pos;
136 	 fmt_pending = 0;
137       }
138       fprintf( str, "%s", p );
139       fmt_pos += len;
140       p = pt;
141       fmt_pending = 1;
142    }
143 
144    len = strlen(p);
145    if (fmt_pending && fmt_pos + len > fmt_maxpos) {
146       fmt_newline();
147    }
148    if (len && fmt_pending) {
149       fputc(' ', str);
150       ++fmt_pos;
151       fmt_pending = 0;
152    }
153    if (!len) {
154       fmt_pending = 1;
155    } else {
156       fprintf( str, "%s", p );
157       fmt_pos += len;
158    }
159 
160    free(sdup);
161 }
162 
fmt_newheadword(const char * word,int flag)163 static void fmt_newheadword( const char *word, int flag )
164 {
165    static char prev[1024] = "";
166    static int  start = 0;
167    static int  end;
168 
169    fmt_indent = 0;
170    if (*prev) fmt_newline();
171    fflush(stdout);
172    end = ftell(str);
173 
174    if (fmt_str && *prev) {
175       fprintf( fmt_str, "%s\t%s\t", prev, b64_encode(start) );
176       fprintf( fmt_str, "%s\n", b64_encode(end-start) );
177    }
178    if (word) {
179       strcpy(prev,word);
180       start = end;
181       if (flag) {
182 	 fmt_string(word);
183 	 fmt_indent += FMT_INDENT;
184 	 fmt_newline();
185       }
186    }
187 
188    if (fmt_hwcount && !(fmt_hwcount % 100)) {
189       fprintf( stderr, "%10d headwords\r", fmt_hwcount );
190    }
191    ++fmt_hwcount;
192 }
193 
fmt_closeindex(void)194 static void fmt_closeindex( void )
195 {
196    fmt_newheadword(NULL,0);
197    if (fmt_str) pclose( fmt_str );
198    fprintf( stderr, "%12d headwords\n", fmt_hwcount );
199 }
200 
main(int argc,char ** argv)201 int main( int argc, char **argv )
202 {
203    int        c;
204    int        type = 0;
205    char       buffer[1024];
206    char       buffer2[1024];
207    char       indexname[1024];
208    char       dataname[1024];
209    const char *url = "unknown";
210    const char *sname = "unknown";
211    int        header = 0;
212    time_t     t;
213    char       *pt;
214    char       *s, *d;
215    char       *buf;
216 
217    while ((c = getopt(argc, argv, "jfephDu:s:c:")) != EOF)
218       switch (c) {
219       case 'j': type = JARGON;    break;
220       case 'f': type = FOLDOC;    break;
221       case 'e': type = EASTON;    break;
222       case 'p': type = PERIODIC;  break;
223       case 'h': type = HITCHCOCK; break;
224       case 'D': ++Debug;          break;
225       case 'u': url = optarg;     break;
226       case 's': sname = optarg;   break;
227       case 'c':
228 	 switch (*optarg) {
229 	 case '5': type = CIA1995; break;
230 	 default:  fprintf( stderr,
231 			    "Only CIA 1995 (-c5) currently supported\n" );
232 	 exit(1);
233 	 }
234 	 break;
235       default:
236 	 fprintf( stderr,
237 		  "usage: dictfmt [-jfephD] [-c5] -u url -s short basename\n");
238 	 exit(1);
239       }
240 
241    if (optind + 1 != argc) {
242       fprintf( stderr,
243 	       "usage: dictfmt [-jfephD] [-c5] -u url -s short basename\n" );
244       exit(1);
245    }
246 
247    sprintf( indexname, "%s.index", argv[optind] );
248    sprintf( dataname,  "%s.dict", argv[optind] );
249 
250    fmt_openindex( indexname );
251    if (Debug) {
252       str = stdout;
253    } else {
254       if (!(str = fopen(dataname, "w"))) {
255 	 fprintf(stderr, "Cannot open %s for write\n", dataname);
256 	 exit(1);
257       }
258    }
259 
260    fmt_newheadword("00-database-url",1);
261    fmt_string( "     " );
262    fmt_string( url );
263 
264    fmt_newheadword("00-database-short",1);
265    fmt_string( "     " );
266    fmt_string( sname );
267 
268    fmt_newheadword("00-database-info",1);
269    fmt_string("This file was converted from the original database on:" );
270    fmt_newline();
271    time(&t);
272    sprintf( buffer, "          %25.25s", ctime(&t) );
273    fmt_string( buffer );
274    fmt_newline();
275    fmt_newline();
276    fmt_string( "The original data is available from:" );
277    fmt_newline();
278    fmt_string( "     " );
279    fmt_string( url );
280    fmt_newline();
281    fmt_newline();
282    fmt_string(
283       "The original data was distributed with the notice shown below."
284       "  No additional restrictions are claimed.  Please redistribute"
285       " this changed version under the same conditions and restriction"
286       " that apply to the original version." );
287    fmt_newline();
288    fmt_indent += 3;
289    fmt_newline();
290    fmt_maxpos = 200;		/* Don't wrap */
291 
292    while (fgets(buf = buffer,BSIZE-1,stdin)) {
293       if (strlen(buffer))
294 	 buffer[strlen(buffer)-1] = '\0'; /* remove newline */
295 
296       switch (type) {
297       case HITCHCOCK:
298 	 if (strlen(buffer) == 1) {
299 	    header = 1;
300 	    continue;;
301 	 }
302 	 if (header) {
303 	    strcpy( buffer2, buffer );
304 	    if ((pt = strchr( buffer2, ','))) {
305 	       *pt = '\0';
306 	       fmt_newheadword(buffer2, 0);
307 	    }
308 	 }
309 	 break;
310       case EASTON:
311 	 strcpy( buffer2, buffer );
312 	 for (s = buffer2, d = buffer; *s; ++s) {
313 	    if (*s == '<') {
314 	       header = 1;
315 	       switch (s[1]) {
316 	       case 'I': *d++ = '_'; break;
317 	       case 'A':
318 		  if (s[3] == 'N') goto skip;
319 		  *d++ = '{';
320 		  break;
321 	       case 'P': goto skip;
322 	       case 'B': goto copy;
323 	       case '/':
324 		  switch(s[2]) {
325 		  case 'I': *d++ = '_'; break;
326 		  case 'A': *d++ = '}'; break;
327 		  case 'B': goto copy;
328 		  default:
329 		     fprintf( stderr,
330 			      "Unknown tag: %s (%c%c)\n",
331 			      buffer2, s[1], s[2] );
332 		     exit(1);
333 		  }
334 		  break;
335 	       default:
336 		  fprintf( stderr, "Unknown tag: %s (%c)\n", buffer2, s[1] );
337 		  exit(1);
338 	       }
339 	       while (*s && *s != '>') s++;
340 	       continue;
341 	    }
342       copy:
343 	    *d++ = *s;
344 	 }
345 	 *d = '\0';
346 #if 0
347 	 printf( "BEFORE: %s\n", buffer2 );
348 	 printf( "AFTER: %s\n", buffer );
349 #endif
350 
351 	 if (*buffer == '<') {
352 	    switch (buffer[1]) {
353 	    case 'B':
354 	       if ((pt = strstr( buffer+3, " - </B>" ))) {
355 		  *pt = '\0';
356 		  fmt_newheadword(buffer+3, 0);
357 		  fmt_indent += 3;
358 		  memmove( buf, buffer+3, strlen(buffer+3)+1 );
359 	       } else {
360 		  fprintf( stderr, "No end: %s\n", buffer );
361 		  exit(1);
362 	       }
363 	       break;
364 	    default:
365 	       fprintf( stderr, "Unknown: %s\n", buffer );
366 	       exit(1);
367 	    }
368 	 } else {
369 	    if (buffer[0] == ' ' && buffer[1] == ' ') fmt_newline();
370 	 }
371 	 break;
372       case JARGON:
373 	 switch (*buffer) {
374 	 case ':':
375 	    header = 1;
376 	    if ((pt = strchr( buffer+1, ':' ))) {
377 	       s = pt + 1;
378 	       if (*s == ':') ++s;
379 
380 	       *pt = '\0';
381 	       fmt_newheadword(buffer+1, 0);
382 
383 	       memmove( buf, buffer+1, strlen(buffer+1));
384 	       memmove( pt-1, s, strlen(s)+1 ); /* move \0 also */
385 	    }
386 	    break;
387 	 case '*':
388 	 case '=':
389 	 case '-':
390 	    if (buffer[0] == buffer[1]
391 		&& buffer[0] == buffer[2]
392 		&& buffer[0] == buffer[3])
393 	       continue;		/* Skip lines with *'s and ='s */
394 	 }
395 	 break;
396       case PERIODIC:
397 	 switch (*buffer) {
398 	 case '%':
399 	    if (buffer[1] == 'h') {
400 	       if (!header) {
401 		  header = 1;
402 		  continue;
403 	       } else {
404 		  fmt_newheadword(buffer+3,1);
405 		  continue;
406 	       }
407 	    } else if (buffer[1] == 'd') {
408 	       continue;
409 	    }
410 	    break;
411 	 }
412 	 break;
413       case FOLDOC:
414 	 if (*buffer && *buffer != ' ' && *buffer != '\t') {
415 	    if (header < 2) {
416 	       ++header;
417 	    } else {
418 	       fmt_newheadword(buffer,1);
419 	       continue;
420 	    }
421 	 }
422 	 if (*buf == '\t') {
423 	    memmove( buf+2, buf, strlen(buf)+1 ); /* move \0 */
424 	    buf[0] = buf[1] = buf[2] = ' ';
425 	 }
426 	 break;
427       case CIA1995:
428 	 if (*buffer == '@') {
429 	    buf++;
430 	 } else if (strncmp(buffer, "_____",5) == 0) {
431 	    fgets(buf = buffer,BSIZE-1,stdin); /* empty line */
432 
433 	    fgets(buf = buffer,BSIZE-1,stdin);
434 	    if (strlen(buffer))
435 	       buffer[strlen(buffer)-1] = '\0'; /* remove newline */
436 
437 	    while (isspace(*buf)) buf++;
438 	    if (*buf != '\0') {
439 	       char *p;
440 	       int l; /* flag: downcase the letter */
441 
442 	       header=1;
443 
444 	       /* Downcase `buf' sensibly */
445 
446 	       /* Leave first character in upper case,
447 		  but downcase the rest */
448 	       p=buf; p++; l=1;
449 	       while (*p != '\0') {
450 		  if (isspace(*p)) l=0;
451 		  else {
452 		     if (l) *p=tolower(*p);
453 		     l=1;
454 		  }
455 		  p++;
456 	       }
457 	       fmt_newheadword(buf,0);
458 	    }
459  	 }
460  	 break;
461       default:
462 	 fprintf(stderr, "Unknown input format type %d\n", type );
463 	 exit(2);
464       }
465       fmt_string(buf);
466       fmt_newline();
467  skip:
468    }
469 
470    fmt_newheadword(NULL,0);
471 
472    fmt_closeindex();
473    fclose(str);
474    return 0;
475 }
476