1 /* -*-C-*-
2 *******************************************************************************
3 *
4 * File: makeconcfile.c
5 * RCS: $Header: /home/matthew/cvs/bible-kjv-4.10/makeconcfile.c,v 2.7 2005/01/23 11:27:00 matthew Exp $
6 * Description: Generate Concordance Data File
7 * Author: Chip Chapin
8 * Created: Thu Dec 17 11:28:20 1992
9 * Modified: Mon Apr 26 11:14:38 1993 (Chip Chapin) chip@hpclbis
10 * Language: C
11 * Package: Text Storage Library (Bible Retrieval System)
12 * Status: Experimental (Do Not Distribute)
13 *
14 *******************************************************************************
15 *
16 * Revisions:
17 *
18 * Thu Jan 7 12:07:05 1993 (Chip Chapin) chip@hpclbis
19 * Revised fileheader.
20 * Implemented cumulative index entries to save space (24kbytes for KJV).
21 * Wed Jan 6 10:08:12 1993 (Chip Chapin) chip@hpclbis
22 * Began implementation of list compression.
23 * Wed Dec 23 13:43:11 1992 (Chip Chapin) chip@hpclbis
24 * Unlink tmp files when done.
25 *******************************************************************************
26 * $Log: makeconcfile.c,v $
27 * Revision 2.7 2005/01/23 11:27:00 matthew
28 * include more standard header files
29 *
30 * Revision 2.6 2005/01/22 18:07:28 matthew
31 * prototype main properly
32 *
33 * Revision 2.5 2005/01/22 17:17:54 matthew
34 * cast return value of sizeof
35 *
36 * Revision 2.4 2005/01/22 16:28:28 matthew
37 * initialise cmpnum.
38 *
39 * Revision 2.3 2005/01/22 16:27:34 matthew
40 * sort out function prototyping, and include util.h
41 *
42 * Revision 2.2 2003/07/26 10:02:53 matthew
43 * fprintf("\0") is not very useful
44 *
45 * Revision 2.1 2003/02/01 02:39:53 matthew
46 * make main int main ..., and accordingly return 0 at the end of it, if
47 * all is well
48 *
49 * Revision 2.0 2003/01/08 15:29:52 matthew
50 * versions collected from the net
51 *
52 * Revision 1.5 93/04/26 11:18:17 11:18:17 chip (Chip Chapin)
53 * Release 4.00
54 * Public release of portable datafile version.
55 *
56 * Revision 1.4 93/04/23 13:08:07 13:08:07 chip (Chip Chapin)
57 * PORTABILITY RELEASE
58 * This version supports portable data files, usable on machines with
59 * differing native byte-orders.
60 * Also, this version compiles and runs on non-HPUX systems. It has been
61 * tested on SunOS 4.? and ULTRIX 4.?, using SPARC and DEC 3100 hardware
62 * respectively. Note that the data file format has rolled again.
63 *
64 * Revision 1.3 93/01/07 12:18:06 12:18:06 chip (Chip Chapin)
65 * Release 3.01: Greatly improved compression of concordance data file.
66 *
67 *
68 *******************************************************************************
69 */
70
71 #include <stdio.h>
72 #include <errno.h>
73 #include <unistd.h>
74 #include <string.h>
75 #include <stdlib.h>
76 #include "tsl.h"
77 #include "util.h"
78
79 #define FALSE (0)
80 #define TRUE (1)
81
82 #define INDEXTMPF "/tmp/mci"
83 #define DATATMPF "/tmp/mcd"
84
85 static void outerr(int n);
86 static void print_header(void);
87
88 char *myname; /* Program Name */
89
90 /*
91 Structure of Input Data ...
92
93 Input data is a sorted ASCII file.
94 Each line is a record.
95 Each record is a variable-length list of fields, separated by blanks,
96 as follows:
97
98 <word> <ref> {<ref>...}
99
100 where <word> is a lower-case alpha string, and <ref> is an integer
101 numeric string. Each <word> is guaranteed to be unique. <ref> is
102 expected to describe a location in the subject document where
103 <word> occurs, though this program doesn't actually care about
104 that.
105
106 Structure of Output Data ...
107
108 Please refer to description in tsl.h.
109 */
110
111 struct tsl_conc_fileheader fh;
112
113
main(int argc,char ** argv)114 int main(int argc,char **argv)
115 /*----------------------------------------------------------------------
116 | NAME:
117 | main
118 |
119 | ALGORITHM:
120 | Main Program.
121 |
122 | HISTORY:
123 | 921221 cc Initial creation.
124 |
125 \*----------------------------------------------------------------------*/
126 {
127 FILE *outfp, *indexfp, *datafp;
128 int ref; /* NOT a ref_t, because it must be signed */
129 ref_t rbuf[SELECTSZ];
130 unsigned char obuf[SELECTSZ*sizeof(ref_t)];
131 int n, oinx, cnt;
132 unsigned char bhi, blo;
133 int refs_left;
134 int no_comp;
135 int cmpnum=0;
136 int base, curbase;
137 int m_n, m_ref, m_cmpnum, m_cmpsize;
138 file_ptr_t word_index, data_index;
139 int entry_size;
140 short int this_index;
141 Short_Univ_Int indextmp;
142 #define WRDSZ 100
143 char word[WRDSZ];
144
145 myname = *argv++; argc--; /* Program name */
146
147 if (argc < 1) {
148 fprintf( stderr, "%s: Missing output file name\n", myname );
149 fprintf( stderr, "Usage: %s datafile\n", myname );
150 exit( 1 );
151 }
152
153 outfp = fopen( *argv, "w" );
154 if (outfp == NULL) {
155 fprintf( stderr, "%s: Cannot open output file %s\n", myname, *argv );
156 exit( 1 );
157 }
158 indexfp = fopen( INDEXTMPF, "w+" );
159 datafp = fopen( DATATMPF, "w+" );
160 if ((indexfp == NULL) || (datafp == NULL)) {
161 fprintf( stderr, "%s: Cannot open temp files.\n", myname );
162 exit( 1 );
163 }
164
165 /* Initialize the header and write it out... */
166 fh.magic[0] = TSL_CONCMAGIC1;
167 fh.magic[1] = TSL_CONCMAGIC2;
168 fh.version[0] = TSL_CONCFVERSION1;
169 fh.version[1] = TSL_CONCFVERSION2;
170 strncpy( fh.name, "KJV Concordance", TSL_DESCRSZ );
171 univ_assign(fh.word_ptr, sizeof( fh ));
172 univ_assign(fh.word_cnt, 0);
173 univ_assign(fh.index_ptr, 0);
174 univ_assign(fh.data_ptr, 0);
175 fwrite( &fh, sizeof(fh), 1, outfp );
176
177
178 /* Process input data.
179 Each line consists of a word, followed by a list of numeric
180 references where the word occurs. Separated by blanks.
181
182 We will write the strings for the word list into the output file
183 as we go, meanwhile counting the words and building both the reference
184 index and the reference data pool in temp files.
185
186 To save space, the reference index entries are not actually pointers
187 into the reference data pool, but are the SIZE of the entry's ref data
188 pool.
189
190 So the offset in the ref data pool for entry N is the
191 sum over i=0..N-1 of index(i).
192
193 To complicate things more, there is a special case when an
194 entry N has only a single ref. In that case, the ref is NOT
195 entered into the ref data pool but the ref itself is stored as
196 index(N). It is negated to distinguish it from a regular index
197 entry.
198
199 */
200 word_index = 0; /* The index of each word, 0..N */
201 data_index = 0; /* The offset in the ref data pool of current entry */
202 while (scanf( "%s", word) > 0) {
203 /* Append string to word list */
204 if ((n=fputs( word, outfp )) <= 0)
205 outerr(n);
206 putc( 0, outfp );
207
208 /* Start reading references */
209 cnt = 0;
210 while (scanf( "%d", &n) > 0) {
211 /* Read all refs into buffer */
212 rbuf[cnt++] = (ref_t) n;
213 }
214 if (cnt <= 0) {
215 /* this should never happen */
216 fprintf( stderr, "%s: Bad input format. No refs for %s\n",
217 myname, word );
218 }
219
220 entry_size = 0; /* size (bytes) of current ref data entry */
221 if (cnt == 1) {
222 /* Special handling for this case: word has a SINGLE REFERENCE.
223 Instead of putting the ref into the data pool, just keep it
224 in the index. Encode it by negating it.
225 */
226 this_index = -rbuf[0];
227 } else {
228 /* Write reference list to reference data pool */
229 for (n=oinx=0, refs_left=cnt; refs_left--; n++) {
230 ref = rbuf[n];
231 no_comp = TRUE;
232 if (refs_left && (rbuf[n+1]-ref <= 16)) {
233 /* We can compress at least one (though possibly to
234 no advantage). How many more might we compress?
235 */
236 cmpnum=1;
237 while ((cmpnum < refs_left) &&
238 (rbuf[n+cmpnum+1]-rbuf[n+cmpnum] <= 16)) {
239 cmpnum++;
240 }
241
242 /*
243 How much will we really save?
244 Model the actual algorithm and compare...
245 */
246 m_cmpsize = 0;
247 curbase = ref;
248 m_n = n;
249 m_ref = rbuf[++m_n]-curbase-1;
250 m_cmpnum = cmpnum-1;
251 while (m_ref >= 0) {
252 /* For each byte in bitmap... */
253 while ((m_ref < 8) && (m_ref >= 0)) {
254 /* For each ref that fits in this byte... */
255 if (m_cmpnum > 0) {
256 m_cmpnum--;
257 m_ref = rbuf[++m_n]-curbase-1;
258 } else {
259 m_ref = -1; /* term. flag, both loops */
260 }
261 }
262 m_cmpsize++;
263 curbase += 8;
264 m_ref -= 8;
265 }
266 m_cmpsize += 2; /* Add terminator size */
267
268 /* Well, did it pay off? */
269 no_comp = (m_cmpsize > (cmpnum*(int)sizeof(ref_t)));
270 }
271 if (no_comp) {
272 /* No compression */
273 bhi = (ref >> 8);
274 blo = ref & 0x00ff;
275 obuf[oinx++] = bhi;
276 obuf[oinx++] = blo;
277 } else {
278 /* Do the compression.
279 Format:
280 <base address><bitmap>
281 Where <base address> is the NEGATED reference to the
282 line BEFORE the start of the bitmap refs.
283 */
284 /* Write base address */
285 base = ref;
286 ref = -ref;
287 bhi = (ref >> 8);
288 blo = ref & 0x00ff;
289 obuf[oinx++] = bhi;
290 obuf[oinx++] = blo;
291
292 /* Write bitmap.
293 ref is always with respect to curbase.
294 It is the number of verses, MINUS ONE, that
295 separate them. The MINUS ONE is so the
296 left-shift works correctly.
297 */
298 curbase = base;
299 ref = rbuf[++n]-curbase-1;
300 refs_left -= cmpnum;
301 cmpnum--;
302 while (ref >= 0) {
303 /* For each byte in bitmap... */
304 blo = 0;
305 while ((ref < 8) && (ref >= 0)) {
306 /* For each ref that fits in this byte... */
307 blo |= 0x01 << ref;
308 if (cmpnum > 0) {
309 cmpnum--;
310 ref = rbuf[++n]-curbase-1;
311 } else {
312 ref = -1; /* term. flag, both loops */
313 }
314 }
315 obuf[oinx++] = blo;
316 curbase += 8;
317 ref -= 8;
318 }
319
320 /* Write TWO terminator bytes */
321 /* Might be better to use ONE, and change the requirement
322 to being within 8 lines, not 16 */
323 obuf[oinx++] = 0;
324 obuf[oinx++] = 0;
325 } /* compression */
326 } /* for */
327 if ((n=fwrite( obuf, 1, oinx, datafp )) <= 0)
328 outerr(n);
329
330 entry_size = oinx;
331 if (entry_size > 32767) {
332 fprintf( stderr, "%s: Too many entries for \"%s\"!\n",
333 myname, word );
334 exit(1);
335 }
336 this_index = entry_size;
337 } /* write ref list */
338
339 /* Write the index entry for this word */
340 shortuniv_assign( indextmp, this_index );
341 if ((n=fwrite( indextmp, sizeof(Short_Univ_Int), 1, indexfp )) <= 0)
342 outerr(n);
343
344 data_index += entry_size;
345 word_index++;
346 }
347
348 /* It makes searching easier if we guarantee an ending string */
349 if ((n=fprintf( outfp, "~" )) <= 0)
350 outerr(n);
351 shortuniv_assign( indextmp, 0 );
352 if ((n=fwrite( indextmp, sizeof(Short_Univ_Int), 1, indexfp )) <= 0)
353 outerr(n);
354
355 /* OK, now finish putting the output file together. */
356 /* First, another header field */
357 univ_assign(fh.word_cnt, word_index);
358
359 /* Now copy the reference index from temp file to output file */
360 univ_assign(fh.index_ptr, ftell( outfp ));
361 rewind( indexfp );
362 while (fread( obuf, sizeof(Short_Univ_Int), 1, indexfp ) > 0) {
363 fwrite( obuf, sizeof(Short_Univ_Int), 1, outfp );
364 }
365
366 /* Then copy the reference data from temp file to output file */
367 univ_assign(fh.data_ptr, ftell( outfp ));
368 rewind( datafp );
369 while (fread( obuf, 1, 1, datafp ) > 0) {
370 fwrite( obuf, 1, 1, outfp );
371 }
372
373 /* Finally, write out the updated file header */
374 rewind( outfp );
375 fwrite( &fh, sizeof(fh), 1, outfp );
376
377 /* Prove that we've been working hard... */
378 print_header();
379
380 fclose(outfp);
381 fclose(indexfp);
382 fclose(datafp);
383 unlink( INDEXTMPF );
384 unlink( DATATMPF );
385 return(0);
386 } /* main */
387
388
outerr(int n)389 static void outerr(int n)
390 {
391 fprintf( stderr, "%s: File I/O error #%d, %d\n", __FILE__, n, errno );
392 }
393
394
print_header(void)395 static void print_header(void)
396 {
397 printf( "Concordance data file:\n" );
398 printf( " Version: %c%c\n", fh.version[0], fh.version[1] );
399 printf( " Name: %s\n", fh.name );
400 printf( " Contents: %d words\n", univ2int(fh.word_cnt) );
401 printf( " Word list at file offset %d\n", univ2int(fh.word_ptr) );
402 printf( " Index at file offset %d\n", univ2int(fh.index_ptr) );
403 printf( " Data at file offset %d\n", univ2int(fh.data_ptr) );
404 }
405