1 /* -*-C-*-
2 *******************************************************************************
3 *
4 * File:         makeconcfile.c
5 * RCS:          $Header: /home/matthew/cvs/bible-kjv-4.10/makeconcfile.c,v 2.7 2005/01/23 11:27:00 matthew Exp $
6 * Description:  Generate Concordance Data File
7 * Author:       Chip Chapin
8 * Created:      Thu Dec 17 11:28:20 1992
9 * Modified:     Mon Apr 26 11:14:38 1993 (Chip Chapin) chip@hpclbis
10 * Language:     C
11 * Package:      Text Storage Library (Bible Retrieval System)
12 * Status:       Experimental (Do Not Distribute)
13 *
14 *******************************************************************************
15 *
16 * Revisions:
17 *
18 * Thu Jan  7 12:07:05 1993 (Chip Chapin) chip@hpclbis
19 *  Revised fileheader.
20 *  Implemented cumulative index entries to save space (24kbytes for KJV).
21 * Wed Jan  6 10:08:12 1993 (Chip Chapin) chip@hpclbis
22 *  Began implementation of list compression.
23 * Wed Dec 23 13:43:11 1992 (Chip Chapin) chip@hpclbis
24 *  Unlink tmp files when done.
25 *******************************************************************************
26 * $Log: makeconcfile.c,v $
27 * Revision 2.7  2005/01/23 11:27:00  matthew
28 * include more standard header files
29 *
30 * Revision 2.6  2005/01/22 18:07:28  matthew
31 * prototype main properly
32 *
33 * Revision 2.5  2005/01/22 17:17:54  matthew
34 * cast return value of sizeof
35 *
36 * Revision 2.4  2005/01/22 16:28:28  matthew
37 * initialise cmpnum.
38 *
39 * Revision 2.3  2005/01/22 16:27:34  matthew
40 * sort out function prototyping, and include util.h
41 *
42 * Revision 2.2  2003/07/26 10:02:53  matthew
43 * fprintf("\0") is not very useful
44 *
45 * Revision 2.1  2003/02/01 02:39:53  matthew
46 * make main int main ..., and accordingly return 0 at the end of it, if
47 * all is well
48 *
49 * Revision 2.0  2003/01/08 15:29:52  matthew
50 * versions collected from the net
51 *
52  * Revision 1.5  93/04/26  11:18:17  11:18:17  chip (Chip Chapin)
53  * Release 4.00
54  * Public release of portable datafile version.
55  *
56  * Revision 1.4  93/04/23  13:08:07  13:08:07  chip (Chip Chapin)
57  * PORTABILITY RELEASE
58  * This version supports portable data files, usable on machines with
59  * differing native byte-orders.
60  * Also, this version compiles and runs on non-HPUX systems.  It has been
61  * tested on SunOS 4.? and ULTRIX 4.?, using SPARC and DEC 3100 hardware
62  * respectively.  Note that the data file format has rolled again.
63  *
64  * Revision 1.3  93/01/07  12:18:06  12:18:06  chip (Chip Chapin)
65  * Release 3.01: Greatly improved compression of concordance data file.
66  *
67 *
68 *******************************************************************************
69 */
70 
71 #include <stdio.h>
72 #include <errno.h>
73 #include <unistd.h>
74 #include <string.h>
75 #include <stdlib.h>
76 #include "tsl.h"
77 #include "util.h"
78 
79 #define FALSE	(0)
80 #define TRUE	(1)
81 
82 #define INDEXTMPF	"/tmp/mci"
83 #define DATATMPF	"/tmp/mcd"
84 
85 static void outerr(int n);
86 static void print_header(void);
87 
88 char *myname;			/* Program Name */
89 
90 /*
91   Structure of Input Data ...
92 
93     Input data is a sorted ASCII file.
94     Each line is a record.
95     Each record is a variable-length list of fields, separated by blanks,
96     as follows:
97 
98        <word> <ref> {<ref>...}
99 
100     where <word> is a lower-case alpha string, and <ref> is an integer
101     numeric string.  Each <word> is guaranteed to be unique.  <ref> is
102     expected to describe a location in the subject document where
103     <word> occurs, though this program doesn't actually care about
104     that.
105 
106   Structure of Output Data ...
107 
108     Please refer to description in tsl.h.
109  */
110 
111 struct tsl_conc_fileheader fh;
112 
113 
main(int argc,char ** argv)114 int main(int argc,char **argv)
115 /*----------------------------------------------------------------------
116 |   NAME:
117 |       main
118 |
119 |   ALGORITHM:
120 |       Main Program.
121 |
122 |   HISTORY:
123 |       921221 cc Initial creation.
124 |
125 \*----------------------------------------------------------------------*/
126 {
127     FILE	*outfp, *indexfp, *datafp;
128     int		ref;		/* NOT a ref_t, because it must be signed */
129     ref_t	rbuf[SELECTSZ];
130     unsigned char obuf[SELECTSZ*sizeof(ref_t)];
131     int		n, oinx, cnt;
132     unsigned char bhi, blo;
133     int		refs_left;
134     int		no_comp;
135     int		cmpnum=0;
136     int		base, curbase;
137     int		m_n, m_ref, m_cmpnum, m_cmpsize;
138     file_ptr_t	word_index, data_index;
139     int		entry_size;
140     short int	this_index;
141     Short_Univ_Int indextmp;
142 #define WRDSZ    100
143     char word[WRDSZ];
144 
145     myname = *argv++; argc--;	/* Program name */
146 
147     if (argc < 1) {
148 	fprintf( stderr, "%s: Missing output file name\n", myname );
149 	fprintf( stderr, "Usage: %s datafile\n", myname );
150 	exit( 1 );
151     }
152 
153     outfp = fopen( *argv, "w" );
154     if (outfp == NULL) {
155 	fprintf( stderr, "%s: Cannot open output file %s\n", myname, *argv );
156 	exit( 1 );
157     }
158     indexfp = fopen( INDEXTMPF, "w+" );
159     datafp = fopen( DATATMPF, "w+" );
160     if ((indexfp == NULL) || (datafp == NULL)) {
161 	fprintf( stderr, "%s: Cannot open temp files.\n", myname );
162 	exit( 1 );
163     }
164 
165     /* Initialize the header and write it out... */
166     fh.magic[0] = TSL_CONCMAGIC1;
167     fh.magic[1] = TSL_CONCMAGIC2;
168     fh.version[0] = TSL_CONCFVERSION1;
169     fh.version[1] = TSL_CONCFVERSION2;
170     strncpy( fh.name, "KJV Concordance", TSL_DESCRSZ );
171     univ_assign(fh.word_ptr, sizeof( fh ));
172     univ_assign(fh.word_cnt, 0);
173     univ_assign(fh.index_ptr, 0);
174     univ_assign(fh.data_ptr, 0);
175     fwrite( &fh, sizeof(fh), 1, outfp );
176 
177 
178     /* Process input data.
179        Each line consists of a word, followed by a list of numeric
180        references where the word occurs.  Separated by blanks.
181 
182        We will write the strings for the word list into the output file
183        as we go, meanwhile counting the words and building both the reference
184        index and the reference data pool in temp files.
185 
186        To save space, the reference index entries are not actually pointers
187        into the reference data pool, but are the SIZE of the entry's ref data
188        pool.
189 
190        So the offset in the ref data pool for entry N is the
191        sum over i=0..N-1 of index(i).
192 
193        To complicate things more, there is a special case when an
194        entry N has only a single ref.  In that case, the ref is NOT
195        entered into the ref data pool but the ref itself is stored as
196        index(N).  It is negated to distinguish it from a regular index
197        entry.
198 
199      */
200     word_index = 0;	/* The index of each word, 0..N */
201     data_index = 0;	/* The offset in the ref data pool of current entry */
202     while (scanf( "%s", word) > 0) {
203 	/* Append string to word list */
204 	if ((n=fputs( word, outfp )) <= 0)
205 	    outerr(n);
206 	putc( 0, outfp );
207 
208 	/* Start reading references */
209 	cnt = 0;
210 	while (scanf( "%d", &n) > 0) {
211 	    /* Read all refs into buffer */
212 	    rbuf[cnt++] = (ref_t) n;
213 	}
214 	if (cnt <= 0) {
215 	    /* this should never happen */
216 	    fprintf( stderr, "%s: Bad input format.  No refs for %s\n",
217 		    myname, word );
218 	}
219 
220 	entry_size = 0;		/* size (bytes) of current ref data entry */
221 	if (cnt == 1) {
222 	    /* Special handling for this case: word has a SINGLE REFERENCE.
223 	       Instead of putting the ref into the data pool, just keep it
224 	       in the index.  Encode it by negating it.
225 	     */
226 	    this_index = -rbuf[0];
227 	} else {
228 	    /* Write reference list to reference data pool */
229 	    for (n=oinx=0, refs_left=cnt; refs_left--; n++) {
230 		ref = rbuf[n];
231 		no_comp = TRUE;
232 		if (refs_left && (rbuf[n+1]-ref <= 16)) {
233 		    /* We can compress at least one (though possibly to
234 		       no advantage).  How many more might we compress?
235 		     */
236 		    cmpnum=1;
237 		    while ((cmpnum < refs_left) &&
238 			   (rbuf[n+cmpnum+1]-rbuf[n+cmpnum] <= 16)) {
239 			cmpnum++;
240 		    }
241 
242 		    /*
243 		       How much will we really save?
244 		       Model the actual algorithm and compare...
245 		     */
246 		    m_cmpsize = 0;
247 		    curbase = ref;
248 		    m_n = n;
249 		    m_ref = rbuf[++m_n]-curbase-1;
250 		    m_cmpnum = cmpnum-1;
251 		    while (m_ref >= 0) {
252 			/* For each byte in bitmap... */
253 			while ((m_ref < 8) && (m_ref >= 0)) {
254 			    /* For each ref that fits in this byte... */
255 			    if (m_cmpnum > 0) {
256 				m_cmpnum--;
257 				m_ref = rbuf[++m_n]-curbase-1;
258 			    } else {
259 				m_ref = -1;	/* term. flag, both loops */
260 			    }
261 			}
262 			m_cmpsize++;
263 			curbase += 8;
264 			m_ref -= 8;
265 		    }
266 		    m_cmpsize += 2;		/* Add terminator size */
267 
268 		    /* Well, did it pay off? */
269 		    no_comp = (m_cmpsize > (cmpnum*(int)sizeof(ref_t)));
270 		}
271 		if (no_comp) {
272 		    /* No compression */
273 		    bhi = (ref >> 8);
274 		    blo = ref & 0x00ff;
275 		    obuf[oinx++] = bhi;
276 		    obuf[oinx++] = blo;
277 		} else {
278 		    /* Do the compression.
279 		       Format:
280 		         <base address><bitmap>
281 		       Where <base address> is the NEGATED reference to the
282 		       line BEFORE the start of the bitmap refs.
283 		     */
284 		    /* Write base address */
285 		    base = ref;
286 		    ref = -ref;
287 		    bhi = (ref >> 8);
288 		    blo = ref & 0x00ff;
289 		    obuf[oinx++] = bhi;
290 		    obuf[oinx++] = blo;
291 
292 		    /* Write bitmap.
293 		         ref is always with respect to curbase.
294 			 It is the number of verses, MINUS ONE, that
295 			 separate them.  The MINUS ONE is so the
296 			 left-shift works correctly.
297 		     */
298 		    curbase = base;
299 		    ref = rbuf[++n]-curbase-1;
300 		    refs_left -= cmpnum;
301 		    cmpnum--;
302 		    while (ref >= 0) {
303 			/* For each byte in bitmap... */
304 			blo = 0;
305 			while ((ref < 8) && (ref >= 0)) {
306 			    /* For each ref that fits in this byte... */
307 			    blo |= 0x01 << ref;
308 			    if (cmpnum > 0) {
309 				cmpnum--;
310 				ref = rbuf[++n]-curbase-1;
311 			    } else {
312 				ref = -1;	/* term. flag, both loops */
313 			    }
314 			}
315 			obuf[oinx++] = blo;
316 			curbase += 8;
317 			ref -= 8;
318 		    }
319 
320 		    /* Write TWO terminator bytes */
321 		    /* Might be better to use ONE, and change the requirement
322 		       to being within 8 lines, not 16 */
323 		    obuf[oinx++] = 0;
324 		    obuf[oinx++] = 0;
325 		} /* compression */
326 	    } /* for */
327 	    if ((n=fwrite( obuf, 1, oinx, datafp )) <= 0)
328 		outerr(n);
329 
330 	    entry_size = oinx;
331 	    if (entry_size > 32767) {
332 		fprintf( stderr, "%s: Too many entries for \"%s\"!\n",
333 			myname, word );
334 		exit(1);
335 	    }
336 	    this_index = entry_size;
337 	} /* write ref list */
338 
339 	/* Write the index entry for this word */
340 	shortuniv_assign( indextmp, this_index );
341 	if ((n=fwrite( indextmp, sizeof(Short_Univ_Int), 1, indexfp )) <= 0)
342 	    outerr(n);
343 
344 	data_index += entry_size;
345 	word_index++;
346     }
347 
348     /* It makes searching easier if we guarantee an ending string */
349     if ((n=fprintf( outfp, "~" )) <= 0)
350 	outerr(n);
351     shortuniv_assign( indextmp, 0 );
352     if ((n=fwrite( indextmp, sizeof(Short_Univ_Int), 1, indexfp )) <= 0)
353 	outerr(n);
354 
355     /* OK, now finish putting the output file together. */
356     /* First, another header field */
357     univ_assign(fh.word_cnt, word_index);
358 
359     /* Now copy the reference index from temp file to output file */
360     univ_assign(fh.index_ptr, ftell( outfp ));
361     rewind( indexfp );
362     while (fread( obuf, sizeof(Short_Univ_Int), 1, indexfp ) > 0) {
363 	fwrite( obuf, sizeof(Short_Univ_Int), 1, outfp );
364     }
365 
366     /* Then copy the reference data from temp file to output file */
367     univ_assign(fh.data_ptr, ftell( outfp ));
368     rewind( datafp );
369     while (fread( obuf, 1, 1, datafp ) > 0) {
370 	fwrite( obuf, 1, 1, outfp );
371     }
372 
373     /* Finally, write out the updated file header */
374     rewind( outfp );
375     fwrite( &fh, sizeof(fh), 1, outfp );
376 
377     /* Prove that we've been working hard... */
378     print_header();
379 
380     fclose(outfp);
381     fclose(indexfp);
382     fclose(datafp);
383     unlink( INDEXTMPF );
384     unlink( DATATMPF );
385     return(0);
386 } /* main */
387 
388 
outerr(int n)389 static void outerr(int n)
390 {
391     fprintf( stderr, "%s: File I/O error #%d, %d\n", __FILE__, n, errno );
392 }
393 
394 
print_header(void)395 static void print_header(void)
396 {
397     printf( "Concordance data file:\n" );
398     printf( "  Version:  %c%c\n", fh.version[0], fh.version[1] );
399     printf( "  Name:     %s\n", fh.name );
400     printf( "  Contents: %d words\n", univ2int(fh.word_cnt) );
401     printf( "  Word list at file offset %d\n", univ2int(fh.word_ptr) );
402     printf( "  Index at file offset %d\n", univ2int(fh.index_ptr) );
403     printf( "  Data at file offset %d\n", univ2int(fh.data_ptr) );
404 }
405