1 /*===========================================================================
2  Copyright (c) 1998-2000, The Santa Cruz Operation
3  All rights reserved.
4 
5  Redistribution and use in source and binary forms, with or without
6  modification, are permitted provided that the following conditions are met:
7 
8  *Redistributions of source code must retain the above copyright notice,
9  this list of conditions and the following disclaimer.
10 
11  *Redistributions in binary form must reproduce the above copyright notice,
12  this list of conditions and the following disclaimer in the documentation
13  and/or other materials provided with the distribution.
14 
15  *Neither name of The Santa Cruz Operation nor the names of its contributors
16  may be used to endorse or promote products derived from this software
17  without specific prior written permission.
18 
19  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS
20  IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21  THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE
23  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  INTERRUPTION)
27  HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
30  DAMAGE.
31  =========================================================================*/
32 
33 
34 #include <ctype.h>
35 #include <stdio.h>
36 #include <stdlib.h>
37 #if SHARE
38 #include <sys/types.h>
39 #include <sys/ipc.h>
40 #include <sys/shm.h>
41 #define ERR  -1
42 #endif
43 #include "invlib.h"
44 #include "global.h"
45 
46 #include <assert.h>
47 
48 #define	DEBUG		0	/* debugging code and realloc messages */
49 #define BLOCKSIZE	2 * BUFSIZ	/* logical block size */
50 #define	POSTINC		10000	/* posting buffer size increment */
51 #define SEP		' '	/* sorted posting field separator */
52 #define	SETINC		100	/* posting set size increment */
53 #define	STATS		0	/* print statistics */
54 #define	SUPERINC	10000	/* super index size increment */
55 #define	TERMMAX		512	/* term max size */
56 #define	FMTVERSION	1	/* inverted index format version */
57 #define	ZIPFSIZE	200	/* zipf curve size */
58 
59 #if DEBUG
60 /* FIXME HBB 20010705: nowhere in the source is `invbreak' ever set to
61  * a value other than the (silent) initialization to zero. Pretty
62  * useless, that looks */
63 int	invbreak;
64 #endif
65 
66 static	int	boolready(void);
67 static	int	invnewterm(void);
68 static	void	invstep(INVCONTROL *invcntl);
69 static	void	invcannotalloc(unsigned n);
70 static	void	invcannotopen(char *file);
71 static	void	invcannotwrite(char *file);
72 
73 #if STATS
74 int	showzipf;	/* show postings per term distribution */
75 #endif
76 
77 static	POSTING	*item, *enditem, *item1 = NULL, *item2 = NULL;
78 static	unsigned int setsize1, setsize2;
79 static	long	numitems, totterm, zerolong;
80 static	char	*indexfile, *postingfile;
81 static	FILE	*outfile, *fpost;
82 static	size_t supersize = SUPERINC, supintsize;
83 static  unsigned int numpost, numlogblk, amtused, nextpost;
84 static  unsigned int lastinblk, numinvitems;
85 static	POSTING	*POST, *postptr;
86 static	unsigned long	*SUPINT, *supint, nextsupfing;
87 static	char	*SUPFING, *supfing;
88 static	char	thisterm[TERMMAX];
89 typedef union logicalblk {
90 	long	invblk[BLOCKSIZE / sizeof(long)];
91 	char	chrblk[BLOCKSIZE];
92 } t_logicalblk;
93 static t_logicalblk logicalblk;
94 
95 #if DEBUG || STATS
96 static	long	totpost;
97 #endif
98 
99 #if STATS
100 static	int	zipf[ZIPFSIZE + 1];
101 #endif
102 
103 long
invmake(char * invname,char * invpost,FILE * infile)104 invmake(char *invname, char *invpost, FILE *infile)
105 {
106 	char	*s;
107 	long	num;
108 	int	i;
109 	long	fileindex = 0;	/* initialze, to avoid warning */
110 	unsigned postsize = POSTINC * sizeof(*POST);
111 	unsigned long	*intptr;
112 	char	line[TERMMAX];
113 	long	tlong;
114 	PARAM	param;
115 	POSTING	posting;
116 	char 	temp[BLOCKSIZE];
117 #if STATS
118 	int	j;
119 	unsigned maxtermlen = 0;
120 #endif
121 	/* output file */
122 	if ((outfile = vpfopen(invname, "w+b")) == NULL) {
123 		invcannotopen(invname);
124 		return(0);
125 	}
126 	indexfile = invname;
127 	fseek(outfile, BUFSIZ, SEEK_SET);
128 
129 	/* posting file  */
130 	if ((fpost = vpfopen(invpost, "wb")) == NULL) {
131 		invcannotopen(invpost);
132 		return(0);
133 	}
134 	postingfile = invpost;
135 	nextpost = 0;
136 	/* get space for the postings list */
137 	if ((POST = malloc(postsize)) == NULL) {
138 		invcannotalloc(postsize);
139 		return(0);
140 	}
141 	postptr = POST;
142 	/* get space for the superfinger (superindex) */
143 	if ((SUPFING = malloc(supersize)) == NULL) {
144 		invcannotalloc(supersize);
145 		return(0);
146 	}
147 	supfing = SUPFING;
148 	/* FIXME HBB: magic number alert (40) */
149 	supintsize = supersize / 40u;
150 	/* also for the superfinger index */
151 	if ((SUPINT = malloc(supintsize * sizeof(*SUPINT))) == NULL) {
152 		invcannotalloc(supintsize * sizeof(*SUPINT));
153 		return(0);
154 	}
155 	supint = SUPINT;
156 	supint++; /* leave first term open for a count */
157 	/* initialize using an empty term */
158 	strcpy(thisterm, "");
159 	*supint++ = 0;
160 	*supfing++ = ' ';
161 	*supfing++ = '\0';
162 	nextsupfing = 2;
163 #if DEBUG || STATS
164 	totpost = 0L;
165 #endif
166 	totterm = 0L;
167 	numpost = 1;
168 
169 	/* set up as though a block had come and gone, i.e., set up for new block  */
170 	/* 3 longs needed for: numinvitems, next block, and previous block */
171 	amtused = 3 * sizeof(long);
172 	numinvitems = 0;
173 	numlogblk = 0;
174 	lastinblk = sizeof(t_logicalblk);
175 
176 	/* now loop as long as more to read (till eof)  */
177 	while (fgets(line, TERMMAX, infile) != NULL) {
178 #if DEBUG || STATS
179 		++totpost;
180 #endif
181 		s = strchr(line, SEP);
182 		if (s != NULL) {
183 			*s = '\0';
184 		}
185 		else {
186 			continue;
187 		}
188 #if STATS
189 		if ((i = strlen(line)) > maxtermlen) {
190 			maxtermlen = i;
191 		}
192 #endif
193 #if DEBUG
194 		printf("%ld: %s ", totpost, line);
195 		fflush(stdout);
196 #endif
197 		if (strcmp(thisterm, line) == 0) {
198 			if ((postptr + 10) > (POST + (postsize / sizeof(*POST)))) {
199 				i = postptr - POST;
200 				postsize += POSTINC * sizeof(*POST);
201 				if ((POST = realloc(POST, postsize)) == NULL) {
202 					invcannotalloc(postsize);
203 					return(0);
204 				}
205 				postptr = i + POST;
206 #if DEBUG
207 				printf("reallocated post space to %u, totpost=%ld\n",
208 				       postsize, totpost);
209 #endif
210 			}
211 			numpost++;
212 		} else {
213 			/* have a new term */
214 			if (!invnewterm()) {
215 				return(0);
216 			}
217 			strcpy(thisterm, line);
218 			numpost = 1;
219 			postptr = POST;
220 			fileindex = 0;
221 		}
222 		/* get the new posting */
223 		num = *++s - '!';
224 		i = 1;
225 		do {
226 			num = BASE * num + *++s - '!';
227 		} while (++i < PRECISION);
228 		posting.lineoffset = num;
229 		while (++fileindex < nsrcfiles && num > srcoffset[fileindex]) {
230 			;
231 		}
232 		posting.fileindex = --fileindex;
233 		posting.type = *++s;
234 		++s;
235 		if (*s != '\n') {
236 			num = *++s - '!';
237 			while (*++s != '\n') {
238 				num = BASE * num + *s - '!';
239 			}
240 			posting.fcnoffset = num;
241 		}
242 		else {
243 			posting.fcnoffset = 0;
244 		}
245 		*postptr++ = posting;
246 #if DEBUG
247 		printf("%ld %ld %ld %ld\n", posting.fileindex,
248 		       posting.fcnoffset, posting.lineoffset, posting.type);
249 		fflush(stdout);
250 #endif
251 	}
252 	if (!invnewterm()) {
253 		return(0);
254 	}
255 	/* now clean up final block  */
256 	logicalblk.invblk[0] = numinvitems;
257 	/* loops pointer around to start */
258 	logicalblk.invblk[1] = 0;
259 	logicalblk.invblk[2] = numlogblk - 1;
260 	if (fwrite(&logicalblk, sizeof(t_logicalblk), 1, outfile) == 0) {
261 		goto cannotwrite;
262 	}
263 	numlogblk++;
264 	/* write out block to save space. what in it doesn't matter */
265 	if (fwrite(&logicalblk, sizeof(t_logicalblk), 1, outfile) == 0) {
266 		goto cannotwrite;
267 	}
268 	/* finish up the super finger */
269 	*SUPINT = numlogblk;
270 	/* add to the offsets the size of the offset pointers */
271 	intptr = (SUPINT + 1);
272 	i = (char *)supint - (char *)SUPINT;
273 	while (intptr < supint)
274 		*intptr++ += i;
275 	/* write out the offsets (1 for the N at start) and the super finger */
276 	if (fwrite(SUPINT, sizeof(*SUPINT), numlogblk + 1, outfile) == 0 ||
277 	    fwrite(SUPFING, 1, supfing - SUPFING, outfile) == 0) {
278 		goto cannotwrite;
279 	}
280 	/* save the size for reference later */
281 	nextsupfing = sizeof(long) + sizeof(long) * numlogblk + (supfing - SUPFING);
282 	/* make sure the file ends at a logical block boundary.  This is
283 	necessary for invinsert to correctly create extended blocks
284 	 */
285 	i = nextsupfing % sizeof(t_logicalblk);
286 	/* write out junk to fill log blk */
287 	if (fwrite(temp, sizeof(t_logicalblk) - i, 1, outfile) == 0 ||
288 	    fflush(outfile) == EOF) {	/* rewind doesn't check for write failure */
289 		goto cannotwrite;
290 	}
291 	/* write the control area */
292 	rewind(outfile);
293 	param.version = FMTVERSION;
294 	param.filestat = 0;
295 	param.sizeblk = sizeof(t_logicalblk);
296 	param.startbyte = (numlogblk + 1) * sizeof(t_logicalblk) + BUFSIZ;;
297 	param.supsize = nextsupfing;
298 	param.cntlsize = BUFSIZ;
299 	param.share = 0;
300 	if (fwrite(&param, sizeof(param), 1, outfile) == 0) {
301 		goto cannotwrite;
302 	}
303 	for (i = 0; i < 10; i++)	/* for future use */
304 		if (fwrite(&zerolong, sizeof(zerolong), 1, outfile) == 0) {
305 			goto cannotwrite;
306 		}
307 
308 	/* make first block loop backwards to last block */
309 	if (fflush(outfile) == EOF) {	/* fseek doesn't check for write failure */
310 		goto cannotwrite;
311 	}
312 	/* get to second word first block */
313 	fseek(outfile, BUFSIZ + 2 * sizeof(long), SEEK_SET);
314 	tlong = numlogblk - 1;
315 	if (fwrite(&tlong, sizeof(tlong), 1, outfile) == 0 ||
316 	    fclose(outfile) == EOF) {
317 	cannotwrite:
318 		invcannotwrite(invname);
319 		return(0);
320 	}
321 	if (fclose(fpost) == EOF) {
322 		invcannotwrite(postingfile);
323 		return(0);
324 	}
325 	--totterm;	/* don't count null term */
326 #if STATS
327 	printf("logical blocks = %d, postings = %ld, terms = %ld, max term length = %d\n",
328 	    numlogblk, totpost, totterm, maxtermlen);
329 	if (showzipf) {
330 		printf("\n*************   ZIPF curve  ****************\n");
331 		for (j = ZIPFSIZE; j > 1; j--)
332 			if (zipf[j])
333 				break;
334 		for (i = 1; i < j; ++i) {
335 			printf("%3d -%6d ", i, zipf[i]);
336 			if (i % 6 == 0) putchar('\n');
337 		}
338 		printf(">%d-%6d\n", ZIPFSIZE, zipf[0]);
339 	}
340 #endif
341 	/* free all malloc'd memory */
342 	free(POST);
343 	free(SUPFING);
344 	free(SUPINT);
345 	return(totterm);
346 }
347 
348 /* add a term to the data base */
349 
350 static int
invnewterm(void)351 invnewterm(void)
352 {
353     int	backupflag, i, j, holditems, gooditems, howfar;
354     unsigned int maxback, len, numwilluse, wdlen;
355     char	*tptr, *tptr3;
356 
357     union {
358 	unsigned long	packword[2];
359 	ENTRY		e;
360     } iteminfo;
361 
362     gooditems = 0;		/* initialize, to avoid warning */
363     totterm++;
364 #if STATS
365     /* keep zipfian info on the distribution */
366     if (numpost <= ZIPFSIZE)
367 	zipf[numpost]++;
368     else
369 	zipf[0]++;
370 #endif
371     len = strlen(thisterm);
372     /* length of term rounded up to long boundary */
373     wdlen = (len + (sizeof(long) - 1)) / sizeof(long);
374     /* each term needs 2 longs for its iteminfo and
375      * 1 long for its offset */
376     numwilluse = (wdlen + 3) * sizeof(long);
377     /* new block if at least 1 item in block */
378     if (numinvitems && numwilluse + amtused > sizeof(t_logicalblk)) {
379 	/* set up new block */
380 	if (supfing + 500u > SUPFING + supersize) {
381 	    i = supfing - SUPFING;
382 	    supersize += 20000u;
383 	    if ((SUPFING = realloc(SUPFING, supersize)) == NULL) {
384 		invcannotalloc(supersize);
385 		return(0);
386 	    }
387 	    supfing = i + SUPFING;
388 #if DEBUG
389 	    printf("reallocated superfinger space to %d, totpost=%ld\n",
390 		   supersize, totpost);
391 #endif
392 	}
393 	/* check that room for the offset as well */
394 	/* FIXME HBB: magic number alert (10) */
395 	if ((numlogblk + 10) > supintsize) {
396 	    i = supint - SUPINT;
397 	    supintsize += SUPERINC;
398 	    if ((SUPINT = realloc(SUPINT, supintsize * sizeof(*SUPINT))) == NULL) {
399 		invcannotalloc(supintsize * sizeof(*SUPINT));
400 		return(0);
401 	    }
402 	    supint = i + SUPINT;
403 #if DEBUG
404 	    printf("reallocated superfinger offset to %d, totpost = %ld\n", supintsize * sizeof(*SUPINT), totpost);
405 #endif
406 	}
407 	/* See if backup is efficatious  */
408 	backupflag = 0;
409 	maxback = (int) strlen(thisterm) / 10;
410 	holditems = numinvitems;
411 	if (maxback > numinvitems)
412 	    maxback = numinvitems - 2;
413 	howfar = 0;
414 	while (maxback-- > 1) {
415 	    howfar++;
416 	    iteminfo.packword[0] =
417 		logicalblk.invblk[--holditems * 2 + (sizeof(long) - 1)];
418 	    if ((i = iteminfo.e.size / 10) < maxback) {
419 		maxback = i;
420 		backupflag = howfar;
421 		gooditems = holditems;
422 	    }
423 	}
424 	/* see if backup will occur  */
425 	if (backupflag) {
426 	    numinvitems = gooditems;
427 	}
428 	logicalblk.invblk[0] = numinvitems;
429 	/* set forward pointer pointing to next */
430 	logicalblk.invblk[1] = numlogblk + 1;
431 	/* set back pointer to last block */
432 	logicalblk.invblk[2] = numlogblk - 1;
433 	if (fwrite(logicalblk.chrblk, 1, sizeof(t_logicalblk), outfile) == 0) {
434 	    invcannotwrite(indexfile);
435 	    return(0);
436 	}
437 	/* 3 longs needed for: numinvitems, next block, and previous block */
438 	amtused = 3 * sizeof(long);
439 	numlogblk++;
440 	/* check if had to back up, if so do it */
441 	if (backupflag) {
442 	    char *tptr2;
443 
444 	    /* find out where the end of the new block is */
445 	    iteminfo.packword[0] = logicalblk.invblk[numinvitems*2+1];
446 	    tptr3 = logicalblk.chrblk + iteminfo.e.offset;
447 	    /* move the index for this block */
448 	    for (i = 3; i <= (backupflag * 2 + 2); i++)
449 		logicalblk.invblk[i] = logicalblk.invblk[numinvitems*2+i];
450 	    /* move the word into the super index */
451 	    iteminfo.packword[0] = logicalblk.invblk[3];
452 	    iteminfo.packword[1] = logicalblk.invblk[4];
453 	    tptr2 = logicalblk.chrblk + iteminfo.e.offset;
454 	    strncpy(supfing, tptr2, (int) iteminfo.e.size);
455 	    *(supfing + iteminfo.e.size) = '\0';
456 #if DEBUG
457 	    printf("backup %d at term=%s to term=%s\n",
458 		   backupflag, thisterm, supfing);
459 #endif
460 	    *supint++ = nextsupfing;
461 	    nextsupfing += strlen(supfing) + 1;
462 	    supfing += strlen(supfing) + 1;
463 	    /* now fix up the logical block */
464 	    tptr = logicalblk.chrblk + lastinblk;
465 	    lastinblk = sizeof(t_logicalblk);
466 	    tptr2 = logicalblk.chrblk + lastinblk;
467 	    j = tptr3 - tptr;
468 	    while (tptr3 > tptr)
469 		*--tptr2 = *--tptr3;
470 	    lastinblk -= j;
471 	    amtused += ((2 * sizeof(long)) * backupflag + j);
472 	    for (i = 3; i < (backupflag * 2 + 2); i += 2) {
473 		iteminfo.packword[0] = logicalblk.invblk[i];
474 		iteminfo.e.offset += (tptr2 - tptr3);
475 		logicalblk.invblk[i] = iteminfo.packword[0];
476 	    }
477 	    numinvitems = backupflag;
478 	} else { /* no backup needed */
479 	    numinvitems = 0;
480 	    lastinblk = sizeof(t_logicalblk);
481 	    /* add new term to superindex */
482 	    strcpy(supfing, thisterm);
483 	    supfing += strlen(thisterm) + 1;
484 	    *supint++ = nextsupfing;
485 	    nextsupfing += strlen(thisterm) + 1;
486 	}
487     }
488     /* HBB 20010501: Fixed bug by replacing magic number '8' by
489      * what it actually represents. */
490     lastinblk -= (numwilluse - 2 * sizeof(long));
491     iteminfo.e.offset = lastinblk;
492     iteminfo.e.size = len;
493     iteminfo.e.space = 0;
494     iteminfo.e.post = numpost;
495     strncpy(logicalblk.chrblk + lastinblk, thisterm, len);
496     amtused += numwilluse;
497     logicalblk.invblk[(lastinblk/sizeof(long))+wdlen] = nextpost;
498     if ((i = postptr - POST) > 0) {
499 	if (fwrite(POST, sizeof(*POST), i, fpost) == 0) {
500 	    invcannotwrite(postingfile);
501 	    return(0);
502 	}
503 	nextpost += i * sizeof(*POST);
504     }
505     logicalblk.invblk[3+2*numinvitems++] = iteminfo.packword[0];
506     logicalblk.invblk[2+2*numinvitems] = iteminfo.packword[1];
507     return(1);
508 }
509 
510 /*
511  * If 'invname' ends with the 'from' substring, it is replaced inline with the
512  * 'to' substring (which must be of the exact same length), and the function
513  * returns 0. Otherwise, returns -1.
514  */
515 
516 static int
invflipname(char * invname,const char * from,const char * to)517 invflipname(char * invname, const char *from, const char *to)
518 {
519 	char *temp, *i = NULL;
520 
521 	assert(strlen(from) == strlen(to));
522 
523 	temp = invname - 1;
524 	while( (temp = strstr(temp + 1, from)))
525 		i = temp;
526 	if (!i || i[strlen(from)] != '\0')
527 		return -1;
528 	while(*to)
529 		*i++ = *to++;
530 	return 0;
531 }
532 
533 /* small helper function to centralize handling of binary opening
534  * for reading, and use of the 'stat" flag */
535 static FILE *
open_for_reading(char * name,int stat)536 open_for_reading(char *name, int stat)
537 {
538 	return vpfopen(name, ((stat == 0) ? "rb" : "r+b"));
539 }
540 
541 /* handle opening of a file under a possibly "flipped" name */
542 /* If db created without '-f', but now invoked with '-f cscope.out',
543  * we need to check for 'cscope.in.out', rather than 'cscope.out.in':
544  * I.e, hack around our own violation of the inverse db naming convention */
545 /* more silliness: if you create the db with '-f cscope', then try to open
546  * it without '-f cscope', you'll fail unless we check for 'cscope.out.in'
547  * here. */
548 static FILE *
open_file_with_flipped_name(char * name,const char * flip_in,const char * flip_out,int stat)549 open_file_with_flipped_name(char *name, const char *flip_in, const char *flip_out, int stat)
550 {
551 	if (! invflipname(name, flip_in, flip_out)) {
552 		FILE *fptr = open_for_reading(name, stat);
553 		if (! fptr)
554 			/* flip back for error message */
555 			invflipname(name, flip_out, flip_in);
556 		return fptr;
557 	};
558 	return 0;
559 }
560 
561 static FILE *
open_file_with_possibly_flipped_name(char * name,const char * flip1,const char * flip2,int stat)562 open_file_with_possibly_flipped_name(char *name, const char *flip1, const char *flip2, int stat)
563 {
564 	FILE *fptr = open_for_reading(name, stat);
565 
566 	if (! fptr)
567 		fptr = open_file_with_flipped_name(name, flip2, flip1, stat);
568 	if (! fptr)
569 		fptr = open_file_with_flipped_name(name, flip1, flip2, stat);
570 	return fptr;
571 }
572 
573 int
invopen(INVCONTROL * invcntl,char * invname,char * invpost,int stat)574 invopen(INVCONTROL *invcntl, char *invname, char *invpost, int stat)
575 {
576 	int	read_index;
577 
578 	invcntl->invfile = open_file_with_possibly_flipped_name(invname, INVNAME, INVNAME2, stat);
579 	if (! invcntl->invfile) {
580 		invcannotopen(invname);
581 		return(-1);
582 	}
583 	if (fread(&invcntl->param, sizeof(invcntl->param), 1, invcntl->invfile) == 0) {
584 		fprintf(stderr, "%s: empty inverted file\n", argv0);
585 		fclose(invcntl->invfile);
586 		return(-1);
587 	}
588 	if (invcntl->param.version != FMTVERSION) {
589 		fprintf(stderr, "%s: cannot read old index format; use -U option to force database to rebuild\n", argv0);
590 		fclose(invcntl->invfile);
591 		return(-1);
592 	}
593 	assert(invcntl->param.sizeblk == sizeof(t_logicalblk));
594 
595 	if (stat == 0 && invcntl->param.filestat == INVALONE) {
596 		fprintf(stderr, "%s: inverted file is locked\n", argv0);
597 		fclose(invcntl->invfile);
598 		return(-1);
599 	}
600 
601 	invcntl->postfile = open_file_with_possibly_flipped_name(invpost, INVPOST, INVPOST2, stat);
602 	if (! invcntl->postfile) {
603 		invcannotopen(invpost);
604 		fclose(invcntl->invfile);
605 		return(-1);
606 	}
607 
608 	/* allocate core for a logical block  */
609 	if ((invcntl->logblk = malloc((size_t) invcntl->param.sizeblk)) == NULL) {
610 		invcannotalloc((size_t) invcntl->param.sizeblk);
611 		fclose(invcntl->postfile);
612 		fclose(invcntl->invfile);
613 		return(-1);
614 	}
615 	/* allocate for and read in superfinger  */
616 	read_index = 1;
617 	invcntl->iindex = NULL;
618 #if SHARE
619 	if (invcntl->param.share == 1) {
620 		key_t shm_key;
621 		struct shmid_ds shm_buf;
622 		int	shm_id;
623 
624 		/* see if the shared segment exists */
625 		shm_key = ftok(invname, 2);
626 		shm_id = shmget(shm_key, 0, 0);
627 		/* Failure simply means (hopefully) that segment doesn't exists */
628 		if (shm_id == -1) {
629 			/* Have to give general write permission due to AMdahl not having protected segments */
630 			shm_id = shmget(shm_key, invcntl->param.supsize + sizeof(long), IPC_CREAT | 0666);
631 			if (shm_id == -1)
632 				perror("Could not create shared memory segment");
633 		} else
634 			read_index = 0;
635 
636 		if (shm_id != -1) {
637 			invcntl->iindex = shmat(shm_id, 0, ((read_index) ? 0 : SHM_RDONLY));
638 			if (invcntl->iindex == (char *)ERR) {
639 				fprintf(stderr, "%s: shared memory link failed\n", argv0);
640 				invcntl->iindex = NULL;
641 				read_index = 1;
642 			}
643 		}
644 	}
645 #endif
646 	if (invcntl->iindex == NULL)
647 	        /* FIXME HBB: magic number alert (4, sizeof(long)) */
648 		invcntl->iindex = malloc((size_t) invcntl->param.supsize + 4 *sizeof(long));
649 	if (invcntl->iindex == NULL) {
650 		invcannotalloc((size_t) invcntl->param.supsize);
651 		free(invcntl->logblk);
652 		fclose(invcntl->postfile);
653 		fclose(invcntl->invfile);
654 		return(-1);
655 	}
656 	if (read_index) {
657 		fseek(invcntl->invfile, invcntl->param.startbyte, SEEK_SET);
658 		fread(invcntl->iindex, (int) invcntl->param.supsize, 1,
659 		      invcntl->invfile);
660 	}
661 	invcntl->numblk = -1;
662 	if (boolready() == -1) {
663 		fclose(invcntl->postfile);
664 		fclose(invcntl->invfile);
665 		return(-1);
666 	}
667 	/* write back out the control block if anything changed */
668 	invcntl->param.filestat = stat;
669 	if (stat > invcntl->param.filestat ) {
670 		rewind(invcntl->invfile);
671 		fwrite(&invcntl->param, sizeof(invcntl->param), 1, invcntl->invfile);
672 	}
673 	return(1);
674 }
675 
676 /** invclose must be called to wrap things up and deallocate core  **/
677 void
invclose(INVCONTROL * invcntl)678 invclose(INVCONTROL *invcntl)
679 {
680 	/* write out the control block in case anything changed */
681 	if (invcntl->param.filestat > 0) {
682 		invcntl->param.filestat = 0;
683 		rewind(invcntl->invfile);
684 		fwrite(&invcntl->param, 1,
685 		    sizeof(invcntl->param), invcntl->invfile);
686 	}
687 	if (invcntl->param.filestat == INVALONE) {
688 		/* write out the super finger */
689 		fseek(invcntl->invfile, invcntl->param.startbyte, SEEK_SET);
690 		fwrite(invcntl->iindex, 1,
691 		       (int) invcntl->param.supsize, invcntl->invfile);
692 	}
693 	fclose(invcntl->invfile);
694 	fclose(invcntl->postfile);
695 #if SHARE
696 	if (invcntl->param.share > 0) {
697 		shmdt(invcntl->iindex);
698 		invcntl->iindex = NULL;
699 	}
700 #endif
701 	if (invcntl->iindex != NULL)
702 		free(invcntl->iindex);
703 	free(invcntl->logblk);
704 }
705 
706 /** invstep steps the inverted file forward one item **/
707 static void
invstep(INVCONTROL * invcntl)708 invstep(INVCONTROL *invcntl)
709 {
710 	if (invcntl->keypnt < (invcntl->logblk->invblk[0] - 1)) {
711 		invcntl->keypnt++;
712 		return;
713 	}
714 
715 	/* move forward a block else wrap */
716 	invcntl->numblk = invcntl->logblk->invblk[1]; /* was: *(int *)(invcntl->logblk + sizeof(long))*/
717 
718 	/* now read in the block  */
719 	fseek(invcntl->invfile,
720 	      invcntl->numblk*invcntl->param.sizeblk + invcntl->param.cntlsize,
721 	      SEEK_SET);
722 	fread(invcntl->logblk, (int) invcntl->param.sizeblk, 1,
723 	      invcntl->invfile);
724 	invcntl->keypnt = 0;
725 }
726 
727 /** invforward moves forward one term in the inverted file  **/
728 int
invforward(INVCONTROL * invcntl)729 invforward(INVCONTROL *invcntl)
730 {
731 	invstep(invcntl);
732 	/* skip things with 0 postings */
733 	/* FIXME HBB: magic number alert! (3) */
734 	while (((ENTRY * )(invcntl->logblk->invblk + 3) + invcntl->keypnt)->post == 0) {
735 		invstep(invcntl);
736 	}
737 	/* Check for having wrapped - reached start of inverted file! */
738 	if ((invcntl->numblk == 0) && (invcntl->keypnt == 0))
739 		return(0);
740 	return(1);
741 }
742 
743 /**  invterm gets the present term from the present logical block  **/
744 long
invterm(INVCONTROL * invcntl,char * term)745 invterm(INVCONTROL *invcntl, char *term)
746 {
747 	ENTRY * entryptr;
748 
749 	/* FIXME HBB: magic number alert! (3) */
750 	entryptr = (ENTRY *)(invcntl->logblk->invblk + 3) + invcntl->keypnt;
751 	strncpy(term, invcntl->logblk->chrblk + entryptr->offset,
752 		       (int) entryptr->size);
753 	*(term + entryptr->size) = '\0';
754 	return(entryptr->post);
755 }
756 
757 /** invfind searches for an individual item in the inverted file  **/
758 long
invfind(INVCONTROL * invcntl,char * searchterm)759 invfind(INVCONTROL *invcntl, char *searchterm) /* term being searched for  */
760 {
761 	int	imid, ilow, ihigh;
762 	long	num;
763 	int	i;
764 	unsigned long	*intptr, *intptr2;
765 	ENTRY *entryptr;
766 
767 	/* make sure it is initialized via invready  */
768 	if (invcntl->invfile == 0)
769 		return(-1L);
770 
771 	/* now search for the appropriate finger block */
772 	intptr = (unsigned long *)invcntl->iindex;
773 
774 	ilow = 0;
775 	ihigh = *intptr++ - 1;
776 	while (ilow <= ihigh) {
777 		imid = (ilow + ihigh) / 2;
778 		intptr2 = intptr + imid;
779 		i = strcmp(searchterm, (invcntl->iindex + *intptr2));
780 		if (i < 0)
781 			ihigh = imid - 1;
782 		else if (i > 0)
783 			ilow = ++imid;
784 		else {
785 			ilow = imid + 1;
786 			break;
787 		}
788 	}
789 	/* be careful about case where searchterm is after last in this block  */
790 	imid = (ilow) ? ilow - 1 : 0;
791 
792 	/* fetch the appropriate logical block if not in core  */
793 	/* note always fetch it if the file is busy */
794 	if ((imid != invcntl->numblk) || (invcntl->param.filestat >= INVBUSY)) {
795 		fseek(invcntl->invfile,
796 		      (imid*invcntl->param.sizeblk) + invcntl->param.cntlsize,
797 		      SEEK_SET);
798 		invcntl->numblk = imid;
799 		fread(invcntl->logblk, (int)invcntl->param.sizeblk, 1,
800 		      invcntl->invfile);
801 	}
802 
803 srch_ext:
804 	/* now find the term in this block. tricky this  */
805 	intptr = (unsigned long *) invcntl->logblk->invblk;
806 
807 	ilow = 0;
808 	ihigh = *intptr - 1;
809 	intptr += 3;
810 	num = 0;
811 	while (ilow <= ihigh) {
812 		imid = (ilow + ihigh) / 2;
813 		entryptr = (ENTRY *)intptr + imid;
814 		i = strncmp(searchterm, invcntl->logblk->chrblk + entryptr->offset,
815 		    (int) entryptr->size );
816 		if (i == 0)
817 			i = strlen(searchterm) - entryptr->size;
818 		if (i < 0)
819 			ihigh = imid - 1;
820 		else if (i > 0)
821 			ilow = ++imid;
822 		else {
823 			num = entryptr->post;
824 			break;
825 		}
826 	}
827 	/* be careful about case where searchterm is after last in this block  */
828 	if (imid >= invcntl->logblk->invblk[0]) {
829 		invcntl->keypnt = invcntl->logblk->invblk[0];
830 		invstep(invcntl);
831 		/* note if this happens the term could be in extended block */
832 		if (invcntl->param.startbyte < invcntl->numblk * invcntl->param.sizeblk)
833 			goto srch_ext;
834 	} else
835 		invcntl->keypnt = imid;
836 	return(num);
837 }
838 
839 #if DEBUG
840 
841 /** invdump dumps the block the term parameter is in **/
842 void
invdump(INVCONTROL * invcntl,char * term)843 invdump(INVCONTROL *invcntl, char *term)
844 {
845 	long	i, j, n, *longptr;
846 	ENTRY * entryptr;
847 	char	temp[512], *ptr;
848 
849 	/* dump superindex if term is "-"  */
850 	if (*term == '-') {
851 		j = atoi(term + 1);
852 		longptr = (long *)invcntl->iindex;
853 		n = *longptr++;
854 		printf("Superindex dump, num blocks=%ld\n", n);
855 		longptr += j;
856 		while ((longptr <= ((long *)invcntl->iindex) + n) && invbreak == 0) {
857 			printf("%2ld  %6ld %s\n", j++, *longptr, invcntl->iindex + *longptr);
858 			longptr++;
859 		}
860 		return;
861 	} else if (*term == '#') {
862 		j = atoi(term + 1);
863 		/* fetch the appropriate logical block */
864 		invcntl->numblk = j;
865 		fseek(invcntl->invfile,
866 		      (j * invcntl->param.sizeblk) + invcntl->param.cntlsize,
867 		      SEEK_SET);
868 		fread(invcntl->logblk, (int) invcntl->param.sizeblk, 1,
869 		      invcntl->invfile);
870 	} else
871 		i = abs((int) invfind(invcntl, term));
872 	longptr = invcntl->logblk->invblk;
873 	n = *longptr++;
874 	printf("Entry term to invdump=%s, postings=%ld, forwrd ptr=%ld, back ptr=%ld\n"
875 	    , term, i, *(longptr), *(longptr + 1));
876 	/* FIXME HBB: magic number alert! (3) */
877 	entryptr = (ENTRY *) (invcntl->logblk->invblk + 3);
878 	printf("%ld terms in this block, block=%ld\n", n, invcntl->numblk);
879 	printf("\tterm\t\t\tposts\tsize\toffset\tspace\t1st word\n");
880 	for (j = 0; j < n && invbreak == 0; j++) {
881 		ptr = invcntl->logblk->chrblk + entryptr->offset;
882 		strncpy(temp, ptr, (int) entryptr->size);
883 		temp[entryptr->size] = '\0';
884 		ptr += (sizeof(long) * (long)((entryptr->size + (sizeof(long) - 1)) / sizeof(long)));
885 		printf("%2ld  %-24s\t%5ld\t%3d\t%d\t%d\t%ld\n", j, temp, entryptr->post,
886 		    entryptr->size, entryptr->offset, entryptr->space,
887 		    *(long *)ptr);
888 		entryptr++;
889 	}
890 }
891 #endif
892 
893 static int
boolready(void)894 boolready(void)
895 {
896 	numitems = 0;
897 	if (item1 != NULL)
898 		free(item1);
899 	setsize1 = SETINC;
900 	if ((item1 = malloc(SETINC * sizeof(*item1))) == NULL) {
901 		invcannotalloc(SETINC);
902 		return(-1);
903 	}
904 	if (item2 != NULL)
905 		free(item2);
906 	setsize2 = SETINC;
907 	if ((item2 = malloc(SETINC * sizeof(*item2))) == NULL) {
908 		invcannotalloc(SETINC);
909 		return(-1);
910 	}
911 	item = item1;
912 	enditem = item;
913 	return(0);
914 }
915 
916 void
boolclear(void)917 boolclear(void)
918 {
919 	numitems = 0;
920 	item = item1;
921 	enditem = item;
922 }
923 
924 POSTING *
boolfile(INVCONTROL * invcntl,long * num,int boolarg)925 boolfile(INVCONTROL *invcntl, long *num, int boolarg)
926 {
927 	ENTRY	*entryptr;
928 	FILE	*file;
929 	void	*ptr;
930 	unsigned long	*ptr2;
931 	POSTING	*newitem = NULL; /* initialize, to avoid warning */
932 	POSTING	posting;
933 	unsigned u;
934 	POSTING *newsetp = NULL, *set1p;
935 	long	newsetc, set1c, set2c;
936 
937 	/* FIXME HBB: magic number alert! (3) */
938 	entryptr = (ENTRY *) (invcntl->logblk->invblk + 3) + invcntl->keypnt;
939 	ptr = invcntl->logblk->chrblk + entryptr->offset;
940 	ptr2 = ((unsigned long *) ptr) + (entryptr->size + (sizeof(long) - 1)) / sizeof(long);
941 	*num = entryptr->post;
942 	switch (boolarg) {
943 	case BOOL_OR:
944 	case NOT:
945 		if (*num == 0) {
946 			*num = numitems;
947 			return(item);
948 		}
949 	}
950 	/* make room for the new set */
951 	u = 0;
952 	switch (boolarg) {
953 	case AND:
954 	case NOT:
955 		newsetp = item;
956 		break;
957 
958 	case BOOL_OR:
959 		u = enditem - item;
960 		/* FALLTHROUGH */
961 	case REVERSENOT:
962 		u += *num;
963 		if (item == item2) {
964 			if (u > setsize1) {
965 				u += SETINC;
966 				if ((item1 = realloc(item1, u * sizeof(*item1))) == NULL) {
967 					invcannotalloc(u * sizeof(*item1));
968 					boolready();
969 					*num = -1;
970 					return(NULL);
971 				}
972 				setsize1 = u;
973 			}
974 			newitem = item1;
975 		}
976 		else {
977 			if (u > setsize2) {
978 				u += SETINC;
979 				if ((item2 = realloc(item2, u * sizeof(*item2))) == NULL) {
980 					invcannotalloc(u * sizeof(*item2));
981 					boolready();
982 					*num = -1;
983 					return(NULL);
984 				}
985 				setsize2 = u;
986 			}
987 			newitem = item2;
988 		}
989 #if 0 /* this write is only need by commented-out code later */
990 		set1p = item;
991 #endif
992 		newsetp = newitem;
993 	}
994 	file = invcntl->postfile;
995 	fseek(file, *ptr2, SEEK_SET);
996 	fread(&posting, sizeof(posting), 1, file);
997 	newsetc = 0;
998 	switch (boolarg) {
999 	case BOOL_OR:
1000 		/* while something in both sets */
1001 		set1p = item;
1002 		newsetp = newitem;
1003 		for (set1c = 0, set2c = 0;
1004 		    set1c < numitems && set2c < *num; newsetc++) {
1005 			if (set1p->lineoffset < posting.lineoffset) {
1006 				*newsetp++ = *set1p++;
1007 				set1c++;
1008 			}
1009 			else if (set1p->lineoffset > posting.lineoffset) {
1010 				*newsetp++ = posting;
1011 				fread(&posting, (int) sizeof(posting), 1, file);
1012 				set2c++;
1013 			}
1014 			else if (set1p->type < posting.type) {
1015 				*newsetp++ = *set1p++;
1016 				set1c++;
1017 			}
1018 			else if (set1p->type > posting.type) {
1019 				*newsetp++ = posting;
1020 				fread(&posting, (int) sizeof(posting), 1, file);
1021 				set2c++;
1022 			}
1023 			else {	/* identical postings */
1024 				*newsetp++ = *set1p++;
1025 				set1c++;
1026 				fread(&posting, (int) sizeof(posting), 1, file);
1027 				set2c++;
1028 			}
1029 		}
1030 		/* find out what ran out and move the rest in */
1031 		if (set1c < numitems) {
1032 			newsetc += numitems - set1c;
1033 			while (set1c++ < numitems) {
1034 				*newsetp++ = *set1p++;
1035 			}
1036 		} else {
1037 			while (set2c++ < *num) {
1038 				*newsetp++ = posting;
1039 				newsetc++;
1040 				fread(&posting, (int) sizeof(posting), 1, file);
1041 			}
1042 		}
1043 		item = newitem;
1044 		break; /* end of BOOL_OR */
1045 #if 0
1046 	case AND:
1047 		for (set1c = 0, set2c = 0; set1c < numitems && set2c < *num; ) {
1048 			if (set1p->lineoffset < posting.lineoffset) {
1049 				set1p++;
1050 				set1c++;
1051 			}
1052 			else if (set1p->lineoffset > posting.lineoffset) {
1053 				fread(&posting, (int) sizeof(posting), 1, file);
1054 				set2c++;
1055 			}
1056 			else if (set1p->type < posting.type)  {
1057 				*set1p++;
1058 				set1c++;
1059 			}
1060 			else if (set1p->type > posting.type) {
1061 				fread(&posting, (int) sizeof(posting), 1, file);
1062 				set2c++;
1063 			}
1064 			else {	/* identical postings */
1065 				*newsetp++ = *set1p++;
1066 				newsetc++;
1067 				set1c++;
1068 				fread(&posting, (int) sizeof(posting), 1, file);
1069 				set2c++;
1070 			}
1071 		}
1072 		break; /* end of AND */
1073 
1074 	case NOT:
1075 		for (set1c = 0, set2c = 0; set1c < numitems && set2c < *num; ) {
1076 			if (set1p->lineoffset < posting.lineoffset) {
1077 				*newsetp++ = *set1p++;
1078 				newsetc++;
1079 				set1c++;
1080 			}
1081 			else if (set1p->lineoffset > posting.lineoffset) {
1082 				fread(&posting, (int) sizeof(posting), 1, file);
1083 				set2c++;
1084 			}
1085 			else if (set1p->type < posting.type) {
1086 				*newsetp++ = *set1p++;
1087 				newsetc++;
1088 				set1c++;
1089 			}
1090 			else if (set1p->type > posting.type) {
1091 				fread(&posting, (int) sizeof(posting), 1, file);
1092 				set2c++;
1093 			}
1094 			else {	/* identical postings */
1095 				set1c++;
1096 				set1p++;
1097 				fread(&posting, (int) sizeof(posting), 1, file);
1098 				set2c++;
1099 			}
1100 		}
1101 		newsetc += numitems - set1c;
1102 		while (set1c++ < numitems) {
1103 			*newsetp++ = *set1p++;
1104 		}
1105 		break; /* end of NOT */
1106 
1107 	case REVERSENOT:  /* core NOT incoming set */
1108 		for (set1c = 0, set2c = 0; set1c < numitems && set2c < *num; ) {
1109 			if (set1p->lineoffset < posting.lineoffset) {
1110 				set1p++;
1111 				set1c++;
1112 			}
1113 			else if (set1p->lineoffset > posting.lineoffset) {
1114 				*newsetp++ = posting;
1115 				fread(&posting, (int) sizeof(posting), 1, file);
1116 				set2c++;
1117 			}
1118 			else if (set1p->type < posting.type) {
1119 				set1p++;
1120 				set1c++;
1121 			}
1122 			else if (set1p->type > posting.type) {
1123 				*newsetp++ = posting;
1124 				fread(&posting, (int) sizeof(posting), 1, file);
1125 				set2c++;
1126 			}
1127 			else {	/* identical postings */
1128 				set1c++;
1129 				set1p++;
1130 				fread(&posting, (int) sizeof(posting), 1, file);
1131 				set2c++;
1132 			}
1133 		}
1134 		while (set2c++ < *num) {
1135 			*newsetp++ = posting;
1136 			newsetc++;
1137 			fread(&posting, (int) sizeof(posting), 1, file);
1138 		}
1139 		item = newitem;
1140 		break; /* end of REVERSENOT  */
1141 #endif
1142 	}
1143 	numitems = newsetc;
1144 	*num = newsetc;
1145 	enditem = (POSTING *) newsetp;
1146 	return((POSTING *) item);
1147 }
1148 
1149 #if 0
1150 POSTING *
1151 boolsave(int clear)		/* flag about whether to clear core  */
1152 {
1153 	int	i;
1154 	POSTING	*ptr;
1155 	POSTING	*oldstuff, *newstuff;
1156 
1157 	if (numitems == 0) {
1158 		if (clear)
1159 			boolclear();
1160 		return(NULL);
1161 	}
1162 	/* if clear then give them what we have and use boolready to realloc  */
1163 	if (clear) {
1164 		ptr = item;
1165 		/* free up the space we didn't give them */
1166 		if (item == item1)
1167 			item1 = NULL;
1168 		else
1169 			item2 = NULL;
1170 		boolready();
1171 		return(ptr);
1172 	}
1173 	i = (enditem - item) * sizeof(*ptr) + 100;
1174 	if ((ptr = malloc(i)) == NULL) {
1175 		invcannotalloc(i);
1176 		return(ptr);
1177 	}
1178 	/* move present set into place  */
1179 	oldstuff = item;
1180 	newstuff = ptr;
1181 	while (oldstuff < enditem)
1182 		*newstuff++ = *oldstuff++;
1183 	return(ptr);
1184 }
1185 #endif
1186 
1187 static void
invcannotalloc(unsigned n)1188 invcannotalloc(unsigned n)
1189 {
1190 	fprintf(stderr, "%s: cannot allocate %u bytes\n", argv0, n);
1191 }
1192 
1193 static void
invcannotopen(char * file)1194 invcannotopen(char *file)
1195 {
1196 	fprintf(stderr, "%s: cannot open file %s\n", argv0, file);
1197 }
1198 
1199 static void
invcannotwrite(char * file)1200 invcannotwrite(char *file)
1201 {
1202 	perror(argv0);	/* must be first to preserve errno */
1203 	fprintf(stderr, "%s: write to file %s failed\n", argv0, file);
1204 }
1205