1 /*===========================================================================
2 Copyright (c) 1998-2000, The Santa Cruz Operation
3 All rights reserved.
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7
8 *Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10
11 *Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14
15 *Neither name of The Santa Cruz Operation nor the names of its contributors
16 may be used to endorse or promote products derived from this software
17 without specific prior written permission.
18
19 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS
20 IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21 THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE
23 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 INTERRUPTION)
27 HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
30 DAMAGE.
31 =========================================================================*/
32
33
34 #include <ctype.h>
35 #include <stdio.h>
36 #include <stdlib.h>
37 #if SHARE
38 #include <sys/types.h>
39 #include <sys/ipc.h>
40 #include <sys/shm.h>
41 #define ERR -1
42 #endif
43 #include "invlib.h"
44 #include "global.h"
45
46 #include <assert.h>
47
48 #define DEBUG 0 /* debugging code and realloc messages */
49 #define BLOCKSIZE 2 * BUFSIZ /* logical block size */
50 #define POSTINC 10000 /* posting buffer size increment */
51 #define SEP ' ' /* sorted posting field separator */
52 #define SETINC 100 /* posting set size increment */
53 #define STATS 0 /* print statistics */
54 #define SUPERINC 10000 /* super index size increment */
55 #define TERMMAX 512 /* term max size */
56 #define FMTVERSION 1 /* inverted index format version */
57 #define ZIPFSIZE 200 /* zipf curve size */
58
59 #if DEBUG
60 /* FIXME HBB 20010705: nowhere in the source is `invbreak' ever set to
61 * a value other than the (silent) initialization to zero. Pretty
62 * useless, that looks */
63 int invbreak;
64 #endif
65
66 static int boolready(void);
67 static int invnewterm(void);
68 static void invstep(INVCONTROL *invcntl);
69 static void invcannotalloc(unsigned n);
70 static void invcannotopen(char *file);
71 static void invcannotwrite(char *file);
72
73 #if STATS
74 int showzipf; /* show postings per term distribution */
75 #endif
76
77 static POSTING *item, *enditem, *item1 = NULL, *item2 = NULL;
78 static unsigned int setsize1, setsize2;
79 static long numitems, totterm, zerolong;
80 static char *indexfile, *postingfile;
81 static FILE *outfile, *fpost;
82 static size_t supersize = SUPERINC, supintsize;
83 static unsigned int numpost, numlogblk, amtused, nextpost;
84 static unsigned int lastinblk, numinvitems;
85 static POSTING *POST, *postptr;
86 static unsigned long *SUPINT, *supint, nextsupfing;
87 static char *SUPFING, *supfing;
88 static char thisterm[TERMMAX];
89 typedef union logicalblk {
90 long invblk[BLOCKSIZE / sizeof(long)];
91 char chrblk[BLOCKSIZE];
92 } t_logicalblk;
93 static t_logicalblk logicalblk;
94
95 #if DEBUG || STATS
96 static long totpost;
97 #endif
98
99 #if STATS
100 static int zipf[ZIPFSIZE + 1];
101 #endif
102
103 long
invmake(char * invname,char * invpost,FILE * infile)104 invmake(char *invname, char *invpost, FILE *infile)
105 {
106 char *s;
107 long num;
108 int i;
109 long fileindex = 0; /* initialze, to avoid warning */
110 unsigned postsize = POSTINC * sizeof(*POST);
111 unsigned long *intptr;
112 char line[TERMMAX];
113 long tlong;
114 PARAM param;
115 POSTING posting;
116 char temp[BLOCKSIZE];
117 #if STATS
118 int j;
119 unsigned maxtermlen = 0;
120 #endif
121 /* output file */
122 if ((outfile = vpfopen(invname, "w+b")) == NULL) {
123 invcannotopen(invname);
124 return(0);
125 }
126 indexfile = invname;
127 fseek(outfile, BUFSIZ, SEEK_SET);
128
129 /* posting file */
130 if ((fpost = vpfopen(invpost, "wb")) == NULL) {
131 invcannotopen(invpost);
132 return(0);
133 }
134 postingfile = invpost;
135 nextpost = 0;
136 /* get space for the postings list */
137 if ((POST = malloc(postsize)) == NULL) {
138 invcannotalloc(postsize);
139 return(0);
140 }
141 postptr = POST;
142 /* get space for the superfinger (superindex) */
143 if ((SUPFING = malloc(supersize)) == NULL) {
144 invcannotalloc(supersize);
145 return(0);
146 }
147 supfing = SUPFING;
148 /* FIXME HBB: magic number alert (40) */
149 supintsize = supersize / 40u;
150 /* also for the superfinger index */
151 if ((SUPINT = malloc(supintsize * sizeof(*SUPINT))) == NULL) {
152 invcannotalloc(supintsize * sizeof(*SUPINT));
153 return(0);
154 }
155 supint = SUPINT;
156 supint++; /* leave first term open for a count */
157 /* initialize using an empty term */
158 strcpy(thisterm, "");
159 *supint++ = 0;
160 *supfing++ = ' ';
161 *supfing++ = '\0';
162 nextsupfing = 2;
163 #if DEBUG || STATS
164 totpost = 0L;
165 #endif
166 totterm = 0L;
167 numpost = 1;
168
169 /* set up as though a block had come and gone, i.e., set up for new block */
170 /* 3 longs needed for: numinvitems, next block, and previous block */
171 amtused = 3 * sizeof(long);
172 numinvitems = 0;
173 numlogblk = 0;
174 lastinblk = sizeof(t_logicalblk);
175
176 /* now loop as long as more to read (till eof) */
177 while (fgets(line, TERMMAX, infile) != NULL) {
178 #if DEBUG || STATS
179 ++totpost;
180 #endif
181 s = strchr(line, SEP);
182 if (s != NULL) {
183 *s = '\0';
184 }
185 else {
186 continue;
187 }
188 #if STATS
189 if ((i = strlen(line)) > maxtermlen) {
190 maxtermlen = i;
191 }
192 #endif
193 #if DEBUG
194 printf("%ld: %s ", totpost, line);
195 fflush(stdout);
196 #endif
197 if (strcmp(thisterm, line) == 0) {
198 if ((postptr + 10) > (POST + (postsize / sizeof(*POST)))) {
199 i = postptr - POST;
200 postsize += POSTINC * sizeof(*POST);
201 if ((POST = realloc(POST, postsize)) == NULL) {
202 invcannotalloc(postsize);
203 return(0);
204 }
205 postptr = i + POST;
206 #if DEBUG
207 printf("reallocated post space to %u, totpost=%ld\n",
208 postsize, totpost);
209 #endif
210 }
211 numpost++;
212 } else {
213 /* have a new term */
214 if (!invnewterm()) {
215 return(0);
216 }
217 strcpy(thisterm, line);
218 numpost = 1;
219 postptr = POST;
220 fileindex = 0;
221 }
222 /* get the new posting */
223 num = *++s - '!';
224 i = 1;
225 do {
226 num = BASE * num + *++s - '!';
227 } while (++i < PRECISION);
228 posting.lineoffset = num;
229 while (++fileindex < nsrcfiles && num > srcoffset[fileindex]) {
230 ;
231 }
232 posting.fileindex = --fileindex;
233 posting.type = *++s;
234 ++s;
235 if (*s != '\n') {
236 num = *++s - '!';
237 while (*++s != '\n') {
238 num = BASE * num + *s - '!';
239 }
240 posting.fcnoffset = num;
241 }
242 else {
243 posting.fcnoffset = 0;
244 }
245 *postptr++ = posting;
246 #if DEBUG
247 printf("%ld %ld %ld %ld\n", posting.fileindex,
248 posting.fcnoffset, posting.lineoffset, posting.type);
249 fflush(stdout);
250 #endif
251 }
252 if (!invnewterm()) {
253 return(0);
254 }
255 /* now clean up final block */
256 logicalblk.invblk[0] = numinvitems;
257 /* loops pointer around to start */
258 logicalblk.invblk[1] = 0;
259 logicalblk.invblk[2] = numlogblk - 1;
260 if (fwrite(&logicalblk, sizeof(t_logicalblk), 1, outfile) == 0) {
261 goto cannotwrite;
262 }
263 numlogblk++;
264 /* write out block to save space. what in it doesn't matter */
265 if (fwrite(&logicalblk, sizeof(t_logicalblk), 1, outfile) == 0) {
266 goto cannotwrite;
267 }
268 /* finish up the super finger */
269 *SUPINT = numlogblk;
270 /* add to the offsets the size of the offset pointers */
271 intptr = (SUPINT + 1);
272 i = (char *)supint - (char *)SUPINT;
273 while (intptr < supint)
274 *intptr++ += i;
275 /* write out the offsets (1 for the N at start) and the super finger */
276 if (fwrite(SUPINT, sizeof(*SUPINT), numlogblk + 1, outfile) == 0 ||
277 fwrite(SUPFING, 1, supfing - SUPFING, outfile) == 0) {
278 goto cannotwrite;
279 }
280 /* save the size for reference later */
281 nextsupfing = sizeof(long) + sizeof(long) * numlogblk + (supfing - SUPFING);
282 /* make sure the file ends at a logical block boundary. This is
283 necessary for invinsert to correctly create extended blocks
284 */
285 i = nextsupfing % sizeof(t_logicalblk);
286 /* write out junk to fill log blk */
287 if (fwrite(temp, sizeof(t_logicalblk) - i, 1, outfile) == 0 ||
288 fflush(outfile) == EOF) { /* rewind doesn't check for write failure */
289 goto cannotwrite;
290 }
291 /* write the control area */
292 rewind(outfile);
293 param.version = FMTVERSION;
294 param.filestat = 0;
295 param.sizeblk = sizeof(t_logicalblk);
296 param.startbyte = (numlogblk + 1) * sizeof(t_logicalblk) + BUFSIZ;;
297 param.supsize = nextsupfing;
298 param.cntlsize = BUFSIZ;
299 param.share = 0;
300 if (fwrite(¶m, sizeof(param), 1, outfile) == 0) {
301 goto cannotwrite;
302 }
303 for (i = 0; i < 10; i++) /* for future use */
304 if (fwrite(&zerolong, sizeof(zerolong), 1, outfile) == 0) {
305 goto cannotwrite;
306 }
307
308 /* make first block loop backwards to last block */
309 if (fflush(outfile) == EOF) { /* fseek doesn't check for write failure */
310 goto cannotwrite;
311 }
312 /* get to second word first block */
313 fseek(outfile, BUFSIZ + 2 * sizeof(long), SEEK_SET);
314 tlong = numlogblk - 1;
315 if (fwrite(&tlong, sizeof(tlong), 1, outfile) == 0 ||
316 fclose(outfile) == EOF) {
317 cannotwrite:
318 invcannotwrite(invname);
319 return(0);
320 }
321 if (fclose(fpost) == EOF) {
322 invcannotwrite(postingfile);
323 return(0);
324 }
325 --totterm; /* don't count null term */
326 #if STATS
327 printf("logical blocks = %d, postings = %ld, terms = %ld, max term length = %d\n",
328 numlogblk, totpost, totterm, maxtermlen);
329 if (showzipf) {
330 printf("\n************* ZIPF curve ****************\n");
331 for (j = ZIPFSIZE; j > 1; j--)
332 if (zipf[j])
333 break;
334 for (i = 1; i < j; ++i) {
335 printf("%3d -%6d ", i, zipf[i]);
336 if (i % 6 == 0) putchar('\n');
337 }
338 printf(">%d-%6d\n", ZIPFSIZE, zipf[0]);
339 }
340 #endif
341 /* free all malloc'd memory */
342 free(POST);
343 free(SUPFING);
344 free(SUPINT);
345 return(totterm);
346 }
347
348 /* add a term to the data base */
349
350 static int
invnewterm(void)351 invnewterm(void)
352 {
353 int backupflag, i, j, holditems, gooditems, howfar;
354 unsigned int maxback, len, numwilluse, wdlen;
355 char *tptr, *tptr3;
356
357 union {
358 unsigned long packword[2];
359 ENTRY e;
360 } iteminfo;
361
362 gooditems = 0; /* initialize, to avoid warning */
363 totterm++;
364 #if STATS
365 /* keep zipfian info on the distribution */
366 if (numpost <= ZIPFSIZE)
367 zipf[numpost]++;
368 else
369 zipf[0]++;
370 #endif
371 len = strlen(thisterm);
372 /* length of term rounded up to long boundary */
373 wdlen = (len + (sizeof(long) - 1)) / sizeof(long);
374 /* each term needs 2 longs for its iteminfo and
375 * 1 long for its offset */
376 numwilluse = (wdlen + 3) * sizeof(long);
377 /* new block if at least 1 item in block */
378 if (numinvitems && numwilluse + amtused > sizeof(t_logicalblk)) {
379 /* set up new block */
380 if (supfing + 500u > SUPFING + supersize) {
381 i = supfing - SUPFING;
382 supersize += 20000u;
383 if ((SUPFING = realloc(SUPFING, supersize)) == NULL) {
384 invcannotalloc(supersize);
385 return(0);
386 }
387 supfing = i + SUPFING;
388 #if DEBUG
389 printf("reallocated superfinger space to %d, totpost=%ld\n",
390 supersize, totpost);
391 #endif
392 }
393 /* check that room for the offset as well */
394 /* FIXME HBB: magic number alert (10) */
395 if ((numlogblk + 10) > supintsize) {
396 i = supint - SUPINT;
397 supintsize += SUPERINC;
398 if ((SUPINT = realloc(SUPINT, supintsize * sizeof(*SUPINT))) == NULL) {
399 invcannotalloc(supintsize * sizeof(*SUPINT));
400 return(0);
401 }
402 supint = i + SUPINT;
403 #if DEBUG
404 printf("reallocated superfinger offset to %d, totpost = %ld\n", supintsize * sizeof(*SUPINT), totpost);
405 #endif
406 }
407 /* See if backup is efficatious */
408 backupflag = 0;
409 maxback = (int) strlen(thisterm) / 10;
410 holditems = numinvitems;
411 if (maxback > numinvitems)
412 maxback = numinvitems - 2;
413 howfar = 0;
414 while (maxback-- > 1) {
415 howfar++;
416 iteminfo.packword[0] =
417 logicalblk.invblk[--holditems * 2 + (sizeof(long) - 1)];
418 if ((i = iteminfo.e.size / 10) < maxback) {
419 maxback = i;
420 backupflag = howfar;
421 gooditems = holditems;
422 }
423 }
424 /* see if backup will occur */
425 if (backupflag) {
426 numinvitems = gooditems;
427 }
428 logicalblk.invblk[0] = numinvitems;
429 /* set forward pointer pointing to next */
430 logicalblk.invblk[1] = numlogblk + 1;
431 /* set back pointer to last block */
432 logicalblk.invblk[2] = numlogblk - 1;
433 if (fwrite(logicalblk.chrblk, 1, sizeof(t_logicalblk), outfile) == 0) {
434 invcannotwrite(indexfile);
435 return(0);
436 }
437 /* 3 longs needed for: numinvitems, next block, and previous block */
438 amtused = 3 * sizeof(long);
439 numlogblk++;
440 /* check if had to back up, if so do it */
441 if (backupflag) {
442 char *tptr2;
443
444 /* find out where the end of the new block is */
445 iteminfo.packword[0] = logicalblk.invblk[numinvitems*2+1];
446 tptr3 = logicalblk.chrblk + iteminfo.e.offset;
447 /* move the index for this block */
448 for (i = 3; i <= (backupflag * 2 + 2); i++)
449 logicalblk.invblk[i] = logicalblk.invblk[numinvitems*2+i];
450 /* move the word into the super index */
451 iteminfo.packword[0] = logicalblk.invblk[3];
452 iteminfo.packword[1] = logicalblk.invblk[4];
453 tptr2 = logicalblk.chrblk + iteminfo.e.offset;
454 strncpy(supfing, tptr2, (int) iteminfo.e.size);
455 *(supfing + iteminfo.e.size) = '\0';
456 #if DEBUG
457 printf("backup %d at term=%s to term=%s\n",
458 backupflag, thisterm, supfing);
459 #endif
460 *supint++ = nextsupfing;
461 nextsupfing += strlen(supfing) + 1;
462 supfing += strlen(supfing) + 1;
463 /* now fix up the logical block */
464 tptr = logicalblk.chrblk + lastinblk;
465 lastinblk = sizeof(t_logicalblk);
466 tptr2 = logicalblk.chrblk + lastinblk;
467 j = tptr3 - tptr;
468 while (tptr3 > tptr)
469 *--tptr2 = *--tptr3;
470 lastinblk -= j;
471 amtused += ((2 * sizeof(long)) * backupflag + j);
472 for (i = 3; i < (backupflag * 2 + 2); i += 2) {
473 iteminfo.packword[0] = logicalblk.invblk[i];
474 iteminfo.e.offset += (tptr2 - tptr3);
475 logicalblk.invblk[i] = iteminfo.packword[0];
476 }
477 numinvitems = backupflag;
478 } else { /* no backup needed */
479 numinvitems = 0;
480 lastinblk = sizeof(t_logicalblk);
481 /* add new term to superindex */
482 strcpy(supfing, thisterm);
483 supfing += strlen(thisterm) + 1;
484 *supint++ = nextsupfing;
485 nextsupfing += strlen(thisterm) + 1;
486 }
487 }
488 /* HBB 20010501: Fixed bug by replacing magic number '8' by
489 * what it actually represents. */
490 lastinblk -= (numwilluse - 2 * sizeof(long));
491 iteminfo.e.offset = lastinblk;
492 iteminfo.e.size = len;
493 iteminfo.e.space = 0;
494 iteminfo.e.post = numpost;
495 strncpy(logicalblk.chrblk + lastinblk, thisterm, len);
496 amtused += numwilluse;
497 logicalblk.invblk[(lastinblk/sizeof(long))+wdlen] = nextpost;
498 if ((i = postptr - POST) > 0) {
499 if (fwrite(POST, sizeof(*POST), i, fpost) == 0) {
500 invcannotwrite(postingfile);
501 return(0);
502 }
503 nextpost += i * sizeof(*POST);
504 }
505 logicalblk.invblk[3+2*numinvitems++] = iteminfo.packword[0];
506 logicalblk.invblk[2+2*numinvitems] = iteminfo.packword[1];
507 return(1);
508 }
509
510 /*
511 * If 'invname' ends with the 'from' substring, it is replaced inline with the
512 * 'to' substring (which must be of the exact same length), and the function
513 * returns 0. Otherwise, returns -1.
514 */
515
516 static int
invflipname(char * invname,const char * from,const char * to)517 invflipname(char * invname, const char *from, const char *to)
518 {
519 char *temp, *i = NULL;
520
521 assert(strlen(from) == strlen(to));
522
523 temp = invname - 1;
524 while( (temp = strstr(temp + 1, from)))
525 i = temp;
526 if (!i || i[strlen(from)] != '\0')
527 return -1;
528 while(*to)
529 *i++ = *to++;
530 return 0;
531 }
532
533 /* small helper function to centralize handling of binary opening
534 * for reading, and use of the 'stat" flag */
535 static FILE *
open_for_reading(char * name,int stat)536 open_for_reading(char *name, int stat)
537 {
538 return vpfopen(name, ((stat == 0) ? "rb" : "r+b"));
539 }
540
541 /* handle opening of a file under a possibly "flipped" name */
542 /* If db created without '-f', but now invoked with '-f cscope.out',
543 * we need to check for 'cscope.in.out', rather than 'cscope.out.in':
544 * I.e, hack around our own violation of the inverse db naming convention */
545 /* more silliness: if you create the db with '-f cscope', then try to open
546 * it without '-f cscope', you'll fail unless we check for 'cscope.out.in'
547 * here. */
548 static FILE *
open_file_with_flipped_name(char * name,const char * flip_in,const char * flip_out,int stat)549 open_file_with_flipped_name(char *name, const char *flip_in, const char *flip_out, int stat)
550 {
551 if (! invflipname(name, flip_in, flip_out)) {
552 FILE *fptr = open_for_reading(name, stat);
553 if (! fptr)
554 /* flip back for error message */
555 invflipname(name, flip_out, flip_in);
556 return fptr;
557 };
558 return 0;
559 }
560
561 static FILE *
open_file_with_possibly_flipped_name(char * name,const char * flip1,const char * flip2,int stat)562 open_file_with_possibly_flipped_name(char *name, const char *flip1, const char *flip2, int stat)
563 {
564 FILE *fptr = open_for_reading(name, stat);
565
566 if (! fptr)
567 fptr = open_file_with_flipped_name(name, flip2, flip1, stat);
568 if (! fptr)
569 fptr = open_file_with_flipped_name(name, flip1, flip2, stat);
570 return fptr;
571 }
572
573 int
invopen(INVCONTROL * invcntl,char * invname,char * invpost,int stat)574 invopen(INVCONTROL *invcntl, char *invname, char *invpost, int stat)
575 {
576 int read_index;
577
578 invcntl->invfile = open_file_with_possibly_flipped_name(invname, INVNAME, INVNAME2, stat);
579 if (! invcntl->invfile) {
580 invcannotopen(invname);
581 return(-1);
582 }
583 if (fread(&invcntl->param, sizeof(invcntl->param), 1, invcntl->invfile) == 0) {
584 fprintf(stderr, "%s: empty inverted file\n", argv0);
585 fclose(invcntl->invfile);
586 return(-1);
587 }
588 if (invcntl->param.version != FMTVERSION) {
589 fprintf(stderr, "%s: cannot read old index format; use -U option to force database to rebuild\n", argv0);
590 fclose(invcntl->invfile);
591 return(-1);
592 }
593 assert(invcntl->param.sizeblk == sizeof(t_logicalblk));
594
595 if (stat == 0 && invcntl->param.filestat == INVALONE) {
596 fprintf(stderr, "%s: inverted file is locked\n", argv0);
597 fclose(invcntl->invfile);
598 return(-1);
599 }
600
601 invcntl->postfile = open_file_with_possibly_flipped_name(invpost, INVPOST, INVPOST2, stat);
602 if (! invcntl->postfile) {
603 invcannotopen(invpost);
604 fclose(invcntl->invfile);
605 return(-1);
606 }
607
608 /* allocate core for a logical block */
609 if ((invcntl->logblk = malloc((size_t) invcntl->param.sizeblk)) == NULL) {
610 invcannotalloc((size_t) invcntl->param.sizeblk);
611 fclose(invcntl->postfile);
612 fclose(invcntl->invfile);
613 return(-1);
614 }
615 /* allocate for and read in superfinger */
616 read_index = 1;
617 invcntl->iindex = NULL;
618 #if SHARE
619 if (invcntl->param.share == 1) {
620 key_t shm_key;
621 struct shmid_ds shm_buf;
622 int shm_id;
623
624 /* see if the shared segment exists */
625 shm_key = ftok(invname, 2);
626 shm_id = shmget(shm_key, 0, 0);
627 /* Failure simply means (hopefully) that segment doesn't exists */
628 if (shm_id == -1) {
629 /* Have to give general write permission due to AMdahl not having protected segments */
630 shm_id = shmget(shm_key, invcntl->param.supsize + sizeof(long), IPC_CREAT | 0666);
631 if (shm_id == -1)
632 perror("Could not create shared memory segment");
633 } else
634 read_index = 0;
635
636 if (shm_id != -1) {
637 invcntl->iindex = shmat(shm_id, 0, ((read_index) ? 0 : SHM_RDONLY));
638 if (invcntl->iindex == (char *)ERR) {
639 fprintf(stderr, "%s: shared memory link failed\n", argv0);
640 invcntl->iindex = NULL;
641 read_index = 1;
642 }
643 }
644 }
645 #endif
646 if (invcntl->iindex == NULL)
647 /* FIXME HBB: magic number alert (4, sizeof(long)) */
648 invcntl->iindex = malloc((size_t) invcntl->param.supsize + 4 *sizeof(long));
649 if (invcntl->iindex == NULL) {
650 invcannotalloc((size_t) invcntl->param.supsize);
651 free(invcntl->logblk);
652 fclose(invcntl->postfile);
653 fclose(invcntl->invfile);
654 return(-1);
655 }
656 if (read_index) {
657 fseek(invcntl->invfile, invcntl->param.startbyte, SEEK_SET);
658 fread(invcntl->iindex, (int) invcntl->param.supsize, 1,
659 invcntl->invfile);
660 }
661 invcntl->numblk = -1;
662 if (boolready() == -1) {
663 fclose(invcntl->postfile);
664 fclose(invcntl->invfile);
665 return(-1);
666 }
667 /* write back out the control block if anything changed */
668 invcntl->param.filestat = stat;
669 if (stat > invcntl->param.filestat ) {
670 rewind(invcntl->invfile);
671 fwrite(&invcntl->param, sizeof(invcntl->param), 1, invcntl->invfile);
672 }
673 return(1);
674 }
675
676 /** invclose must be called to wrap things up and deallocate core **/
677 void
invclose(INVCONTROL * invcntl)678 invclose(INVCONTROL *invcntl)
679 {
680 /* write out the control block in case anything changed */
681 if (invcntl->param.filestat > 0) {
682 invcntl->param.filestat = 0;
683 rewind(invcntl->invfile);
684 fwrite(&invcntl->param, 1,
685 sizeof(invcntl->param), invcntl->invfile);
686 }
687 if (invcntl->param.filestat == INVALONE) {
688 /* write out the super finger */
689 fseek(invcntl->invfile, invcntl->param.startbyte, SEEK_SET);
690 fwrite(invcntl->iindex, 1,
691 (int) invcntl->param.supsize, invcntl->invfile);
692 }
693 fclose(invcntl->invfile);
694 fclose(invcntl->postfile);
695 #if SHARE
696 if (invcntl->param.share > 0) {
697 shmdt(invcntl->iindex);
698 invcntl->iindex = NULL;
699 }
700 #endif
701 if (invcntl->iindex != NULL)
702 free(invcntl->iindex);
703 free(invcntl->logblk);
704 }
705
706 /** invstep steps the inverted file forward one item **/
707 static void
invstep(INVCONTROL * invcntl)708 invstep(INVCONTROL *invcntl)
709 {
710 if (invcntl->keypnt < (invcntl->logblk->invblk[0] - 1)) {
711 invcntl->keypnt++;
712 return;
713 }
714
715 /* move forward a block else wrap */
716 invcntl->numblk = invcntl->logblk->invblk[1]; /* was: *(int *)(invcntl->logblk + sizeof(long))*/
717
718 /* now read in the block */
719 fseek(invcntl->invfile,
720 invcntl->numblk*invcntl->param.sizeblk + invcntl->param.cntlsize,
721 SEEK_SET);
722 fread(invcntl->logblk, (int) invcntl->param.sizeblk, 1,
723 invcntl->invfile);
724 invcntl->keypnt = 0;
725 }
726
727 /** invforward moves forward one term in the inverted file **/
728 int
invforward(INVCONTROL * invcntl)729 invforward(INVCONTROL *invcntl)
730 {
731 invstep(invcntl);
732 /* skip things with 0 postings */
733 /* FIXME HBB: magic number alert! (3) */
734 while (((ENTRY * )(invcntl->logblk->invblk + 3) + invcntl->keypnt)->post == 0) {
735 invstep(invcntl);
736 }
737 /* Check for having wrapped - reached start of inverted file! */
738 if ((invcntl->numblk == 0) && (invcntl->keypnt == 0))
739 return(0);
740 return(1);
741 }
742
743 /** invterm gets the present term from the present logical block **/
744 long
invterm(INVCONTROL * invcntl,char * term)745 invterm(INVCONTROL *invcntl, char *term)
746 {
747 ENTRY * entryptr;
748
749 /* FIXME HBB: magic number alert! (3) */
750 entryptr = (ENTRY *)(invcntl->logblk->invblk + 3) + invcntl->keypnt;
751 strncpy(term, invcntl->logblk->chrblk + entryptr->offset,
752 (int) entryptr->size);
753 *(term + entryptr->size) = '\0';
754 return(entryptr->post);
755 }
756
757 /** invfind searches for an individual item in the inverted file **/
758 long
invfind(INVCONTROL * invcntl,char * searchterm)759 invfind(INVCONTROL *invcntl, char *searchterm) /* term being searched for */
760 {
761 int imid, ilow, ihigh;
762 long num;
763 int i;
764 unsigned long *intptr, *intptr2;
765 ENTRY *entryptr;
766
767 /* make sure it is initialized via invready */
768 if (invcntl->invfile == 0)
769 return(-1L);
770
771 /* now search for the appropriate finger block */
772 intptr = (unsigned long *)invcntl->iindex;
773
774 ilow = 0;
775 ihigh = *intptr++ - 1;
776 while (ilow <= ihigh) {
777 imid = (ilow + ihigh) / 2;
778 intptr2 = intptr + imid;
779 i = strcmp(searchterm, (invcntl->iindex + *intptr2));
780 if (i < 0)
781 ihigh = imid - 1;
782 else if (i > 0)
783 ilow = ++imid;
784 else {
785 ilow = imid + 1;
786 break;
787 }
788 }
789 /* be careful about case where searchterm is after last in this block */
790 imid = (ilow) ? ilow - 1 : 0;
791
792 /* fetch the appropriate logical block if not in core */
793 /* note always fetch it if the file is busy */
794 if ((imid != invcntl->numblk) || (invcntl->param.filestat >= INVBUSY)) {
795 fseek(invcntl->invfile,
796 (imid*invcntl->param.sizeblk) + invcntl->param.cntlsize,
797 SEEK_SET);
798 invcntl->numblk = imid;
799 fread(invcntl->logblk, (int)invcntl->param.sizeblk, 1,
800 invcntl->invfile);
801 }
802
803 srch_ext:
804 /* now find the term in this block. tricky this */
805 intptr = (unsigned long *) invcntl->logblk->invblk;
806
807 ilow = 0;
808 ihigh = *intptr - 1;
809 intptr += 3;
810 num = 0;
811 while (ilow <= ihigh) {
812 imid = (ilow + ihigh) / 2;
813 entryptr = (ENTRY *)intptr + imid;
814 i = strncmp(searchterm, invcntl->logblk->chrblk + entryptr->offset,
815 (int) entryptr->size );
816 if (i == 0)
817 i = strlen(searchterm) - entryptr->size;
818 if (i < 0)
819 ihigh = imid - 1;
820 else if (i > 0)
821 ilow = ++imid;
822 else {
823 num = entryptr->post;
824 break;
825 }
826 }
827 /* be careful about case where searchterm is after last in this block */
828 if (imid >= invcntl->logblk->invblk[0]) {
829 invcntl->keypnt = invcntl->logblk->invblk[0];
830 invstep(invcntl);
831 /* note if this happens the term could be in extended block */
832 if (invcntl->param.startbyte < invcntl->numblk * invcntl->param.sizeblk)
833 goto srch_ext;
834 } else
835 invcntl->keypnt = imid;
836 return(num);
837 }
838
839 #if DEBUG
840
841 /** invdump dumps the block the term parameter is in **/
842 void
invdump(INVCONTROL * invcntl,char * term)843 invdump(INVCONTROL *invcntl, char *term)
844 {
845 long i, j, n, *longptr;
846 ENTRY * entryptr;
847 char temp[512], *ptr;
848
849 /* dump superindex if term is "-" */
850 if (*term == '-') {
851 j = atoi(term + 1);
852 longptr = (long *)invcntl->iindex;
853 n = *longptr++;
854 printf("Superindex dump, num blocks=%ld\n", n);
855 longptr += j;
856 while ((longptr <= ((long *)invcntl->iindex) + n) && invbreak == 0) {
857 printf("%2ld %6ld %s\n", j++, *longptr, invcntl->iindex + *longptr);
858 longptr++;
859 }
860 return;
861 } else if (*term == '#') {
862 j = atoi(term + 1);
863 /* fetch the appropriate logical block */
864 invcntl->numblk = j;
865 fseek(invcntl->invfile,
866 (j * invcntl->param.sizeblk) + invcntl->param.cntlsize,
867 SEEK_SET);
868 fread(invcntl->logblk, (int) invcntl->param.sizeblk, 1,
869 invcntl->invfile);
870 } else
871 i = abs((int) invfind(invcntl, term));
872 longptr = invcntl->logblk->invblk;
873 n = *longptr++;
874 printf("Entry term to invdump=%s, postings=%ld, forwrd ptr=%ld, back ptr=%ld\n"
875 , term, i, *(longptr), *(longptr + 1));
876 /* FIXME HBB: magic number alert! (3) */
877 entryptr = (ENTRY *) (invcntl->logblk->invblk + 3);
878 printf("%ld terms in this block, block=%ld\n", n, invcntl->numblk);
879 printf("\tterm\t\t\tposts\tsize\toffset\tspace\t1st word\n");
880 for (j = 0; j < n && invbreak == 0; j++) {
881 ptr = invcntl->logblk->chrblk + entryptr->offset;
882 strncpy(temp, ptr, (int) entryptr->size);
883 temp[entryptr->size] = '\0';
884 ptr += (sizeof(long) * (long)((entryptr->size + (sizeof(long) - 1)) / sizeof(long)));
885 printf("%2ld %-24s\t%5ld\t%3d\t%d\t%d\t%ld\n", j, temp, entryptr->post,
886 entryptr->size, entryptr->offset, entryptr->space,
887 *(long *)ptr);
888 entryptr++;
889 }
890 }
891 #endif
892
893 static int
boolready(void)894 boolready(void)
895 {
896 numitems = 0;
897 if (item1 != NULL)
898 free(item1);
899 setsize1 = SETINC;
900 if ((item1 = malloc(SETINC * sizeof(*item1))) == NULL) {
901 invcannotalloc(SETINC);
902 return(-1);
903 }
904 if (item2 != NULL)
905 free(item2);
906 setsize2 = SETINC;
907 if ((item2 = malloc(SETINC * sizeof(*item2))) == NULL) {
908 invcannotalloc(SETINC);
909 return(-1);
910 }
911 item = item1;
912 enditem = item;
913 return(0);
914 }
915
916 void
boolclear(void)917 boolclear(void)
918 {
919 numitems = 0;
920 item = item1;
921 enditem = item;
922 }
923
924 POSTING *
boolfile(INVCONTROL * invcntl,long * num,int boolarg)925 boolfile(INVCONTROL *invcntl, long *num, int boolarg)
926 {
927 ENTRY *entryptr;
928 FILE *file;
929 void *ptr;
930 unsigned long *ptr2;
931 POSTING *newitem = NULL; /* initialize, to avoid warning */
932 POSTING posting;
933 unsigned u;
934 POSTING *newsetp = NULL, *set1p;
935 long newsetc, set1c, set2c;
936
937 /* FIXME HBB: magic number alert! (3) */
938 entryptr = (ENTRY *) (invcntl->logblk->invblk + 3) + invcntl->keypnt;
939 ptr = invcntl->logblk->chrblk + entryptr->offset;
940 ptr2 = ((unsigned long *) ptr) + (entryptr->size + (sizeof(long) - 1)) / sizeof(long);
941 *num = entryptr->post;
942 switch (boolarg) {
943 case BOOL_OR:
944 case NOT:
945 if (*num == 0) {
946 *num = numitems;
947 return(item);
948 }
949 }
950 /* make room for the new set */
951 u = 0;
952 switch (boolarg) {
953 case AND:
954 case NOT:
955 newsetp = item;
956 break;
957
958 case BOOL_OR:
959 u = enditem - item;
960 /* FALLTHROUGH */
961 case REVERSENOT:
962 u += *num;
963 if (item == item2) {
964 if (u > setsize1) {
965 u += SETINC;
966 if ((item1 = realloc(item1, u * sizeof(*item1))) == NULL) {
967 invcannotalloc(u * sizeof(*item1));
968 boolready();
969 *num = -1;
970 return(NULL);
971 }
972 setsize1 = u;
973 }
974 newitem = item1;
975 }
976 else {
977 if (u > setsize2) {
978 u += SETINC;
979 if ((item2 = realloc(item2, u * sizeof(*item2))) == NULL) {
980 invcannotalloc(u * sizeof(*item2));
981 boolready();
982 *num = -1;
983 return(NULL);
984 }
985 setsize2 = u;
986 }
987 newitem = item2;
988 }
989 #if 0 /* this write is only need by commented-out code later */
990 set1p = item;
991 #endif
992 newsetp = newitem;
993 }
994 file = invcntl->postfile;
995 fseek(file, *ptr2, SEEK_SET);
996 fread(&posting, sizeof(posting), 1, file);
997 newsetc = 0;
998 switch (boolarg) {
999 case BOOL_OR:
1000 /* while something in both sets */
1001 set1p = item;
1002 newsetp = newitem;
1003 for (set1c = 0, set2c = 0;
1004 set1c < numitems && set2c < *num; newsetc++) {
1005 if (set1p->lineoffset < posting.lineoffset) {
1006 *newsetp++ = *set1p++;
1007 set1c++;
1008 }
1009 else if (set1p->lineoffset > posting.lineoffset) {
1010 *newsetp++ = posting;
1011 fread(&posting, (int) sizeof(posting), 1, file);
1012 set2c++;
1013 }
1014 else if (set1p->type < posting.type) {
1015 *newsetp++ = *set1p++;
1016 set1c++;
1017 }
1018 else if (set1p->type > posting.type) {
1019 *newsetp++ = posting;
1020 fread(&posting, (int) sizeof(posting), 1, file);
1021 set2c++;
1022 }
1023 else { /* identical postings */
1024 *newsetp++ = *set1p++;
1025 set1c++;
1026 fread(&posting, (int) sizeof(posting), 1, file);
1027 set2c++;
1028 }
1029 }
1030 /* find out what ran out and move the rest in */
1031 if (set1c < numitems) {
1032 newsetc += numitems - set1c;
1033 while (set1c++ < numitems) {
1034 *newsetp++ = *set1p++;
1035 }
1036 } else {
1037 while (set2c++ < *num) {
1038 *newsetp++ = posting;
1039 newsetc++;
1040 fread(&posting, (int) sizeof(posting), 1, file);
1041 }
1042 }
1043 item = newitem;
1044 break; /* end of BOOL_OR */
1045 #if 0
1046 case AND:
1047 for (set1c = 0, set2c = 0; set1c < numitems && set2c < *num; ) {
1048 if (set1p->lineoffset < posting.lineoffset) {
1049 set1p++;
1050 set1c++;
1051 }
1052 else if (set1p->lineoffset > posting.lineoffset) {
1053 fread(&posting, (int) sizeof(posting), 1, file);
1054 set2c++;
1055 }
1056 else if (set1p->type < posting.type) {
1057 *set1p++;
1058 set1c++;
1059 }
1060 else if (set1p->type > posting.type) {
1061 fread(&posting, (int) sizeof(posting), 1, file);
1062 set2c++;
1063 }
1064 else { /* identical postings */
1065 *newsetp++ = *set1p++;
1066 newsetc++;
1067 set1c++;
1068 fread(&posting, (int) sizeof(posting), 1, file);
1069 set2c++;
1070 }
1071 }
1072 break; /* end of AND */
1073
1074 case NOT:
1075 for (set1c = 0, set2c = 0; set1c < numitems && set2c < *num; ) {
1076 if (set1p->lineoffset < posting.lineoffset) {
1077 *newsetp++ = *set1p++;
1078 newsetc++;
1079 set1c++;
1080 }
1081 else if (set1p->lineoffset > posting.lineoffset) {
1082 fread(&posting, (int) sizeof(posting), 1, file);
1083 set2c++;
1084 }
1085 else if (set1p->type < posting.type) {
1086 *newsetp++ = *set1p++;
1087 newsetc++;
1088 set1c++;
1089 }
1090 else if (set1p->type > posting.type) {
1091 fread(&posting, (int) sizeof(posting), 1, file);
1092 set2c++;
1093 }
1094 else { /* identical postings */
1095 set1c++;
1096 set1p++;
1097 fread(&posting, (int) sizeof(posting), 1, file);
1098 set2c++;
1099 }
1100 }
1101 newsetc += numitems - set1c;
1102 while (set1c++ < numitems) {
1103 *newsetp++ = *set1p++;
1104 }
1105 break; /* end of NOT */
1106
1107 case REVERSENOT: /* core NOT incoming set */
1108 for (set1c = 0, set2c = 0; set1c < numitems && set2c < *num; ) {
1109 if (set1p->lineoffset < posting.lineoffset) {
1110 set1p++;
1111 set1c++;
1112 }
1113 else if (set1p->lineoffset > posting.lineoffset) {
1114 *newsetp++ = posting;
1115 fread(&posting, (int) sizeof(posting), 1, file);
1116 set2c++;
1117 }
1118 else if (set1p->type < posting.type) {
1119 set1p++;
1120 set1c++;
1121 }
1122 else if (set1p->type > posting.type) {
1123 *newsetp++ = posting;
1124 fread(&posting, (int) sizeof(posting), 1, file);
1125 set2c++;
1126 }
1127 else { /* identical postings */
1128 set1c++;
1129 set1p++;
1130 fread(&posting, (int) sizeof(posting), 1, file);
1131 set2c++;
1132 }
1133 }
1134 while (set2c++ < *num) {
1135 *newsetp++ = posting;
1136 newsetc++;
1137 fread(&posting, (int) sizeof(posting), 1, file);
1138 }
1139 item = newitem;
1140 break; /* end of REVERSENOT */
1141 #endif
1142 }
1143 numitems = newsetc;
1144 *num = newsetc;
1145 enditem = (POSTING *) newsetp;
1146 return((POSTING *) item);
1147 }
1148
1149 #if 0
1150 POSTING *
1151 boolsave(int clear) /* flag about whether to clear core */
1152 {
1153 int i;
1154 POSTING *ptr;
1155 POSTING *oldstuff, *newstuff;
1156
1157 if (numitems == 0) {
1158 if (clear)
1159 boolclear();
1160 return(NULL);
1161 }
1162 /* if clear then give them what we have and use boolready to realloc */
1163 if (clear) {
1164 ptr = item;
1165 /* free up the space we didn't give them */
1166 if (item == item1)
1167 item1 = NULL;
1168 else
1169 item2 = NULL;
1170 boolready();
1171 return(ptr);
1172 }
1173 i = (enditem - item) * sizeof(*ptr) + 100;
1174 if ((ptr = malloc(i)) == NULL) {
1175 invcannotalloc(i);
1176 return(ptr);
1177 }
1178 /* move present set into place */
1179 oldstuff = item;
1180 newstuff = ptr;
1181 while (oldstuff < enditem)
1182 *newstuff++ = *oldstuff++;
1183 return(ptr);
1184 }
1185 #endif
1186
1187 static void
invcannotalloc(unsigned n)1188 invcannotalloc(unsigned n)
1189 {
1190 fprintf(stderr, "%s: cannot allocate %u bytes\n", argv0, n);
1191 }
1192
1193 static void
invcannotopen(char * file)1194 invcannotopen(char *file)
1195 {
1196 fprintf(stderr, "%s: cannot open file %s\n", argv0, file);
1197 }
1198
1199 static void
invcannotwrite(char * file)1200 invcannotwrite(char *file)
1201 {
1202 perror(argv0); /* must be first to preserve errno */
1203 fprintf(stderr, "%s: write to file %s failed\n", argv0, file);
1204 }
1205