1 #ifndef lint 2 static char sccsid[] = "@(#)invert.c 2.6 03/05/87"; 3 #endif not lint 4 # 5 /* input: records of lines, separated by blank lines 6 output: key:file1 start/length ... start/length:file2 start/length ... 7 */ 8 9 # include "stdio.h" 10 # include "streams.h" 11 # include "bib.h" 12 # define isnull(x) (*(x) == NULL) 13 # define makelow(c) ('A'<=(c) && (c)<='Z' ? (c)-'A'+'a' : c) 14 15 int max_kcnt = 100; /* max number of keys */ 16 int max_klen = 6; /* max length of keys */ 17 char *ignore = /* string of line starts to ignore */ 18 "CNOPVX"; 19 char *common = /* name of file of common words */ 20 COMFILE; 21 char *INDEX= /* name of output file */ 22 INDXFILE; 23 24 char *bibtmpfile = /* name of temporary file */ 25 INVTEMPFILE; 26 27 int silent = 0; /* 0 => statistics printed */ 28 /* 1 => no statisitics printed */ 29 30 char *sort_it = 31 "sort -u +0 -1 +1 -2 +2n -3 +3n %s -o %s"; 32 char sortcmd[maxstr]; 33 34 int argc; 35 char **argv; 36 37 main(argcount,arglist) 38 int argcount; 39 char **arglist; 40 { char *filename; 41 FILE *input, *output; 42 long int start,length; 43 char word[maxstr]; 44 int kcnt; 45 char tag_line[maxstr]; 46 47 long int records = 0; /* number of records read */ 48 long int keys = 0; /* number of keys read (occurences) */ 49 long int distinct; /* number of distinct keys */ 50 long int shorten(); 51 52 strcpy(COMFILE, N_COMFILE); 53 strcpy(BMACLIB, N_BMACLIB); 54 55 argc= argcount-1; 56 argv= arglist+1; 57 mktemp(bibtmpfile); 58 output= fopen(bibtmpfile,"w"); 59 60 for ( flags() ; argc>0 ; argc--, argv++ ,flags() ) 61 { /* open input file */ 62 filename= *argv; 63 input= fopen(filename,"r"); 64 if (input==NULL) 65 { fprintf(stderr, "invert: error in open of %s\n", filename); 66 continue; 67 } 68 start= 0L; 69 length= 0L; 70 71 for(;;) /* each record */ 72 { /* find start of next record (exit if none) */ 73 start= nextrecord(input,start+length); 74 if (start==EOF) break; 75 records++; 76 kcnt= 0; 77 length= recsize(input,start); 78 sprintf(tag_line, " %s %d %d\n", filename, start, length); 79 80 while (ftell(input) < start+length && kcnt < max_kcnt) 81 { getword(input,word,ignore); 82 makekey(word,max_klen,common); 83 if (!isnull(word)) 84 { fputs(word,output); fputs(tag_line,output); 85 kcnt++; keys++; 86 } 87 } 88 } 89 fclose(input); 90 } 91 fclose(output); 92 93 sprintf(sortcmd, sort_it, bibtmpfile, bibtmpfile); 94 system(sortcmd); 95 96 distinct = shorten(bibtmpfile,INDEX); 97 if( silent == 0 ) 98 fprintf(stderr, 99 "%d documents %d distinct keys %d key occurrences\n", 100 records, distinct, keys); 101 exit(0); 102 } 103 104 105 106 /* Flag Meaning Default 107 -ki Keys per record 100 108 -li max Length of keys 6 109 -%str ignore lines that begin with %x CNOPVX 110 where x is in str 111 str is a seq of chars 112 -cfile file contains Common words /usr/new/lib/bib/common 113 do not use common words as keys 114 -pfile name of output file INDEX 115 -s do not print statistics statistics printed 116 */ 117 118 # define operand (strlen(*argv+2)==0 ? (argv++,argc--,*argv) : *argv+2) 119 120 flags() 121 { 122 char *p; 123 for (; argc>0 && *argv[0]=='-'; argc--,argv++) 124 { switch ((*argv)[1]) 125 { case 'k': max_kcnt= atoi(operand); 126 break; 127 case 'l': max_klen= atoi(operand); 128 break; 129 case 'c': common= operand; 130 break; 131 case '%': ignore= *argv+2; 132 break; 133 case 'p': INDEX= operand; 134 break; 135 case 's': silent= 1; 136 break; 137 case 'd': 138 p = &argv[0][2]; 139 if (!p) { 140 argv++; 141 p = &argv[0][0]; 142 } 143 strreplace(COMFILE, BMACLIB, p); 144 strcpy(BMACLIB, p); 145 break; 146 default: fprintf(stderr, "unknown flag '%s'\n", *argv); 147 } 148 } 149 } 150 151 152 /* shorten(inf,outf): file "inf" consists of lines of the form: 153 key file start length 154 sorted by key and file. replace lines with the same key 155 with one line of the form: 156 key:file1 start/length ... start/length:file2 start/length ... 157 rename as file "outf" 158 returns number of lines in output 159 */ 160 long shorten(inf,outf) 161 char *inf, *outf; 162 { FILE *in, *out; 163 char line[maxstr]; 164 char key[maxstr], newkey[maxstr], 165 file[maxstr], newfile[maxstr]; 166 long int start, length; 167 long int lines = 0; 168 169 in= fopen(inf, "r"); 170 out= fopen(outf, "w"); 171 if (in==NULL || out==NULL) 172 { fprintf(stderr, "invert: error in opening file for compression\n"); 173 return(0); 174 } 175 176 getline(in,line); 177 sscanf(line,"%s%s%d%d", key, file, &start, &length); 178 fprintf(out, "%s :%s %d/%d", key, file, start, length); 179 for ( getline(in, line) ; !feof(in); getline(in, line)) 180 { sscanf(line,"%s%s%d%d", newkey, newfile, &start, &length); 181 if (strcmp(key,newkey)!=0) 182 { strcpy(key, newkey); 183 strcpy(file, newfile); 184 fprintf(out, "\n%s :%s %d/%d", key, file, start, length); 185 lines++; 186 } 187 else if (strcmp(file,newfile)!=0) 188 { strcpy(file,newfile); 189 fprintf(out, ":%s %d/%d", file, start, length); 190 } 191 else 192 fprintf(out, " %d/%d", start, length); 193 } 194 fprintf(out, "\n"); 195 lines++; 196 197 fclose(in); fclose(out); 198 unlink(inf); 199 return (lines); 200 } 201