1 #ifndef lint 2 static char sccsid[] = "@(#)invert.c 2.7 05/27/93"; 3 #endif not lint 4 # 5 /* input: records of lines, separated by blank lines 6 output: key:file1 start/length ... start/length:file2 start/length ... 7 */ 8 9 # include "stdio.h" 10 # include "streams.h" 11 # include "bib.h" 12 # define makelow(c) ('A'<=(c) && (c)<='Z' ? (c)-'A'+'a' : c) 13 14 int max_kcnt = 100; /* max number of keys */ 15 int max_klen = 6; /* max length of keys */ 16 char *ignore = /* string of line starts to ignore */ 17 "CNOPVX"; 18 char *INDEX= /* name of output file */ 19 INDXFILE; 20 21 char *bibtmpfile = /* name of temporary file */ 22 INVTEMPFILE; 23 24 int silent = 0; /* 0 => statistics printed */ 25 /* 1 => no statisitics printed */ 26 27 char *sort_it = 28 "sort -u +0 -1 +1 -2 +2n -3 +3n %s -o %s"; 29 char sortcmd[maxstr]; 30 31 int argc; 32 char **argv; 33 34 main(argcount,arglist) 35 int argcount; 36 char **arglist; 37 { char *filename; 38 FILE *input, *output; 39 long int start,length; 40 char word[maxstr]; 41 int kcnt; 42 char tag_line[maxstr]; 43 int bol = 1; /* at beginning of line */ 44 45 long int records = 0; /* number of records read */ 46 long int keys = 0; /* number of keys read (occurences) */ 47 long int distinct; /* number of distinct keys */ 48 long int shorten(); 49 50 InitDirectory(BMACLIB,N_BMACLIB); 51 InitDirectory(COMFILE,N_COMFILE); 52 53 argc= argcount-1; 54 argv= arglist+1; 55 mktemp(bibtmpfile); 56 output= fopen(bibtmpfile,"w"); 57 58 for ( flags() ; argc>0 ; argc--, argv++ ,flags() ) 59 { /* open input file */ 60 filename= *argv; 61 input= fopen(filename,"r"); 62 if (input==NULL) 63 { fprintf(stderr,"invert: error in open of %s\n", filename); 64 continue; 65 } 66 start= 0L; 67 length= 0L; 68 69 for(;;) /* each record */ { 70 /* find start of next record (exit if none) */ 71 start= nextrecord(input,start+length); 72 if (start==EOF) break; 73 records++; 74 kcnt= 0; 75 length= recsize(input,start); 76 sprintf(tag_line, " %s %d %d\n", filename, start, length); 77 78 while (ftell(input) < start+length && kcnt < max_kcnt) { 79 getword(input,word,ignore,&bol); 80 makekey(word,max_klen,COMFILE); 81 if (*word != NULL) { 82 fputs(word,output); fputs(tag_line,output); 83 kcnt++; keys++; 84 } 85 } 86 } 87 fclose(input); 88 } 89 fclose(output); 90 91 sprintf(sortcmd, sort_it, bibtmpfile, bibtmpfile); 92 system(sortcmd); 93 94 distinct = shorten(bibtmpfile,INDEX); 95 if( silent == 0 ) 96 fprintf(stderr, 97 "%d documents %d distinct keys %d key occurrences\n", 98 records, distinct, keys); 99 exit(0); 100 } 101 102 103 104 /* Flag Meaning Default 105 -ki Keys per record 100 106 -li max Length of keys 6 107 -%str ignore lines that begin with %x CNOPVX 108 where x is in str 109 str is a seq of chars 110 -cfile file contains Common words /usr/new/lib/bib/common 111 do not use common words as keys 112 -pfile name of output file INDEX 113 -s do not print statistics statistics printed 114 */ 115 116 # define operand (strlen(*argv+2)==0 ? (argv++,argc--,*argv) : *argv+2) 117 118 flags() 119 { 120 char *p; 121 for (; argc>0 && *argv[0]=='-'; argc--,argv++) 122 { switch ((*argv)[1]) 123 { case 'k': max_kcnt= atoi(operand); 124 break; 125 case 'l': max_klen= atoi(operand); 126 break; 127 case 'c': strcpy(COMFILE,operand); 128 break; 129 case '%': ignore= *argv+2; 130 break; 131 case 'p': INDEX= operand; 132 break; 133 case 's': silent= 1; 134 break; 135 case 'd': 136 p = &argv[0][2]; 137 if (!p) { 138 argv++; 139 p = &argv[0][0]; 140 } 141 strreplace(COMFILE, BMACLIB, p); 142 strcpy(BMACLIB, p); 143 break; 144 default: fprintf(stderr,"unknown flag '%s'\n", *argv); 145 } 146 } 147 } 148 149 150 /* shorten(inf,outf): file "inf" consists of lines of the form: 151 key file start length 152 sorted by key and file. replace lines with the same key 153 with one line of the form: 154 key:file1 start/length ... start/length:file2 start/length ... 155 rename as file "outf" 156 returns number of lines in output 157 */ 158 long shorten(inf,outf) 159 char *inf, *outf; 160 { FILE *in, *out; 161 char line[maxstr]; 162 char key[maxstr], newkey[maxstr], 163 file[maxstr], newfile[maxstr]; 164 long int start, length; 165 long int lines = 0; 166 167 in= fopen(inf, "r"); 168 out= fopen(outf, "w"); 169 if (in==NULL || out==NULL) 170 { fprintf(stderr,"invert: error in opening file for compression\n"); 171 return(0); 172 } 173 174 getline(in,line); 175 sscanf(line,"%s%s%d%d", key, file, &start, &length); 176 fprintf(out, "%s :%s %d/%d", key, file, start, length); 177 for ( getline(in, line) ; !feof(in); getline(in, line)) 178 { sscanf(line,"%s%s%d%d", newkey, newfile, &start, &length); 179 if (strcmp(key,newkey)!=0) 180 { strcpy(key, newkey); 181 strcpy(file, newfile); 182 fprintf(out, "\n%s :%s %d/%d", key, file, start, length); 183 lines++; 184 } 185 else if (strcmp(file,newfile)!=0) 186 { strcpy(file,newfile); 187 fprintf(out, ":%s %d/%d", file, start, length); 188 } 189 else 190 fprintf(out, " %d/%d", start, length); 191 } 192 fprintf(out, "\n"); 193 lines++; 194 195 fclose(in); fclose(out); 196 unlink(inf); 197 return (lines); 198 } 199