#ifndef lint
/* SCCS identification string; left out when the file is checked with lint. */
static char sccsid[] = "@(#)invert.c 2.7 05/27/93";
#endif /* not lint */
4 #
/* input: records of lines, separated by blank lines
   output: key :file1 start/length ... start/length:file2 start/length ...
*/
8
9 # include "stdio.h"
10 # include "streams.h"
11 # include "bib.h"
/* makelow(c): c folded to lower case when it is an ASCII upper-case letter,
   otherwise c unchanged.  c is evaluated more than once. */
# define makelow(c) ('A'<=(c) && (c)<='Z' ? (c)-'A'+'a' : (c))
13
14 int max_kcnt = 100; /* max number of keys */
15 int max_klen = 6; /* max length of keys */
16 char *ignore = /* string of line starts to ignore */
17 "CNOPVX";
18 char *INDEX= /* name of output file */
19 INDXFILE;
20
21 char *bibtmpfile = /* name of temporary file */
22 INVTEMPFILE;
23
24 int silent = 0; /* 0 => statistics printed */
25 /* 1 => no statisitics printed */
26
27 char *sort_it =
28 "sort -u +0 -1 +1 -2 +2n -3 +3n %s -o %s";
29 char sortcmd[maxstr];
30
31 int argc;
32 char **argv;
33
main(argcount,arglist)34 main(argcount,arglist)
35 int argcount;
36 char **arglist;
37 { char *filename;
38 FILE *input, *output;
39 long int start,length;
40 char word[maxstr];
41 int kcnt;
42 char tag_line[maxstr];
43 int bol = 1; /* at beginning of line */
44
45 long int records = 0; /* number of records read */
46 long int keys = 0; /* number of keys read (occurences) */
47 long int distinct; /* number of distinct keys */
48 long int shorten();
49
50 InitDirectory(BMACLIB,N_BMACLIB);
51 InitDirectory(COMFILE,N_COMFILE);
52
53 argc= argcount-1;
54 argv= arglist+1;
55 mktemp(bibtmpfile);
56 output= fopen(bibtmpfile,"w");
57
58 for ( flags() ; argc>0 ; argc--, argv++ ,flags() )
59 { /* open input file */
60 filename= *argv;
61 input= fopen(filename,"r");
62 if (input==NULL)
63 { fprintf(stderr,"invert: error in open of %s\n", filename);
64 continue;
65 }
66 start= 0L;
67 length= 0L;
68
69 for(;;) /* each record */ {
70 /* find start of next record (exit if none) */
71 start= nextrecord(input,start+length);
72 if (start==EOF) break;
73 records++;
74 kcnt= 0;
75 length= recsize(input,start);
76 sprintf(tag_line, " %s %d %d\n", filename, start, length);
77
78 while (ftell(input) < start+length && kcnt < max_kcnt) {
79 getword(input,word,ignore,&bol);
80 makekey(word,max_klen,COMFILE);
81 if (*word != NULL) {
82 fputs(word,output); fputs(tag_line,output);
83 kcnt++; keys++;
84 }
85 }
86 }
87 fclose(input);
88 }
89 fclose(output);
90
91 sprintf(sortcmd, sort_it, bibtmpfile, bibtmpfile);
92 system(sortcmd);
93
94 distinct = shorten(bibtmpfile,INDEX);
95 if( silent == 0 )
96 fprintf(stderr,
97 "%d documents %d distinct keys %d key occurrences\n",
98 records, distinct, keys);
99 exit(0);
100 }
101
102
103
104 /* Flag Meaning Default
105 -ki Keys per record 100
106 -li max Length of keys 6
107 -%str ignore lines that begin with %x CNOPVX
108 where x is in str
109 str is a seq of chars
110 -cfile file contains Common words /usr/new/lib/bib/common
111 do not use common words as keys
112 -pfile name of output file INDEX
113 -s do not print statistics statistics printed
114 */
115
116 # define operand (strlen(*argv+2)==0 ? (argv++,argc--,*argv) : *argv+2)
117
flags()118 flags()
119 {
120 char *p;
121 for (; argc>0 && *argv[0]=='-'; argc--,argv++)
122 { switch ((*argv)[1])
123 { case 'k': max_kcnt= atoi(operand);
124 break;
125 case 'l': max_klen= atoi(operand);
126 break;
127 case 'c': strcpy(COMFILE,operand);
128 break;
129 case '%': ignore= *argv+2;
130 break;
131 case 'p': INDEX= operand;
132 break;
133 case 's': silent= 1;
134 break;
135 case 'd':
136 p = &argv[0][2];
137 if (!p) {
138 argv++;
139 p = &argv[0][0];
140 }
141 strreplace(COMFILE, BMACLIB, p);
142 strcpy(BMACLIB, p);
143 break;
144 default: fprintf(stderr,"unknown flag '%s'\n", *argv);
145 }
146 }
147 }
148
149
150 /* shorten(inf,outf): file "inf" consists of lines of the form:
151 key file start length
152 sorted by key and file. replace lines with the same key
153 with one line of the form:
154 key:file1 start/length ... start/length:file2 start/length ...
155 rename as file "outf"
156 returns number of lines in output
157 */
shorten(inf,outf)158 long shorten(inf,outf)
159 char *inf, *outf;
160 { FILE *in, *out;
161 char line[maxstr];
162 char key[maxstr], newkey[maxstr],
163 file[maxstr], newfile[maxstr];
164 long int start, length;
165 long int lines = 0;
166
167 in= fopen(inf, "r");
168 out= fopen(outf, "w");
169 if (in==NULL || out==NULL)
170 { fprintf(stderr,"invert: error in opening file for compression\n");
171 return(0);
172 }
173
174 getline(in,line);
175 sscanf(line,"%s%s%d%d", key, file, &start, &length);
176 fprintf(out, "%s :%s %d/%d", key, file, start, length);
177 for ( getline(in, line) ; !feof(in); getline(in, line))
178 { sscanf(line,"%s%s%d%d", newkey, newfile, &start, &length);
179 if (strcmp(key,newkey)!=0)
180 { strcpy(key, newkey);
181 strcpy(file, newfile);
182 fprintf(out, "\n%s :%s %d/%d", key, file, start, length);
183 lines++;
184 }
185 else if (strcmp(file,newfile)!=0)
186 { strcpy(file,newfile);
187 fprintf(out, ":%s %d/%d", file, start, length);
188 }
189 else
190 fprintf(out, " %d/%d", start, length);
191 }
192 fprintf(out, "\n");
193 lines++;
194
195 fclose(in); fclose(out);
196 unlink(inf);
197 return (lines);
198 }
199