#ifndef lint
/* SCCS identification string, omitted when the file is processed with lint defined. */
static char sccsid[] = "@(#)invert.c 2.7 05/27/93";
#endif /* not lint */
4ae10e56bSgarrison #
/*  input:  records of lines, separated by blank lines
    output: key:file1 start/length ... start/length:file2 start/length ...
*/
8a2c03686Sgarrison
9a2c03686Sgarrison # include "stdio.h"
10a2c03686Sgarrison # include "streams.h"
11a2c03686Sgarrison # include "bib.h"
/* makelow(c): map an ASCII upper-case letter to lower case; any other value
   is returned unchanged.  The argument is evaluated more than once, so avoid
   arguments with non-idempotent side effects (e.g. c++). */
# define makelow(c) ('A'<=(c) && (c)<='Z' ? (c)-'A'+'a' : (c))
13a2c03686Sgarrison
14a2c03686Sgarrison int max_kcnt = 100; /* max number of keys */
15a2c03686Sgarrison int max_klen = 6; /* max length of keys */
16a2c03686Sgarrison char *ignore = /* string of line starts to ignore */
17a2c03686Sgarrison "CNOPVX";
18a2c03686Sgarrison char *INDEX= /* name of output file */
19a2c03686Sgarrison INDXFILE;
20a2c03686Sgarrison
2177e364ccSgarrison char *bibtmpfile = /* name of temporary file */
22a2c03686Sgarrison INVTEMPFILE;
23a2c03686Sgarrison
24a2c03686Sgarrison int silent = 0; /* 0 => statistics printed */
25a2c03686Sgarrison /* 1 => no statisitics printed */
26a2c03686Sgarrison
27a2c03686Sgarrison char *sort_it =
28a2c03686Sgarrison "sort -u +0 -1 +1 -2 +2n -3 +3n %s -o %s";
29a2c03686Sgarrison char sortcmd[maxstr];
30a2c03686Sgarrison
31a2c03686Sgarrison int argc;
32a2c03686Sgarrison char **argv;
33a2c03686Sgarrison
main(argcount,arglist)34a2c03686Sgarrison main(argcount,arglist)
35a2c03686Sgarrison int argcount;
36a2c03686Sgarrison char **arglist;
37a2c03686Sgarrison { char *filename;
38a2c03686Sgarrison FILE *input, *output;
39a2c03686Sgarrison long int start,length;
40a2c03686Sgarrison char word[maxstr];
41a2c03686Sgarrison int kcnt;
42a2c03686Sgarrison char tag_line[maxstr];
43*99c1622cSbostic int bol = 1; /* at beginning of line */
44a2c03686Sgarrison
45a2c03686Sgarrison long int records = 0; /* number of records read */
46a2c03686Sgarrison long int keys = 0; /* number of keys read (occurences) */
47a2c03686Sgarrison long int distinct; /* number of distinct keys */
48a2c03686Sgarrison long int shorten();
49a2c03686Sgarrison
50*99c1622cSbostic InitDirectory(BMACLIB,N_BMACLIB);
51*99c1622cSbostic InitDirectory(COMFILE,N_COMFILE);
526f5ac60cSrrh
53a2c03686Sgarrison argc= argcount-1;
54a2c03686Sgarrison argv= arglist+1;
5577e364ccSgarrison mktemp(bibtmpfile);
5677e364ccSgarrison output= fopen(bibtmpfile,"w");
57a2c03686Sgarrison
58a2c03686Sgarrison for ( flags() ; argc>0 ; argc--, argv++ ,flags() )
59a2c03686Sgarrison { /* open input file */
60a2c03686Sgarrison filename= *argv;
61a2c03686Sgarrison input= fopen(filename,"r");
62a2c03686Sgarrison if (input==NULL)
63a2c03686Sgarrison { fprintf(stderr,"invert: error in open of %s\n", filename);
64a2c03686Sgarrison continue;
65a2c03686Sgarrison }
66a2c03686Sgarrison start= 0L;
67a2c03686Sgarrison length= 0L;
68a2c03686Sgarrison
69*99c1622cSbostic for(;;) /* each record */ {
70*99c1622cSbostic /* find start of next record (exit if none) */
71a2c03686Sgarrison start= nextrecord(input,start+length);
72a2c03686Sgarrison if (start==EOF) break;
73a2c03686Sgarrison records++;
74a2c03686Sgarrison kcnt= 0;
75a2c03686Sgarrison length= recsize(input,start);
76ae10e56bSgarrison sprintf(tag_line, " %s %d %d\n", filename, start, length);
77a2c03686Sgarrison
78*99c1622cSbostic while (ftell(input) < start+length && kcnt < max_kcnt) {
79*99c1622cSbostic getword(input,word,ignore,&bol);
80*99c1622cSbostic makekey(word,max_klen,COMFILE);
81*99c1622cSbostic if (*word != NULL) {
82*99c1622cSbostic fputs(word,output); fputs(tag_line,output);
83a2c03686Sgarrison kcnt++; keys++;
84a2c03686Sgarrison }
85a2c03686Sgarrison }
86a2c03686Sgarrison }
87a2c03686Sgarrison fclose(input);
88a2c03686Sgarrison }
89a2c03686Sgarrison fclose(output);
90a2c03686Sgarrison
9177e364ccSgarrison sprintf(sortcmd, sort_it, bibtmpfile, bibtmpfile);
92a2c03686Sgarrison system(sortcmd);
93a2c03686Sgarrison
9477e364ccSgarrison distinct = shorten(bibtmpfile,INDEX);
95a2c03686Sgarrison if( silent == 0 )
96a2c03686Sgarrison fprintf(stderr,
97ae10e56bSgarrison "%d documents %d distinct keys %d key occurrences\n",
98a2c03686Sgarrison records, distinct, keys);
99494e4512Sralph exit(0);
100a2c03686Sgarrison }
101a2c03686Sgarrison
102a2c03686Sgarrison
103a2c03686Sgarrison
104a2c03686Sgarrison /* Flag Meaning Default
105a2c03686Sgarrison -ki Keys per record 100
106a2c03686Sgarrison -li max Length of keys 6
107a2c03686Sgarrison -%str ignore lines that begin with %x CNOPVX
108a2c03686Sgarrison where x is in str
109a2c03686Sgarrison str is a seq of chars
11029d940b6Smckusick -cfile file contains Common words /usr/new/lib/bib/common
111a2c03686Sgarrison do not use common words as keys
112a2c03686Sgarrison -pfile name of output file INDEX
113a2c03686Sgarrison -s do not print statistics statistics printed
114a2c03686Sgarrison */
115a2c03686Sgarrison
116a2c03686Sgarrison # define operand (strlen(*argv+2)==0 ? (argv++,argc--,*argv) : *argv+2)
117a2c03686Sgarrison
flags()118a2c03686Sgarrison flags()
1196f5ac60cSrrh {
1206f5ac60cSrrh char *p;
1216f5ac60cSrrh for (; argc>0 && *argv[0]=='-'; argc--,argv++)
122a2c03686Sgarrison { switch ((*argv)[1])
123a2c03686Sgarrison { case 'k': max_kcnt= atoi(operand);
124a2c03686Sgarrison break;
125a2c03686Sgarrison case 'l': max_klen= atoi(operand);
126a2c03686Sgarrison break;
127*99c1622cSbostic case 'c': strcpy(COMFILE,operand);
128a2c03686Sgarrison break;
129a2c03686Sgarrison case '%': ignore= *argv+2;
130a2c03686Sgarrison break;
131a2c03686Sgarrison case 'p': INDEX= operand;
132a2c03686Sgarrison break;
133a2c03686Sgarrison case 's': silent= 1;
134a2c03686Sgarrison break;
1356f5ac60cSrrh case 'd':
1366f5ac60cSrrh p = &argv[0][2];
1376f5ac60cSrrh if (!p) {
1386f5ac60cSrrh argv++;
1396f5ac60cSrrh p = &argv[0][0];
1406f5ac60cSrrh }
1416f5ac60cSrrh strreplace(COMFILE, BMACLIB, p);
1426f5ac60cSrrh strcpy(BMACLIB, p);
1436f5ac60cSrrh break;
144a2c03686Sgarrison default: fprintf(stderr,"unknown flag '%s'\n", *argv);
145a2c03686Sgarrison }
146a2c03686Sgarrison }
147a2c03686Sgarrison }
148a2c03686Sgarrison
149a2c03686Sgarrison
150a2c03686Sgarrison /* shorten(inf,outf): file "inf" consists of lines of the form:
151a2c03686Sgarrison key file start length
152a2c03686Sgarrison sorted by key and file. replace lines with the same key
153a2c03686Sgarrison with one line of the form:
154a2c03686Sgarrison key:file1 start/length ... start/length:file2 start/length ...
155a2c03686Sgarrison rename as file "outf"
156a2c03686Sgarrison returns number of lines in output
157a2c03686Sgarrison */
shorten(inf,outf)158a2c03686Sgarrison long shorten(inf,outf)
159a2c03686Sgarrison char *inf, *outf;
160a2c03686Sgarrison { FILE *in, *out;
161a2c03686Sgarrison char line[maxstr];
162a2c03686Sgarrison char key[maxstr], newkey[maxstr],
163a2c03686Sgarrison file[maxstr], newfile[maxstr];
164a2c03686Sgarrison long int start, length;
165a2c03686Sgarrison long int lines = 0;
166a2c03686Sgarrison
167a2c03686Sgarrison in= fopen(inf, "r");
168a2c03686Sgarrison out= fopen(outf, "w");
169a2c03686Sgarrison if (in==NULL || out==NULL)
170a2c03686Sgarrison { fprintf(stderr,"invert: error in opening file for compression\n");
171a2c03686Sgarrison return(0);
172a2c03686Sgarrison }
173a2c03686Sgarrison
174a2c03686Sgarrison getline(in,line);
175ae10e56bSgarrison sscanf(line,"%s%s%d%d", key, file, &start, &length);
176ae10e56bSgarrison fprintf(out, "%s :%s %d/%d", key, file, start, length);
177a2c03686Sgarrison for ( getline(in, line) ; !feof(in); getline(in, line))
178ae10e56bSgarrison { sscanf(line,"%s%s%d%d", newkey, newfile, &start, &length);
179a2c03686Sgarrison if (strcmp(key,newkey)!=0)
180a2c03686Sgarrison { strcpy(key, newkey);
181a2c03686Sgarrison strcpy(file, newfile);
182ae10e56bSgarrison fprintf(out, "\n%s :%s %d/%d", key, file, start, length);
183a2c03686Sgarrison lines++;
184a2c03686Sgarrison }
185a2c03686Sgarrison else if (strcmp(file,newfile)!=0)
186a2c03686Sgarrison { strcpy(file,newfile);
187ae10e56bSgarrison fprintf(out, ":%s %d/%d", file, start, length);
188a2c03686Sgarrison }
189a2c03686Sgarrison else
190ae10e56bSgarrison fprintf(out, " %d/%d", start, length);
191a2c03686Sgarrison }
192a2c03686Sgarrison fprintf(out, "\n");
193a2c03686Sgarrison lines++;
194a2c03686Sgarrison
195a2c03686Sgarrison fclose(in); fclose(out);
196a2c03686Sgarrison unlink(inf);
197a2c03686Sgarrison return (lines);
198a2c03686Sgarrison }
199