xref: /original-bsd/contrib/bib/src/invert.c (revision 542201aa)
/* SCCS identification string; left out when the file is run through lint. */
#ifndef lint
static char sccsid[] = "@(#)invert.c	2.6	03/05/87";
#endif /* not lint */
4 #
5 /*  input:  records of lines, separated by blank lines
6     output: key:file1 start/length ... start/length:file2 start/length ...
7 */
8 
9 # include "stdio.h"
10 # include "streams.h"
11 # include "bib.h"
/*  isnull(x):  true when the string x is empty (its first char is NUL).
    makelow(c): lower-case equivalent of an ASCII upper-case letter; any
                other value is returned unchanged.  The argument is
                evaluated more than once, so it must be free of side
                effects that may not be repeated.                       */
# define isnull(x)  (*(x) == '\0')
# define makelow(c) ('A'<=(c) && (c)<='Z' ? (c)-'A'+'a' : (c))
14 
15 int     max_kcnt = 100;     /*  max number of keys                      */
16 int     max_klen =   6;     /*  max length of keys                      */
17 char    *ignore =           /*  string of line starts to ignore         */
18             "CNOPVX";
19 char    *common =           /*  name of file of common words            */
20             COMFILE;
21 char    *INDEX=             /*  name of output file                     */
22             INDXFILE;
23 
24 char    *bibtmpfile =          /*  name of temporary file                  */
25             INVTEMPFILE;
26 
27 int	silent = 0;	    /*  0 => statistics printed			*/
28 			    /*  1 => no statisitics printed		*/
29 
30 char *sort_it =
31         "sort -u +0 -1 +1 -2 +2n -3 +3n %s -o %s";
32 char sortcmd[maxstr];
33 
34 int     argc;
35 char    **argv;
36 
37 main(argcount,arglist)
38 int argcount;
39 char **arglist;
40 {   char            *filename;
41     FILE            *input, *output;
42     long int        start,length;
43     char            word[maxstr];
44     int             kcnt;
45     char            tag_line[maxstr];
46 
47     long int	    records = 0;  /*  number of records read           */
48     long int	    keys    = 0;  /*  number of keys read (occurences) */
49     long int	    distinct;     /*  number of distinct keys          */
50     long int	    shorten();
51 
52     strcpy(COMFILE, N_COMFILE);
53     strcpy(BMACLIB, N_BMACLIB);
54 
55     argc= argcount-1;
56     argv= arglist+1;
57     mktemp(bibtmpfile);
58     output= fopen(bibtmpfile,"w");
59 
60     for ( flags() ; argc>0 ; argc--, argv++ ,flags() )
61     {   /* open input file              */
62             filename=   *argv;
63             input=      fopen(filename,"r");
64             if (input==NULL)
65             {   fprintf(stderr, "invert: error in open of %s\n", filename);
66                 continue;
67             }
68             start=      0L;
69             length=     0L;
70 
71         for(;;) /* each record  */
72         {   /* find start of next record (exit if none)     */
73                 start= nextrecord(input,start+length);
74                 if (start==EOF)   break;
75             records++;
76 	    kcnt= 0;
77             length= recsize(input,start);
78             sprintf(tag_line, " %s %d %d\n", filename, start, length);
79 
80             while (ftell(input) < start+length && kcnt < max_kcnt)
81             {   getword(input,word,ignore);
82                 makekey(word,max_klen,common);
83                 if (!isnull(word))
84                 {   fputs(word,output); fputs(tag_line,output);
85                     kcnt++; keys++;
86                 }
87             }
88         }
89         fclose(input);
90     }
91     fclose(output);
92 
93     sprintf(sortcmd, sort_it, bibtmpfile, bibtmpfile);
94     system(sortcmd);
95 
96     distinct = shorten(bibtmpfile,INDEX);
97     if( silent == 0 )
98 	fprintf(stderr,
99 	    "%d documents   %d distinct keys  %d key occurrences\n",
100 	    records, distinct, keys);
101     exit(0);
102 }
103 
104 
105 
106 /*  Flag    Meaning                             Default
107     -ki     Keys per record                     100
108     -li     max Length of keys                  6
109     -%str   ignore lines that begin with %x     CNOPVX
110             where x is in str
111             str is a seq of chars
112     -cfile  file contains Common words          /usr/new/lib/bib/common
113             do not use common words as keys
114     -pfile  name of output file                 INDEX
115     -s	    do not print statistics		statistics printed
116 */
117 
118 # define    operand     (strlen(*argv+2)==0 ? (argv++,argc--,*argv) : *argv+2)
119 
120 flags()
121 {
122     char *p;
123     for (; argc>0 && *argv[0]=='-';  argc--,argv++)
124     {   switch ((*argv)[1])
125         {   case 'k':   max_kcnt= atoi(operand);
126                         break;
127             case 'l':   max_klen= atoi(operand);
128                         break;
129             case 'c':   common=  operand;
130                         break;
131             case '%':   ignore=  *argv+2;
132                         break;
133             case 'p':   INDEX=  operand;
134                         break;
135 	    case 's':	silent= 1;
136 			break;
137 	    case 'd':
138 		p = &argv[0][2];
139 		if (!p) {
140 			argv++;
141 			p = &argv[0][0];
142 		}
143 		strreplace(COMFILE, BMACLIB, p);
144 		strcpy(BMACLIB, p);
145 		break;
146             default:    fprintf(stderr, "unknown flag '%s'\n", *argv);
147         }
148     }
149 }
150 
151 
152 /*  shorten(inf,outf): file "inf" consists of lines of the form:
153         key file start length
154     sorted by key and file.  replace lines with the same key
155     with one line of the form:
156         key:file1 start/length ... start/length:file2 start/length ...
157     rename as file "outf"
158     returns number of lines in output
159 */
160 long shorten(inf,outf)
161 char *inf, *outf;
162 {   FILE *in, *out;
163     char line[maxstr];
164     char key[maxstr],  newkey[maxstr],
165          file[maxstr], newfile[maxstr];
166     long int start, length;
167     long int lines = 0;
168 
169     in=  fopen(inf, "r");
170     out= fopen(outf, "w");
171     if (in==NULL || out==NULL)
172     {   fprintf(stderr, "invert: error in opening file for compression\n");
173         return(0);
174     }
175 
176     getline(in,line);
177     sscanf(line,"%s%s%d%d", key, file, &start, &length);
178     fprintf(out, "%s :%s %d/%d", key, file, start, length);
179     for ( getline(in, line) ; !feof(in);  getline(in, line))
180     {   sscanf(line,"%s%s%d%d", newkey, newfile, &start, &length);
181         if (strcmp(key,newkey)!=0)
182         {   strcpy(key, newkey);
183             strcpy(file, newfile);
184             fprintf(out, "\n%s :%s %d/%d",  key, file, start, length);
185 	    lines++;
186         }
187         else if (strcmp(file,newfile)!=0)
188         {   strcpy(file,newfile);
189             fprintf(out, ":%s %d/%d", file, start, length);
190         }
191         else
192             fprintf(out, " %d/%d", start, length);
193     }
194     fprintf(out, "\n");
195     lines++;
196 
197     fclose(in); fclose(out);
198     unlink(inf);
199     return (lines);
200 }
201