xref: /original-bsd/contrib/bib/src/invert.c (revision 0cad3712)
/* SCCS identification string; omitted when the source is run through lint. */
#ifndef lint
static char sccsid[] = "@(#)invert.c	2.7	05/27/93";
#endif /* not lint */
4 #
/*  input:  records of lines, separated by blank lines
    output: one line per distinct key, of the form
            key :file1 start/length ... start/length:file2 start/length ...
*/
8 
9 # include "stdio.h"
10 # include "streams.h"
11 # include "bib.h"
/*  makelow(c): the lower-case form of c when c is an ASCII upper-case
    letter, otherwise c unchanged.  The argument is evaluated more than
    once, so avoid arguments with side effects that must happen only once. */
# define makelow(c) ('A'<=(c) && (c)<='Z' ? (c)-'A'+'a' : (c))
13 
14 int     max_kcnt = 100;     /*  max number of keys                      */
15 int     max_klen =   6;     /*  max length of keys                      */
16 char    *ignore =           /*  string of line starts to ignore         */
17             "CNOPVX";
18 char    *INDEX=             /*  name of output file                     */
19             INDXFILE;
20 
21 char    *bibtmpfile =          /*  name of temporary file                  */
22             INVTEMPFILE;
23 
24 int	silent = 0;	    /*  0 => statistics printed			*/
25 			    /*  1 => no statisitics printed		*/
26 
27 char *sort_it =
28         "sort -u +0 -1 +1 -2 +2n -3 +3n %s -o %s";
29 char sortcmd[maxstr];
30 
31 int     argc;
32 char    **argv;
33 
34 main(argcount,arglist)
35 int argcount;
36 char **arglist;
37 {   char            *filename;
38     FILE            *input, *output;
39     long int        start,length;
40     char            word[maxstr];
41     int             kcnt;
42     char            tag_line[maxstr];
43     int 	    bol = 1; /* at beginning of line */
44 
45     long int	    records = 0;  /*  number of records read           */
46     long int	    keys    = 0;  /*  number of keys read (occurences) */
47     long int	    distinct;     /*  number of distinct keys          */
48     long int	    shorten();
49 
50     InitDirectory(BMACLIB,N_BMACLIB);
51     InitDirectory(COMFILE,N_COMFILE);
52 
53     argc= argcount-1;
54     argv= arglist+1;
55     mktemp(bibtmpfile);
56     output= fopen(bibtmpfile,"w");
57 
58     for ( flags() ; argc>0 ; argc--, argv++ ,flags() )
59     {   /* open input file              */
60             filename=   *argv;
61             input=      fopen(filename,"r");
62             if (input==NULL)
63             {   fprintf(stderr,"invert: error in open of %s\n", filename);
64                 continue;
65             }
66       start=      0L;
67       length=     0L;
68 
69       for(;;) /* each record  */ {
70 	 /* find start of next record (exit if none)     */
71 	 start= nextrecord(input,start+length);
72 	 if (start==EOF)   break;
73 	 records++;
74 	 kcnt= 0;
75 	 length= recsize(input,start);
76 	 sprintf(tag_line, " %s %d %d\n", filename, start, length);
77 
78 	 while (ftell(input) < start+length && kcnt < max_kcnt) {
79 	    getword(input,word,ignore,&bol);
80 	    makekey(word,max_klen,COMFILE);
81 	    if (*word != NULL) {
82 	       fputs(word,output); fputs(tag_line,output);
83 	       kcnt++; keys++;
84 	       }
85 	    }
86 	 }
87        fclose(input);
88        }
89     fclose(output);
90 
91     sprintf(sortcmd, sort_it, bibtmpfile, bibtmpfile);
92     system(sortcmd);
93 
94     distinct = shorten(bibtmpfile,INDEX);
95     if( silent == 0 )
96 	fprintf(stderr,
97 	    "%d documents   %d distinct keys  %d key occurrences\n",
98 	    records, distinct, keys);
99     exit(0);
100 }
101 
102 
103 
104 /*  Flag    Meaning                             Default
105     -ki     Keys per record                     100
106     -li     max Length of keys                  6
107     -%str   ignore lines that begin with %x     CNOPVX
108             where x is in str
109             str is a seq of chars
110     -cfile  file contains Common words          /usr/new/lib/bib/common
111             do not use common words as keys
112     -pfile  name of output file                 INDEX
113     -s	    do not print statistics		statistics printed
114 */
115 
116 # define    operand     (strlen(*argv+2)==0 ? (argv++,argc--,*argv) : *argv+2)
117 
118 flags()
119 {
120     char *p;
121     for (; argc>0 && *argv[0]=='-';  argc--,argv++)
122     {   switch ((*argv)[1])
123         {   case 'k':   max_kcnt= atoi(operand);
124                         break;
125             case 'l':   max_klen= atoi(operand);
126                         break;
127             case 'c':   strcpy(COMFILE,operand);
128                         break;
129             case '%':   ignore=  *argv+2;
130                         break;
131             case 'p':   INDEX=  operand;
132                         break;
133 	    case 's':	silent= 1;
134 			break;
135 	    case 'd':
136 		p = &argv[0][2];
137 		if (!p) {
138 			argv++;
139 			p = &argv[0][0];
140 		}
141 		strreplace(COMFILE, BMACLIB, p);
142 		strcpy(BMACLIB, p);
143 		break;
144             default:    fprintf(stderr,"unknown flag '%s'\n", *argv);
145         }
146     }
147 }
148 
149 
150 /*  shorten(inf,outf): file "inf" consists of lines of the form:
151         key file start length
152     sorted by key and file.  replace lines with the same key
153     with one line of the form:
154         key:file1 start/length ... start/length:file2 start/length ...
155     rename as file "outf"
156     returns number of lines in output
157 */
158 long shorten(inf,outf)
159 char *inf, *outf;
160 {   FILE *in, *out;
161     char line[maxstr];
162     char key[maxstr],  newkey[maxstr],
163          file[maxstr], newfile[maxstr];
164     long int start, length;
165     long int lines = 0;
166 
167     in=  fopen(inf, "r");
168     out= fopen(outf, "w");
169     if (in==NULL || out==NULL)
170     {   fprintf(stderr,"invert: error in opening file for compression\n");
171         return(0);
172     }
173 
174     getline(in,line);
175     sscanf(line,"%s%s%d%d", key, file, &start, &length);
176     fprintf(out, "%s :%s %d/%d", key, file, start, length);
177     for ( getline(in, line) ; !feof(in);  getline(in, line))
178     {   sscanf(line,"%s%s%d%d", newkey, newfile, &start, &length);
179         if (strcmp(key,newkey)!=0)
180         {   strcpy(key, newkey);
181             strcpy(file, newfile);
182             fprintf(out, "\n%s :%s %d/%d",  key, file, start, length);
183 	    lines++;
184         }
185         else if (strcmp(file,newfile)!=0)
186         {   strcpy(file,newfile);
187             fprintf(out, ":%s %d/%d", file, start, length);
188         }
189         else
190             fprintf(out, " %d/%d", start, length);
191     }
192     fprintf(out, "\n");
193     lines++;
194 
195     fclose(in); fclose(out);
196     unlink(inf);
197     return (lines);
198 }
199