xref: /original-bsd/contrib/bib/src/invert.c (revision e74403ba)
#ifndef lint
/*  SCCS identification string; compiled in unless lint is defined.
    The tabs are written as \t escapes: the resulting bytes are unchanged. */
static char sccsid[] = "@(#)invert.c\t2.2\t10/06/83";
#endif /* not lint */
4 #
5 /*  input:  records of lines, separated by blank lines
6     output: key:file1 start/length ... start/length:file2 start/length ...
7 */
8 
9 # include "stdio.h"
10 # include "streams.h"
11 # include "bib.h"
/*  isnull(x):  true when the string x is empty.  Compares against the NUL
                character rather than the pointer constant NULL.            */
# define isnull(x)  (*(x) == '\0')
/*  makelow(c): lower-case equivalent of an ASCII upper-case letter; any
                other value is returned unchanged.  The argument is
                evaluated more than once, so it must have no side effects.  */
# define makelow(c) ('A'<=(c) && (c)<='Z' ? (c)-'A'+'a' : (c))
14 
15 int     max_kcnt = 100;     /*  max number of keys                      */
16 int     max_klen =   6;     /*  max length of keys                      */
17 char    *ignore =           /*  string of line starts to ignore         */
18             "CNOPVX";
19 char    *common =           /*  name of file of common words            */
20             COMFILE;
21 char    *INDEX=             /*  name of output file                     */
22             INDXFILE;
23 
24 char    *tmpfile =          /*  name of temporary file                  */
25             INVTEMPFILE;
26 
27 int	silent = 0;	    /*  0 => statistics printed			*/
28 			    /*  1 => no statisitics printed		*/
29 
30 char *sort_it =
31         "sort -u +0 -1 +1 -2 +2n -3 +3n %s -o %s";
32 char sortcmd[maxstr];
33 
34 int     argc;
35 char    **argv;
36 
37 main(argcount,arglist)
38 int argcount;
39 char **arglist;
40 {   char            *filename;
41     FILE            *input, *output;
42     long int        start,length;
43     char            word[maxstr];
44     int             kcnt;
45     char            tag_line[maxstr];
46 
47     long int	    records = 0;  /*  number of records read           */
48     long int	    keys    = 0;  /*  number of keys read (occurences) */
49     long int	    distinct;     /*  number of distinct keys          */
50     long int	    shorten();
51 
52     argc= argcount-1;
53     argv= arglist+1;
54     mktemp(tmpfile);
55     output= fopen(tmpfile,"w");
56 
57     for ( flags() ; argc>0 ; argc--, argv++ ,flags() )
58     {   /* open input file              */
59             filename=   *argv;
60             input=      fopen(filename,"r");
61             if (input==NULL)
62             {   fprintf(stderr, "invert: error in open of %s\n", filename);
63                 continue;
64             }
65             start=      0L;
66             length=     0L;
67 
68         for(;;) /* each record  */
69         {   /* find start of next record (exit if none)     */
70                 start= nextrecord(input,start+length);
71                 if (start==EOF)   break;
72             records++;
73 	    kcnt= 0;
74             length= recsize(input,start);
75             sprintf(tag_line, " %s %d %d\n", filename, start, length);
76 
77             while (ftell(input) < start+length && kcnt < max_kcnt)
78             {   getword(input,word,ignore);
79                 makekey(word,max_klen,common);
80                 if (!isnull(word))
81                 {   fputs(word,output); fputs(tag_line,output);
82                     kcnt++; keys++;
83                 }
84             }
85         }
86         fclose(input);
87     }
88     fclose(output);
89 
90     sprintf(sortcmd, sort_it, tmpfile, tmpfile);
91     system(sortcmd);
92 
93     distinct = shorten(tmpfile,INDEX);
94     if( silent == 0 )
95 	fprintf(stderr,
96 	    "%d documents   %d distinct keys  %d key occurrences\n",
97 	    records, distinct, keys);
98 }
99 
100 
101 
102 /*  Flag    Meaning                             Default
103     -ki     Keys per record                     100
104     -li     max Length of keys                  6
105     -%str   ignore lines that begin with %x     CNOPVX
106             where x is in str
107             str is a seq of chars
108     -cfile  file contains Common words          /usr/src/local/bib/common
109             do not use common words as keys
110     -pfile  name of output file                 INDEX
111     -s	    do not print statistics		statistics printed
112 */
113 
114 # define    operand     (strlen(*argv+2)==0 ? (argv++,argc--,*argv) : *argv+2)
115 
116 flags()
117 {   for (; argc>0 && *argv[0]=='-';  argc--,argv++)
118     {   switch ((*argv)[1])
119         {   case 'k':   max_kcnt= atoi(operand);
120                         break;
121             case 'l':   max_klen= atoi(operand);
122                         break;
123             case 'c':   common=  operand;
124                         break;
125             case '%':   ignore=  *argv+2;
126                         break;
127             case 'p':   INDEX=  operand;
128                         break;
129 	    case 's':	silent= 1;
130 			break;
131             default:    fprintf(stderr, "unknown flag '%s'\n", *argv);
132         }
133     }
134 }
135 
136 
137 /*  shorten(inf,outf): file "inf" consists of lines of the form:
138         key file start length
139     sorted by key and file.  replace lines with the same key
140     with one line of the form:
141         key:file1 start/length ... start/length:file2 start/length ...
142     rename as file "outf"
143     returns number of lines in output
144 */
145 long shorten(inf,outf)
146 char *inf, *outf;
147 {   FILE *in, *out;
148     char line[maxstr];
149     char key[maxstr],  newkey[maxstr],
150          file[maxstr], newfile[maxstr];
151     long int start, length;
152     long int lines = 0;
153 
154     in=  fopen(inf, "r");
155     out= fopen(outf, "w");
156     if (in==NULL || out==NULL)
157     {   fprintf(stderr, "invert: error in opening file for compression\n");
158         return(0);
159     }
160 
161     getline(in,line);
162     sscanf(line,"%s%s%d%d", key, file, &start, &length);
163     fprintf(out, "%s :%s %d/%d", key, file, start, length);
164     for ( getline(in, line) ; !feof(in);  getline(in, line))
165     {   sscanf(line,"%s%s%d%d", newkey, newfile, &start, &length);
166         if (strcmp(key,newkey)!=0)
167         {   strcpy(key, newkey);
168             strcpy(file, newfile);
169             fprintf(out, "\n%s :%s %d/%d",  key, file, start, length);
170 	    lines++;
171         }
172         else if (strcmp(file,newfile)!=0)
173         {   strcpy(file,newfile);
174             fprintf(out, ":%s %d/%d", file, start, length);
175         }
176         else
177             fprintf(out, " %d/%d", start, length);
178     }
179     fprintf(out, "\n");
180     lines++;
181 
182     fclose(in); fclose(out);
183     unlink(inf);
184     return (lines);
185 }
186