xref: /original-bsd/contrib/bib/src/invert.c (revision 99c1622c)
/* SCCS identification string; left out when the file is run through lint. */
#ifndef lint
static char sccsid[] = "@(#)invert.c	2.7	05/27/93";
#endif /* not lint */
#
/*  input:  records of lines, separated by blank lines
    output: key :file1 start/length ... start/length:file2 start/length ...
*/
8a2c03686Sgarrison 
9a2c03686Sgarrison # include "stdio.h"
10a2c03686Sgarrison # include "streams.h"
11a2c03686Sgarrison # include "bib.h"
/* makelow(c): lower-case equivalent of c if c is in 'A'..'Z', otherwise c
   itself.  Every use of the argument is parenthesized; note that c is
   evaluated more than once. */
# define makelow(c) ('A'<=(c) && (c)<='Z' ? (c)-'A'+'a' : (c))
13a2c03686Sgarrison 
14a2c03686Sgarrison int     max_kcnt = 100;     /*  max number of keys                      */
15a2c03686Sgarrison int     max_klen =   6;     /*  max length of keys                      */
16a2c03686Sgarrison char    *ignore =           /*  string of line starts to ignore         */
17a2c03686Sgarrison             "CNOPVX";
18a2c03686Sgarrison char    *INDEX=             /*  name of output file                     */
19a2c03686Sgarrison             INDXFILE;
20a2c03686Sgarrison 
2177e364ccSgarrison char    *bibtmpfile =          /*  name of temporary file                  */
22a2c03686Sgarrison             INVTEMPFILE;
23a2c03686Sgarrison 
24a2c03686Sgarrison int	silent = 0;	    /*  0 => statistics printed			*/
25a2c03686Sgarrison 			    /*  1 => no statisitics printed		*/
26a2c03686Sgarrison 
27a2c03686Sgarrison char *sort_it =
28a2c03686Sgarrison         "sort -u +0 -1 +1 -2 +2n -3 +3n %s -o %s";
29a2c03686Sgarrison char sortcmd[maxstr];
30a2c03686Sgarrison 
31a2c03686Sgarrison int     argc;
32a2c03686Sgarrison char    **argv;
33a2c03686Sgarrison 
main(argcount,arglist)34a2c03686Sgarrison main(argcount,arglist)
35a2c03686Sgarrison int argcount;
36a2c03686Sgarrison char **arglist;
37a2c03686Sgarrison {   char            *filename;
38a2c03686Sgarrison     FILE            *input, *output;
39a2c03686Sgarrison     long int        start,length;
40a2c03686Sgarrison     char            word[maxstr];
41a2c03686Sgarrison     int             kcnt;
42a2c03686Sgarrison     char            tag_line[maxstr];
43*99c1622cSbostic     int 	    bol = 1; /* at beginning of line */
44a2c03686Sgarrison 
45a2c03686Sgarrison     long int	    records = 0;  /*  number of records read           */
46a2c03686Sgarrison     long int	    keys    = 0;  /*  number of keys read (occurences) */
47a2c03686Sgarrison     long int	    distinct;     /*  number of distinct keys          */
48a2c03686Sgarrison     long int	    shorten();
49a2c03686Sgarrison 
50*99c1622cSbostic     InitDirectory(BMACLIB,N_BMACLIB);
51*99c1622cSbostic     InitDirectory(COMFILE,N_COMFILE);
526f5ac60cSrrh 
53a2c03686Sgarrison     argc= argcount-1;
54a2c03686Sgarrison     argv= arglist+1;
5577e364ccSgarrison     mktemp(bibtmpfile);
5677e364ccSgarrison     output= fopen(bibtmpfile,"w");
57a2c03686Sgarrison 
58a2c03686Sgarrison     for ( flags() ; argc>0 ; argc--, argv++ ,flags() )
59a2c03686Sgarrison     {   /* open input file              */
60a2c03686Sgarrison             filename=   *argv;
61a2c03686Sgarrison             input=      fopen(filename,"r");
62a2c03686Sgarrison             if (input==NULL)
63a2c03686Sgarrison             {   fprintf(stderr,"invert: error in open of %s\n", filename);
64a2c03686Sgarrison                 continue;
65a2c03686Sgarrison             }
66a2c03686Sgarrison       start=      0L;
67a2c03686Sgarrison       length=     0L;
68a2c03686Sgarrison 
69*99c1622cSbostic       for(;;) /* each record  */ {
70*99c1622cSbostic 	 /* find start of next record (exit if none)     */
71a2c03686Sgarrison 	 start= nextrecord(input,start+length);
72a2c03686Sgarrison 	 if (start==EOF)   break;
73a2c03686Sgarrison 	 records++;
74a2c03686Sgarrison 	 kcnt= 0;
75a2c03686Sgarrison 	 length= recsize(input,start);
76ae10e56bSgarrison 	 sprintf(tag_line, " %s %d %d\n", filename, start, length);
77a2c03686Sgarrison 
78*99c1622cSbostic 	 while (ftell(input) < start+length && kcnt < max_kcnt) {
79*99c1622cSbostic 	    getword(input,word,ignore,&bol);
80*99c1622cSbostic 	    makekey(word,max_klen,COMFILE);
81*99c1622cSbostic 	    if (*word != NULL) {
82*99c1622cSbostic 	       fputs(word,output); fputs(tag_line,output);
83a2c03686Sgarrison 	       kcnt++; keys++;
84a2c03686Sgarrison 	       }
85a2c03686Sgarrison 	    }
86a2c03686Sgarrison 	 }
87a2c03686Sgarrison        fclose(input);
88a2c03686Sgarrison        }
89a2c03686Sgarrison     fclose(output);
90a2c03686Sgarrison 
9177e364ccSgarrison     sprintf(sortcmd, sort_it, bibtmpfile, bibtmpfile);
92a2c03686Sgarrison     system(sortcmd);
93a2c03686Sgarrison 
9477e364ccSgarrison     distinct = shorten(bibtmpfile,INDEX);
95a2c03686Sgarrison     if( silent == 0 )
96a2c03686Sgarrison 	fprintf(stderr,
97ae10e56bSgarrison 	    "%d documents   %d distinct keys  %d key occurrences\n",
98a2c03686Sgarrison 	    records, distinct, keys);
99494e4512Sralph     exit(0);
100a2c03686Sgarrison }
101a2c03686Sgarrison 
102a2c03686Sgarrison 
103a2c03686Sgarrison 
104a2c03686Sgarrison /*  Flag    Meaning                             Default
105a2c03686Sgarrison     -ki     Keys per record                     100
106a2c03686Sgarrison     -li     max Length of keys                  6
107a2c03686Sgarrison     -%str   ignore lines that begin with %x     CNOPVX
108a2c03686Sgarrison             where x is in str
109a2c03686Sgarrison             str is a seq of chars
11029d940b6Smckusick     -cfile  file contains Common words          /usr/new/lib/bib/common
111a2c03686Sgarrison             do not use common words as keys
112a2c03686Sgarrison     -pfile  name of output file                 INDEX
113a2c03686Sgarrison     -s	    do not print statistics		statistics printed
114a2c03686Sgarrison */
115a2c03686Sgarrison 
116a2c03686Sgarrison # define    operand     (strlen(*argv+2)==0 ? (argv++,argc--,*argv) : *argv+2)
117a2c03686Sgarrison 
flags()118a2c03686Sgarrison flags()
1196f5ac60cSrrh {
1206f5ac60cSrrh     char *p;
1216f5ac60cSrrh     for (; argc>0 && *argv[0]=='-';  argc--,argv++)
122a2c03686Sgarrison     {   switch ((*argv)[1])
123a2c03686Sgarrison         {   case 'k':   max_kcnt= atoi(operand);
124a2c03686Sgarrison                         break;
125a2c03686Sgarrison             case 'l':   max_klen= atoi(operand);
126a2c03686Sgarrison                         break;
127*99c1622cSbostic             case 'c':   strcpy(COMFILE,operand);
128a2c03686Sgarrison                         break;
129a2c03686Sgarrison             case '%':   ignore=  *argv+2;
130a2c03686Sgarrison                         break;
131a2c03686Sgarrison             case 'p':   INDEX=  operand;
132a2c03686Sgarrison                         break;
133a2c03686Sgarrison 	    case 's':	silent= 1;
134a2c03686Sgarrison 			break;
1356f5ac60cSrrh 	    case 'd':
1366f5ac60cSrrh 		p = &argv[0][2];
1376f5ac60cSrrh 		if (!p) {
1386f5ac60cSrrh 			argv++;
1396f5ac60cSrrh 			p = &argv[0][0];
1406f5ac60cSrrh 		}
1416f5ac60cSrrh 		strreplace(COMFILE, BMACLIB, p);
1426f5ac60cSrrh 		strcpy(BMACLIB, p);
1436f5ac60cSrrh 		break;
144a2c03686Sgarrison             default:    fprintf(stderr,"unknown flag '%s'\n", *argv);
145a2c03686Sgarrison         }
146a2c03686Sgarrison     }
147a2c03686Sgarrison }
148a2c03686Sgarrison 
149a2c03686Sgarrison 
150a2c03686Sgarrison /*  shorten(inf,outf): file "inf" consists of lines of the form:
151a2c03686Sgarrison         key file start length
152a2c03686Sgarrison     sorted by key and file.  replace lines with the same key
153a2c03686Sgarrison     with one line of the form:
154a2c03686Sgarrison         key:file1 start/length ... start/length:file2 start/length ...
155a2c03686Sgarrison     rename as file "outf"
156a2c03686Sgarrison     returns number of lines in output
157a2c03686Sgarrison */
shorten(inf,outf)158a2c03686Sgarrison long shorten(inf,outf)
159a2c03686Sgarrison char *inf, *outf;
160a2c03686Sgarrison {   FILE *in, *out;
161a2c03686Sgarrison     char line[maxstr];
162a2c03686Sgarrison     char key[maxstr],  newkey[maxstr],
163a2c03686Sgarrison          file[maxstr], newfile[maxstr];
164a2c03686Sgarrison     long int start, length;
165a2c03686Sgarrison     long int lines = 0;
166a2c03686Sgarrison 
167a2c03686Sgarrison     in=  fopen(inf, "r");
168a2c03686Sgarrison     out= fopen(outf, "w");
169a2c03686Sgarrison     if (in==NULL || out==NULL)
170a2c03686Sgarrison     {   fprintf(stderr,"invert: error in opening file for compression\n");
171a2c03686Sgarrison         return(0);
172a2c03686Sgarrison     }
173a2c03686Sgarrison 
174a2c03686Sgarrison     getline(in,line);
175ae10e56bSgarrison     sscanf(line,"%s%s%d%d", key, file, &start, &length);
176ae10e56bSgarrison     fprintf(out, "%s :%s %d/%d", key, file, start, length);
177a2c03686Sgarrison     for ( getline(in, line) ; !feof(in);  getline(in, line))
178ae10e56bSgarrison     {   sscanf(line,"%s%s%d%d", newkey, newfile, &start, &length);
179a2c03686Sgarrison         if (strcmp(key,newkey)!=0)
180a2c03686Sgarrison         {   strcpy(key, newkey);
181a2c03686Sgarrison             strcpy(file, newfile);
182ae10e56bSgarrison             fprintf(out, "\n%s :%s %d/%d",  key, file, start, length);
183a2c03686Sgarrison 	    lines++;
184a2c03686Sgarrison         }
185a2c03686Sgarrison         else if (strcmp(file,newfile)!=0)
186a2c03686Sgarrison         {   strcpy(file,newfile);
187ae10e56bSgarrison             fprintf(out, ":%s %d/%d", file, start, length);
188a2c03686Sgarrison         }
189a2c03686Sgarrison         else
190ae10e56bSgarrison             fprintf(out, " %d/%d", start, length);
191a2c03686Sgarrison     }
192a2c03686Sgarrison     fprintf(out, "\n");
193a2c03686Sgarrison     lines++;
194a2c03686Sgarrison 
195a2c03686Sgarrison     fclose(in); fclose(out);
196a2c03686Sgarrison     unlink(inf);
197a2c03686Sgarrison     return (lines);
198a2c03686Sgarrison }
199